{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.4991875203119922,
"eval_steps": 128,
"global_step": 256,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 0.5149741172790527,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.8709,
"step": 1
},
{
"epoch": 0.0,
"eval_loss": 1.8383064270019531,
"eval_runtime": 707.8127,
"eval_samples_per_second": 7.169,
"eval_steps_per_second": 1.793,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 0.48140937089920044,
"learning_rate": 4.000000000000001e-06,
"loss": 1.7751,
"step": 2
},
{
"epoch": 0.01,
"grad_norm": 0.4886001944541931,
"learning_rate": 6e-06,
"loss": 1.795,
"step": 3
},
{
"epoch": 0.01,
"grad_norm": 0.46349120140075684,
"learning_rate": 8.000000000000001e-06,
"loss": 1.7569,
"step": 4
},
{
"epoch": 0.01,
"grad_norm": 0.5320057272911072,
"learning_rate": 1e-05,
"loss": 1.9278,
"step": 5
},
{
"epoch": 0.01,
"grad_norm": 0.48083460330963135,
"learning_rate": 1.2e-05,
"loss": 1.778,
"step": 6
},
{
"epoch": 0.01,
"grad_norm": 0.503804624080658,
"learning_rate": 1.4e-05,
"loss": 1.8358,
"step": 7
},
{
"epoch": 0.02,
"grad_norm": 0.5177507400512695,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.8655,
"step": 8
},
{
"epoch": 0.02,
"grad_norm": 0.5006410479545593,
"learning_rate": 1.8e-05,
"loss": 1.8087,
"step": 9
},
{
"epoch": 0.02,
"grad_norm": 0.500285804271698,
"learning_rate": 2e-05,
"loss": 1.8254,
"step": 10
},
{
"epoch": 0.02,
"grad_norm": 0.4819566607475281,
"learning_rate": 1.9999804178263253e-05,
"loss": 1.7627,
"step": 11
},
{
"epoch": 0.02,
"grad_norm": 0.4860954284667969,
"learning_rate": 1.999921672072223e-05,
"loss": 1.7034,
"step": 12
},
{
"epoch": 0.03,
"grad_norm": 0.5111412405967712,
"learning_rate": 1.9998237650384324e-05,
"loss": 1.7203,
"step": 13
},
{
"epoch": 0.03,
"grad_norm": 0.500988245010376,
"learning_rate": 1.9996867005594193e-05,
"loss": 1.6721,
"step": 14
},
{
"epoch": 0.03,
"grad_norm": 0.4903103709220886,
"learning_rate": 1.999510484003224e-05,
"loss": 1.6167,
"step": 15
},
{
"epoch": 0.03,
"grad_norm": 0.4756762683391571,
"learning_rate": 1.999295122271253e-05,
"loss": 1.57,
"step": 16
},
{
"epoch": 0.03,
"grad_norm": 0.4689522385597229,
"learning_rate": 1.999040623798008e-05,
"loss": 1.5461,
"step": 17
},
{
"epoch": 0.04,
"grad_norm": 0.5094612836837769,
"learning_rate": 1.9987469985507553e-05,
"loss": 1.5526,
"step": 18
},
{
"epoch": 0.04,
"grad_norm": 0.49769631028175354,
"learning_rate": 1.9984142580291368e-05,
"loss": 1.5115,
"step": 19
},
{
"epoch": 0.04,
"grad_norm": 0.46388670802116394,
"learning_rate": 1.9980424152647174e-05,
"loss": 1.467,
"step": 20
},
{
"epoch": 0.04,
"grad_norm": 0.4357146918773651,
"learning_rate": 1.9976314848204762e-05,
"loss": 1.3887,
"step": 21
},
{
"epoch": 0.04,
"grad_norm": 0.440377414226532,
"learning_rate": 1.997181482790236e-05,
"loss": 1.3845,
"step": 22
},
{
"epoch": 0.04,
"grad_norm": 0.4116402566432953,
"learning_rate": 1.9966924267980326e-05,
"loss": 1.4091,
"step": 23
},
{
"epoch": 0.05,
"grad_norm": 0.3181552588939667,
"learning_rate": 1.996164335997425e-05,
"loss": 1.3324,
"step": 24
},
{
"epoch": 0.05,
"grad_norm": 0.2932267189025879,
"learning_rate": 1.995597231070744e-05,
"loss": 1.315,
"step": 25
},
{
"epoch": 0.05,
"grad_norm": 0.328800231218338,
"learning_rate": 1.994991134228285e-05,
"loss": 1.3334,
"step": 26
},
{
"epoch": 0.05,
"grad_norm": 0.32027724385261536,
"learning_rate": 1.9943460692074345e-05,
"loss": 1.3161,
"step": 27
},
{
"epoch": 0.05,
"grad_norm": 0.3247709274291992,
"learning_rate": 1.993662061271743e-05,
"loss": 1.2601,
"step": 28
},
{
"epoch": 0.06,
"grad_norm": 0.33424896001815796,
"learning_rate": 1.9929391372099352e-05,
"loss": 1.2807,
"step": 29
},
{
"epoch": 0.06,
"grad_norm": 0.28847330808639526,
"learning_rate": 1.9921773253348604e-05,
"loss": 1.2427,
"step": 30
},
{
"epoch": 0.06,
"grad_norm": 0.2601753771305084,
"learning_rate": 1.991376655482383e-05,
"loss": 1.2602,
"step": 31
},
{
"epoch": 0.06,
"grad_norm": 0.25505828857421875,
"learning_rate": 1.9905371590102157e-05,
"loss": 1.2539,
"step": 32
},
{
"epoch": 0.06,
"grad_norm": 0.25789541006088257,
"learning_rate": 1.989658868796689e-05,
"loss": 1.2796,
"step": 33
},
{
"epoch": 0.07,
"grad_norm": 0.1963696926832199,
"learning_rate": 1.988741819239467e-05,
"loss": 1.2533,
"step": 34
},
{
"epoch": 0.07,
"grad_norm": 0.1652669906616211,
"learning_rate": 1.9877860462541964e-05,
"loss": 1.27,
"step": 35
},
{
"epoch": 0.07,
"grad_norm": 0.15272551774978638,
"learning_rate": 1.986791587273103e-05,
"loss": 1.2092,
"step": 36
},
{
"epoch": 0.07,
"grad_norm": 0.14809414744377136,
"learning_rate": 1.985758481243523e-05,
"loss": 1.2028,
"step": 37
},
{
"epoch": 0.07,
"grad_norm": 0.14091093838214874,
"learning_rate": 1.98468676862638e-05,
"loss": 1.1737,
"step": 38
},
{
"epoch": 0.08,
"grad_norm": 0.13234961032867432,
"learning_rate": 1.9835764913945998e-05,
"loss": 1.2242,
"step": 39
},
{
"epoch": 0.08,
"grad_norm": 0.12562313675880432,
"learning_rate": 1.982427693031465e-05,
"loss": 1.1846,
"step": 40
},
{
"epoch": 0.08,
"grad_norm": 0.12460777163505554,
"learning_rate": 1.981240418528914e-05,
"loss": 1.1954,
"step": 41
},
{
"epoch": 0.08,
"grad_norm": 0.1261477917432785,
"learning_rate": 1.9800147143857774e-05,
"loss": 1.1944,
"step": 42
},
{
"epoch": 0.08,
"grad_norm": 0.12070100754499435,
"learning_rate": 1.9787506286059584e-05,
"loss": 1.1814,
"step": 43
},
{
"epoch": 0.09,
"grad_norm": 0.1318473368883133,
"learning_rate": 1.9774482106965512e-05,
"loss": 1.2289,
"step": 44
},
{
"epoch": 0.09,
"grad_norm": 0.11869361251592636,
"learning_rate": 1.9761075116659037e-05,
"loss": 1.1507,
"step": 45
},
{
"epoch": 0.09,
"grad_norm": 0.11668427288532257,
"learning_rate": 1.974728584021618e-05,
"loss": 1.1693,
"step": 46
},
{
"epoch": 0.09,
"grad_norm": 0.12271205335855484,
"learning_rate": 1.9733114817684957e-05,
"loss": 1.219,
"step": 47
},
{
"epoch": 0.09,
"grad_norm": 0.12055838108062744,
"learning_rate": 1.9718562604064213e-05,
"loss": 1.2424,
"step": 48
},
{
"epoch": 0.1,
"grad_norm": 0.1191168949007988,
"learning_rate": 1.97036297692819e-05,
"loss": 1.2206,
"step": 49
},
{
"epoch": 0.1,
"grad_norm": 0.11361384391784668,
"learning_rate": 1.9688316898172744e-05,
"loss": 1.1927,
"step": 50
},
{
"epoch": 0.1,
"grad_norm": 0.109556645154953,
"learning_rate": 1.967262459045535e-05,
"loss": 1.2013,
"step": 51
},
{
"epoch": 0.1,
"grad_norm": 0.11278169602155685,
"learning_rate": 1.9656553460708707e-05,
"loss": 1.2379,
"step": 52
},
{
"epoch": 0.1,
"grad_norm": 0.11011548340320587,
"learning_rate": 1.9640104138348124e-05,
"loss": 1.1808,
"step": 53
},
{
"epoch": 0.11,
"grad_norm": 0.09818632155656815,
"learning_rate": 1.9623277267600574e-05,
"loss": 1.1731,
"step": 54
},
{
"epoch": 0.11,
"grad_norm": 0.1045491099357605,
"learning_rate": 1.9606073507479466e-05,
"loss": 1.1729,
"step": 55
},
{
"epoch": 0.11,
"grad_norm": 0.0985143780708313,
"learning_rate": 1.9588493531758843e-05,
"loss": 1.165,
"step": 56
},
{
"epoch": 0.11,
"grad_norm": 0.09513280540704727,
"learning_rate": 1.9570538028946974e-05,
"loss": 1.1765,
"step": 57
},
{
"epoch": 0.11,
"grad_norm": 0.09834066778421402,
"learning_rate": 1.9552207702259412e-05,
"loss": 1.1411,
"step": 58
},
{
"epoch": 0.12,
"grad_norm": 0.09748240560293198,
"learning_rate": 1.9533503269591438e-05,
"loss": 1.1995,
"step": 59
},
{
"epoch": 0.12,
"grad_norm": 0.09501401335000992,
"learning_rate": 1.9514425463489946e-05,
"loss": 1.1414,
"step": 60
},
{
"epoch": 0.12,
"grad_norm": 0.09078366309404373,
"learning_rate": 1.9494975031124768e-05,
"loss": 1.1132,
"step": 61
},
{
"epoch": 0.12,
"grad_norm": 0.09064218401908875,
"learning_rate": 1.947515273425939e-05,
"loss": 1.1498,
"step": 62
},
{
"epoch": 0.12,
"grad_norm": 0.09029112011194229,
"learning_rate": 1.945495934922113e-05,
"loss": 1.158,
"step": 63
},
{
"epoch": 0.12,
"grad_norm": 0.09335145354270935,
"learning_rate": 1.9434395666870735e-05,
"loss": 1.181,
"step": 64
},
{
"epoch": 0.13,
"grad_norm": 0.08959628641605377,
"learning_rate": 1.9413462492571403e-05,
"loss": 1.1353,
"step": 65
},
{
"epoch": 0.13,
"grad_norm": 0.09235028922557831,
"learning_rate": 1.9392160646157242e-05,
"loss": 1.1566,
"step": 66
},
{
"epoch": 0.13,
"grad_norm": 0.08852320164442062,
"learning_rate": 1.937049096190117e-05,
"loss": 1.1015,
"step": 67
},
{
"epoch": 0.13,
"grad_norm": 0.09060905128717422,
"learning_rate": 1.934845428848222e-05,
"loss": 1.1312,
"step": 68
},
{
"epoch": 0.13,
"grad_norm": 0.09065355360507965,
"learning_rate": 1.9326051488952334e-05,
"loss": 1.1456,
"step": 69
},
{
"epoch": 0.14,
"grad_norm": 0.09140690416097641,
"learning_rate": 1.9303283440702524e-05,
"loss": 1.1661,
"step": 70
},
{
"epoch": 0.14,
"grad_norm": 0.08641023188829422,
"learning_rate": 1.9280151035428544e-05,
"loss": 1.1153,
"step": 71
},
{
"epoch": 0.14,
"grad_norm": 0.08729224652051926,
"learning_rate": 1.9256655179095954e-05,
"loss": 1.1956,
"step": 72
},
{
"epoch": 0.14,
"grad_norm": 0.08514908701181412,
"learning_rate": 1.9232796791904627e-05,
"loss": 1.0969,
"step": 73
},
{
"epoch": 0.14,
"grad_norm": 0.08789129555225372,
"learning_rate": 1.9208576808252725e-05,
"loss": 1.1669,
"step": 74
},
{
"epoch": 0.15,
"grad_norm": 0.0829731896519661,
"learning_rate": 1.918399617670011e-05,
"loss": 1.101,
"step": 75
},
{
"epoch": 0.15,
"grad_norm": 0.08415351063013077,
"learning_rate": 1.9159055859931163e-05,
"loss": 1.122,
"step": 76
},
{
"epoch": 0.15,
"grad_norm": 0.07933653146028519,
"learning_rate": 1.9133756834717118e-05,
"loss": 1.1175,
"step": 77
},
{
"epoch": 0.15,
"grad_norm": 0.0849999189376831,
"learning_rate": 1.9108100091877787e-05,
"loss": 1.1577,
"step": 78
},
{
"epoch": 0.15,
"grad_norm": 0.0835108831524849,
"learning_rate": 1.9082086636242757e-05,
"loss": 1.1253,
"step": 79
},
{
"epoch": 0.16,
"grad_norm": 0.07834841310977936,
"learning_rate": 1.905571748661204e-05,
"loss": 1.0963,
"step": 80
},
{
"epoch": 0.16,
"grad_norm": 0.07953493297100067,
"learning_rate": 1.902899367571617e-05,
"loss": 1.1102,
"step": 81
},
{
"epoch": 0.16,
"grad_norm": 0.07989759743213654,
"learning_rate": 1.9001916250175764e-05,
"loss": 1.1576,
"step": 82
},
{
"epoch": 0.16,
"grad_norm": 0.07849448174238205,
"learning_rate": 1.8974486270460518e-05,
"loss": 1.0963,
"step": 83
},
{
"epoch": 0.16,
"grad_norm": 0.07805287837982178,
"learning_rate": 1.894670481084769e-05,
"loss": 1.1364,
"step": 84
},
{
"epoch": 0.17,
"grad_norm": 0.07698098570108414,
"learning_rate": 1.8918572959380005e-05,
"loss": 1.1407,
"step": 85
},
{
"epoch": 0.17,
"grad_norm": 0.0766262486577034,
"learning_rate": 1.8890091817823073e-05,
"loss": 1.1225,
"step": 86
},
{
"epoch": 0.17,
"grad_norm": 0.0798678770661354,
"learning_rate": 1.8861262501622213e-05,
"loss": 1.137,
"step": 87
},
{
"epoch": 0.17,
"grad_norm": 0.07717825472354889,
"learning_rate": 1.8832086139858777e-05,
"loss": 1.1311,
"step": 88
},
{
"epoch": 0.17,
"grad_norm": 0.07542562484741211,
"learning_rate": 1.880256387520593e-05,
"loss": 1.1066,
"step": 89
},
{
"epoch": 0.18,
"grad_norm": 0.07316063344478607,
"learning_rate": 1.8772696863883905e-05,
"loss": 1.0976,
"step": 90
},
{
"epoch": 0.18,
"grad_norm": 0.0738874301314354,
"learning_rate": 1.8742486275614706e-05,
"loss": 1.0901,
"step": 91
},
{
"epoch": 0.18,
"grad_norm": 0.07698226720094681,
"learning_rate": 1.8711933293576303e-05,
"loss": 1.1224,
"step": 92
},
{
"epoch": 0.18,
"grad_norm": 0.07452582567930222,
"learning_rate": 1.8681039114356298e-05,
"loss": 1.1399,
"step": 93
},
{
"epoch": 0.18,
"grad_norm": 0.07452700287103653,
"learning_rate": 1.8649804947905057e-05,
"loss": 1.1639,
"step": 94
},
{
"epoch": 0.19,
"grad_norm": 0.07358838617801666,
"learning_rate": 1.861823201748833e-05,
"loss": 1.1139,
"step": 95
},
{
"epoch": 0.19,
"grad_norm": 0.07469804584980011,
"learning_rate": 1.8586321559639316e-05,
"loss": 1.1103,
"step": 96
},
{
"epoch": 0.19,
"grad_norm": 0.07484911382198334,
"learning_rate": 1.8554074824110285e-05,
"loss": 1.1231,
"step": 97
},
{
"epoch": 0.19,
"grad_norm": 0.07320189476013184,
"learning_rate": 1.8521493073823583e-05,
"loss": 1.1405,
"step": 98
},
{
"epoch": 0.19,
"grad_norm": 0.07219311594963074,
"learning_rate": 1.8488577584822197e-05,
"loss": 1.1084,
"step": 99
},
{
"epoch": 0.19,
"grad_norm": 0.07267658412456512,
"learning_rate": 1.8455329646219767e-05,
"loss": 1.109,
"step": 100
},
{
"epoch": 0.2,
"grad_norm": 0.07124843448400497,
"learning_rate": 1.8421750560150112e-05,
"loss": 1.0997,
"step": 101
},
{
"epoch": 0.2,
"grad_norm": 0.06921572983264923,
"learning_rate": 1.8387841641716226e-05,
"loss": 1.1095,
"step": 102
},
{
"epoch": 0.2,
"grad_norm": 0.07149618864059448,
"learning_rate": 1.835360421893876e-05,
"loss": 1.1078,
"step": 103
},
{
"epoch": 0.2,
"grad_norm": 0.07851895689964294,
"learning_rate": 1.8319039632704042e-05,
"loss": 1.1195,
"step": 104
},
{
"epoch": 0.2,
"grad_norm": 0.07615454494953156,
"learning_rate": 1.8284149236711527e-05,
"loss": 1.0754,
"step": 105
},
{
"epoch": 0.21,
"grad_norm": 0.07054944336414337,
"learning_rate": 1.8248934397420802e-05,
"loss": 1.0943,
"step": 106
},
{
"epoch": 0.21,
"grad_norm": 0.07253159582614899,
"learning_rate": 1.821339649399807e-05,
"loss": 1.1263,
"step": 107
},
{
"epoch": 0.21,
"grad_norm": 0.0729857012629509,
"learning_rate": 1.817753691826212e-05,
"loss": 1.0977,
"step": 108
},
{
"epoch": 0.21,
"grad_norm": 0.07234011590480804,
"learning_rate": 1.8141357074629838e-05,
"loss": 1.1334,
"step": 109
},
{
"epoch": 0.21,
"grad_norm": 0.07030120491981506,
"learning_rate": 1.8104858380061178e-05,
"loss": 1.0767,
"step": 110
},
{
"epoch": 0.22,
"grad_norm": 0.07036615908145905,
"learning_rate": 1.80680422640037e-05,
"loss": 1.0796,
"step": 111
},
{
"epoch": 0.22,
"grad_norm": 0.0742933601140976,
"learning_rate": 1.8030910168336558e-05,
"loss": 1.0671,
"step": 112
},
{
"epoch": 0.22,
"grad_norm": 0.07065165787935257,
"learning_rate": 1.7993463547314044e-05,
"loss": 1.1594,
"step": 113
},
{
"epoch": 0.22,
"grad_norm": 0.07182008028030396,
"learning_rate": 1.7955703867508634e-05,
"loss": 1.0936,
"step": 114
},
{
"epoch": 0.22,
"grad_norm": 0.06882106512784958,
"learning_rate": 1.791763260775354e-05,
"loss": 1.1017,
"step": 115
},
{
"epoch": 0.23,
"grad_norm": 0.07001936435699463,
"learning_rate": 1.7879251259084803e-05,
"loss": 1.1267,
"step": 116
},
{
"epoch": 0.23,
"grad_norm": 0.06916490197181702,
"learning_rate": 1.78405613246829e-05,
"loss": 1.0787,
"step": 117
},
{
"epoch": 0.23,
"grad_norm": 0.07149837166070938,
"learning_rate": 1.7801564319813854e-05,
"loss": 1.1302,
"step": 118
},
{
"epoch": 0.23,
"grad_norm": 0.06783504039049149,
"learning_rate": 1.776226177176991e-05,
"loss": 1.1159,
"step": 119
},
{
"epoch": 0.23,
"grad_norm": 0.07285293936729431,
"learning_rate": 1.7722655219809718e-05,
"loss": 1.0758,
"step": 120
},
{
"epoch": 0.24,
"grad_norm": 0.07273004204034805,
"learning_rate": 1.768274621509803e-05,
"loss": 1.1019,
"step": 121
},
{
"epoch": 0.24,
"grad_norm": 0.07392899692058563,
"learning_rate": 1.7642536320644964e-05,
"loss": 1.1111,
"step": 122
},
{
"epoch": 0.24,
"grad_norm": 0.0693732351064682,
"learning_rate": 1.7602027111244807e-05,
"loss": 1.1109,
"step": 123
},
{
"epoch": 0.24,
"grad_norm": 0.0721542090177536,
"learning_rate": 1.7561220173414297e-05,
"loss": 1.1246,
"step": 124
},
{
"epoch": 0.24,
"grad_norm": 0.07002190500497818,
"learning_rate": 1.7520117105330524e-05,
"loss": 1.073,
"step": 125
},
{
"epoch": 0.25,
"grad_norm": 0.0697953850030899,
"learning_rate": 1.7478719516768324e-05,
"loss": 1.0913,
"step": 126
},
{
"epoch": 0.25,
"grad_norm": 0.07040461152791977,
"learning_rate": 1.7437029029037233e-05,
"loss": 1.1445,
"step": 127
},
{
"epoch": 0.25,
"grad_norm": 0.07231634110212326,
"learning_rate": 1.7395047274917994e-05,
"loss": 1.1106,
"step": 128
},
{
"epoch": 0.25,
"eval_loss": 1.0988876819610596,
"eval_runtime": 708.4228,
"eval_samples_per_second": 7.162,
"eval_steps_per_second": 1.791,
"step": 128
},
{
"epoch": 0.25,
"grad_norm": 0.0713375061750412,
"learning_rate": 1.7352775898598615e-05,
"loss": 1.0982,
"step": 129
},
{
"epoch": 0.25,
"grad_norm": 0.06747942417860031,
"learning_rate": 1.731021655560995e-05,
"loss": 1.1017,
"step": 130
},
{
"epoch": 0.26,
"grad_norm": 0.071540467441082,
"learning_rate": 1.72673709127609e-05,
"loss": 1.0859,
"step": 131
},
{
"epoch": 0.26,
"grad_norm": 0.06861750036478043,
"learning_rate": 1.7224240648073097e-05,
"loss": 1.0728,
"step": 132
},
{
"epoch": 0.26,
"grad_norm": 0.06919445842504501,
"learning_rate": 1.718082745071521e-05,
"loss": 1.1218,
"step": 133
},
{
"epoch": 0.26,
"grad_norm": 0.07422851771116257,
"learning_rate": 1.7137133020936783e-05,
"loss": 1.0881,
"step": 134
},
{
"epoch": 0.26,
"grad_norm": 0.07452652603387833,
"learning_rate": 1.7093159070001637e-05,
"loss": 1.1073,
"step": 135
},
{
"epoch": 0.27,
"grad_norm": 0.07337850332260132,
"learning_rate": 1.7048907320120867e-05,
"loss": 1.1065,
"step": 136
},
{
"epoch": 0.27,
"grad_norm": 0.07020066678524017,
"learning_rate": 1.700437950438537e-05,
"loss": 1.0742,
"step": 137
},
{
"epoch": 0.27,
"grad_norm": 0.07053718715906143,
"learning_rate": 1.695957736669799e-05,
"loss": 1.0627,
"step": 138
},
{
"epoch": 0.27,
"grad_norm": 0.07288292795419693,
"learning_rate": 1.6914502661705216e-05,
"loss": 1.0842,
"step": 139
},
{
"epoch": 0.27,
"grad_norm": 0.07197044044733047,
"learning_rate": 1.6869157154728437e-05,
"loss": 1.065,
"step": 140
},
{
"epoch": 0.27,
"grad_norm": 0.07109569013118744,
"learning_rate": 1.6823542621694852e-05,
"loss": 1.0996,
"step": 141
},
{
"epoch": 0.28,
"grad_norm": 0.07084467262029648,
"learning_rate": 1.677766084906787e-05,
"loss": 1.0862,
"step": 142
},
{
"epoch": 0.28,
"grad_norm": 0.07195379585027695,
"learning_rate": 1.6731513633777173e-05,
"loss": 1.1184,
"step": 143
},
{
"epoch": 0.28,
"grad_norm": 0.07326792180538177,
"learning_rate": 1.668510278314833e-05,
"loss": 1.0867,
"step": 144
},
{
"epoch": 0.28,
"grad_norm": 0.07582233846187592,
"learning_rate": 1.6638430114832015e-05,
"loss": 1.0721,
"step": 145
},
{
"epoch": 0.28,
"grad_norm": 0.07204006612300873,
"learning_rate": 1.6591497456732827e-05,
"loss": 1.0565,
"step": 146
},
{
"epoch": 0.29,
"grad_norm": 0.07225130498409271,
"learning_rate": 1.6544306646937683e-05,
"loss": 1.1036,
"step": 147
},
{
"epoch": 0.29,
"grad_norm": 0.07662148773670197,
"learning_rate": 1.649685953364385e-05,
"loss": 1.0289,
"step": 148
},
{
"epoch": 0.29,
"grad_norm": 0.07611638307571411,
"learning_rate": 1.644915797508656e-05,
"loss": 1.1068,
"step": 149
},
{
"epoch": 0.29,
"grad_norm": 0.07609565556049347,
"learning_rate": 1.6401203839466212e-05,
"loss": 1.0816,
"step": 150
},
{
"epoch": 0.29,
"grad_norm": 0.0737641304731369,
"learning_rate": 1.6352999004875242e-05,
"loss": 1.1016,
"step": 151
},
{
"epoch": 0.3,
"grad_norm": 0.07359515875577927,
"learning_rate": 1.630454535922452e-05,
"loss": 1.0787,
"step": 152
},
{
"epoch": 0.3,
"grad_norm": 0.07506351917982101,
"learning_rate": 1.6255844800169472e-05,
"loss": 1.0789,
"step": 153
},
{
"epoch": 0.3,
"grad_norm": 0.07777760922908783,
"learning_rate": 1.62068992350357e-05,
"loss": 1.096,
"step": 154
},
{
"epoch": 0.3,
"grad_norm": 0.07574637979269028,
"learning_rate": 1.6157710580744322e-05,
"loss": 1.1007,
"step": 155
},
{
"epoch": 0.3,
"grad_norm": 0.07857154309749603,
"learning_rate": 1.610828076373687e-05,
"loss": 1.0735,
"step": 156
},
{
"epoch": 0.31,
"grad_norm": 0.07402702420949936,
"learning_rate": 1.605861171989988e-05,
"loss": 1.1003,
"step": 157
},
{
"epoch": 0.31,
"grad_norm": 0.07439373433589935,
"learning_rate": 1.6008705394489032e-05,
"loss": 1.0662,
"step": 158
},
{
"epoch": 0.31,
"grad_norm": 0.07392847537994385,
"learning_rate": 1.5958563742052987e-05,
"loss": 1.0487,
"step": 159
},
{
"epoch": 0.31,
"grad_norm": 0.07773245126008987,
"learning_rate": 1.5908188726356843e-05,
"loss": 1.1107,
"step": 160
},
{
"epoch": 0.31,
"grad_norm": 0.07752656936645508,
"learning_rate": 1.5857582320305207e-05,
"loss": 1.0426,
"step": 161
},
{
"epoch": 0.32,
"grad_norm": 0.07541097700595856,
"learning_rate": 1.5806746505864947e-05,
"loss": 1.081,
"step": 162
},
{
"epoch": 0.32,
"grad_norm": 0.07938623428344727,
"learning_rate": 1.5755683273987554e-05,
"loss": 1.0969,
"step": 163
},
{
"epoch": 0.32,
"grad_norm": 0.07379717379808426,
"learning_rate": 1.5704394624531184e-05,
"loss": 1.0763,
"step": 164
},
{
"epoch": 0.32,
"grad_norm": 0.07850446552038193,
"learning_rate": 1.5652882566182316e-05,
"loss": 1.1029,
"step": 165
},
{
"epoch": 0.32,
"grad_norm": 0.07627106457948685,
"learning_rate": 1.5601149116377095e-05,
"loss": 1.0611,
"step": 166
},
{
"epoch": 0.33,
"grad_norm": 0.07577154785394669,
"learning_rate": 1.554919630122232e-05,
"loss": 1.0973,
"step": 167
},
{
"epoch": 0.33,
"grad_norm": 0.07844171673059464,
"learning_rate": 1.5497026155416087e-05,
"loss": 1.1006,
"step": 168
},
{
"epoch": 0.33,
"grad_norm": 0.08061926811933517,
"learning_rate": 1.5444640722168114e-05,
"loss": 1.0879,
"step": 169
},
{
"epoch": 0.33,
"grad_norm": 0.07918211817741394,
"learning_rate": 1.53920420531197e-05,
"loss": 1.0602,
"step": 170
},
{
"epoch": 0.33,
"grad_norm": 0.08213488012552261,
"learning_rate": 1.5339232208263394e-05,
"loss": 1.0798,
"step": 171
},
{
"epoch": 0.34,
"grad_norm": 0.07898285239934921,
"learning_rate": 1.5286213255862295e-05,
"loss": 1.0969,
"step": 172
},
{
"epoch": 0.34,
"grad_norm": 0.08233582973480225,
"learning_rate": 1.5232987272369076e-05,
"loss": 1.0699,
"step": 173
},
{
"epoch": 0.34,
"grad_norm": 0.08074311912059784,
"learning_rate": 1.5179556342344643e-05,
"loss": 1.0851,
"step": 174
},
{
"epoch": 0.34,
"grad_norm": 0.08196305483579636,
"learning_rate": 1.51259225583765e-05,
"loss": 1.076,
"step": 175
},
{
"epoch": 0.34,
"grad_norm": 0.08637065440416336,
"learning_rate": 1.5072088020996791e-05,
"loss": 1.0989,
"step": 176
},
{
"epoch": 0.35,
"grad_norm": 0.08313170820474625,
"learning_rate": 1.5018054838600033e-05,
"loss": 1.09,
"step": 177
},
{
"epoch": 0.35,
"grad_norm": 0.08245568722486496,
"learning_rate": 1.496382512736056e-05,
"loss": 1.0572,
"step": 178
},
{
"epoch": 0.35,
"grad_norm": 0.08442118763923645,
"learning_rate": 1.490940101114961e-05,
"loss": 1.0669,
"step": 179
},
{
"epoch": 0.35,
"grad_norm": 0.08224523812532425,
"learning_rate": 1.4854784621452176e-05,
"loss": 1.0842,
"step": 180
},
{
"epoch": 0.35,
"grad_norm": 0.08642537891864777,
"learning_rate": 1.479997809728352e-05,
"loss": 1.123,
"step": 181
},
{
"epoch": 0.35,
"grad_norm": 0.08723440766334534,
"learning_rate": 1.4744983585105388e-05,
"loss": 1.0649,
"step": 182
},
{
"epoch": 0.36,
"grad_norm": 0.08666212856769562,
"learning_rate": 1.4689803238741955e-05,
"loss": 1.0938,
"step": 183
},
{
"epoch": 0.36,
"grad_norm": 0.09213647246360779,
"learning_rate": 1.463443921929548e-05,
"loss": 1.0903,
"step": 184
},
{
"epoch": 0.36,
"grad_norm": 0.08998877555131912,
"learning_rate": 1.4578893695061644e-05,
"loss": 1.0778,
"step": 185
},
{
"epoch": 0.36,
"grad_norm": 0.09158129245042801,
"learning_rate": 1.4523168841444657e-05,
"loss": 1.0932,
"step": 186
},
{
"epoch": 0.36,
"grad_norm": 0.09460633993148804,
"learning_rate": 1.4467266840872041e-05,
"loss": 1.0691,
"step": 187
},
{
"epoch": 0.37,
"grad_norm": 0.09502755105495453,
"learning_rate": 1.441118988270916e-05,
"loss": 1.0684,
"step": 188
},
{
"epoch": 0.37,
"grad_norm": 0.09307122975587845,
"learning_rate": 1.4354940163173486e-05,
"loss": 1.0776,
"step": 189
},
{
"epoch": 0.37,
"grad_norm": 0.09580650180578232,
"learning_rate": 1.4298519885248574e-05,
"loss": 1.0882,
"step": 190
},
{
"epoch": 0.37,
"grad_norm": 0.09251459687948227,
"learning_rate": 1.4241931258597781e-05,
"loss": 1.077,
"step": 191
},
{
"epoch": 0.37,
"grad_norm": 0.09432998299598694,
"learning_rate": 1.4185176499477742e-05,
"loss": 1.0012,
"step": 192
},
{
"epoch": 0.38,
"grad_norm": 0.09586652368307114,
"learning_rate": 1.4128257830651554e-05,
"loss": 1.0334,
"step": 193
},
{
"epoch": 0.38,
"grad_norm": 0.09538242220878601,
"learning_rate": 1.407117748130174e-05,
"loss": 1.0731,
"step": 194
},
{
"epoch": 0.38,
"grad_norm": 0.09691152721643448,
"learning_rate": 1.401393768694292e-05,
"loss": 1.0412,
"step": 195
},
{
"epoch": 0.38,
"grad_norm": 0.09779084473848343,
"learning_rate": 1.3956540689334286e-05,
"loss": 1.0602,
"step": 196
},
{
"epoch": 0.38,
"grad_norm": 0.0998532623052597,
"learning_rate": 1.3898988736391792e-05,
"loss": 1.0261,
"step": 197
},
{
"epoch": 0.39,
"grad_norm": 0.10739872604608536,
"learning_rate": 1.384128408210011e-05,
"loss": 1.0502,
"step": 198
},
{
"epoch": 0.39,
"grad_norm": 0.11806387454271317,
"learning_rate": 1.3783428986424366e-05,
"loss": 1.1188,
"step": 199
},
{
"epoch": 0.39,
"grad_norm": 0.10208501666784286,
"learning_rate": 1.3725425715221625e-05,
"loss": 1.0465,
"step": 200
},
{
"epoch": 0.39,
"grad_norm": 0.1044783741235733,
"learning_rate": 1.3667276540152143e-05,
"loss": 1.0561,
"step": 201
},
{
"epoch": 0.39,
"grad_norm": 0.1070132926106453,
"learning_rate": 1.3608983738590414e-05,
"loss": 1.0429,
"step": 202
},
{
"epoch": 0.4,
"grad_norm": 0.11181865632534027,
"learning_rate": 1.3550549593535965e-05,
"loss": 1.0564,
"step": 203
},
{
"epoch": 0.4,
"grad_norm": 0.11098324507474899,
"learning_rate": 1.3491976393523952e-05,
"loss": 1.0632,
"step": 204
},
{
"epoch": 0.4,
"grad_norm": 0.10281454026699066,
"learning_rate": 1.343326643253552e-05,
"loss": 1.0637,
"step": 205
},
{
"epoch": 0.4,
"grad_norm": 0.10408665239810944,
"learning_rate": 1.3374422009907984e-05,
"loss": 1.0701,
"step": 206
},
{
"epoch": 0.4,
"grad_norm": 0.10533872246742249,
"learning_rate": 1.3315445430244744e-05,
"loss": 1.0654,
"step": 207
},
{
"epoch": 0.41,
"grad_norm": 0.10545054078102112,
"learning_rate": 1.3256339003325054e-05,
"loss": 1.0518,
"step": 208
},
{
"epoch": 0.41,
"grad_norm": 0.09894714504480362,
"learning_rate": 1.3197105044013544e-05,
"loss": 1.0671,
"step": 209
},
{
"epoch": 0.41,
"grad_norm": 0.08720172196626663,
"learning_rate": 1.3137745872169578e-05,
"loss": 1.0127,
"step": 210
},
{
"epoch": 0.41,
"grad_norm": 0.08827454596757889,
"learning_rate": 1.3078263812556377e-05,
"loss": 1.0154,
"step": 211
},
{
"epoch": 0.41,
"grad_norm": 0.0914626345038414,
"learning_rate": 1.3018661194749986e-05,
"loss": 1.0201,
"step": 212
},
{
"epoch": 0.42,
"grad_norm": 0.08843535929918289,
"learning_rate": 1.295894035304803e-05,
"loss": 1.0516,
"step": 213
},
{
"epoch": 0.42,
"grad_norm": 0.08639541268348694,
"learning_rate": 1.28991036263783e-05,
"loss": 1.0165,
"step": 214
},
{
"epoch": 0.42,
"grad_norm": 0.07750130444765091,
"learning_rate": 1.2839153358207142e-05,
"loss": 1.0223,
"step": 215
},
{
"epoch": 0.42,
"grad_norm": 0.0824190005660057,
"learning_rate": 1.2779091896447682e-05,
"loss": 1.0337,
"step": 216
},
{
"epoch": 0.42,
"grad_norm": 0.08451572805643082,
"learning_rate": 1.2718921593367874e-05,
"loss": 1.0542,
"step": 217
},
{
"epoch": 0.43,
"grad_norm": 0.0857366994023323,
"learning_rate": 1.2658644805498361e-05,
"loss": 1.0759,
"step": 218
},
{
"epoch": 0.43,
"grad_norm": 0.07681415975093842,
"learning_rate": 1.2598263893540207e-05,
"loss": 1.0506,
"step": 219
},
{
"epoch": 0.43,
"grad_norm": 0.07856535911560059,
"learning_rate": 1.2537781222272423e-05,
"loss": 1.0974,
"step": 220
},
{
"epoch": 0.43,
"grad_norm": 0.08015410602092743,
"learning_rate": 1.2477199160459345e-05,
"loss": 1.0604,
"step": 221
},
{
"epoch": 0.43,
"grad_norm": 0.08314133435487747,
"learning_rate": 1.2416520080757892e-05,
"loss": 1.0889,
"step": 222
},
{
"epoch": 0.43,
"grad_norm": 0.08028203994035721,
"learning_rate": 1.2355746359624621e-05,
"loss": 1.0281,
"step": 223
},
{
"epoch": 0.44,
"grad_norm": 0.0775797963142395,
"learning_rate": 1.2294880377222649e-05,
"loss": 1.078,
"step": 224
},
{
"epoch": 0.44,
"grad_norm": 0.08315123617649078,
"learning_rate": 1.2233924517328456e-05,
"loss": 1.0356,
"step": 225
},
{
"epoch": 0.44,
"grad_norm": 0.0795183852314949,
"learning_rate": 1.2172881167238515e-05,
"loss": 1.0332,
"step": 226
},
{
"epoch": 0.44,
"grad_norm": 0.0779062882065773,
"learning_rate": 1.2111752717675788e-05,
"loss": 0.9954,
"step": 227
},
{
"epoch": 0.44,
"grad_norm": 0.07758854329586029,
"learning_rate": 1.205054156269611e-05,
"loss": 1.0242,
"step": 228
},
{
"epoch": 0.45,
"grad_norm": 0.07713694125413895,
"learning_rate": 1.1989250099594412e-05,
"loss": 1.0686,
"step": 229
},
{
"epoch": 0.45,
"grad_norm": 0.07772821933031082,
"learning_rate": 1.192788072881085e-05,
"loss": 1.0338,
"step": 230
},
{
"epoch": 0.45,
"grad_norm": 0.08006665855646133,
"learning_rate": 1.1866435853836773e-05,
"loss": 1.0946,
"step": 231
},
{
"epoch": 0.45,
"grad_norm": 0.0821637436747551,
"learning_rate": 1.1804917881120608e-05,
"loss": 1.0525,
"step": 232
},
{
"epoch": 0.45,
"grad_norm": 0.07892850786447525,
"learning_rate": 1.1743329219973609e-05,
"loss": 1.0127,
"step": 233
},
{
"epoch": 0.46,
"grad_norm": 0.07800798863172531,
"learning_rate": 1.1681672282475495e-05,
"loss": 1.0254,
"step": 234
},
{
"epoch": 0.46,
"grad_norm": 0.07875402271747589,
"learning_rate": 1.161994948337998e-05,
"loss": 1.0319,
"step": 235
},
{
"epoch": 0.46,
"grad_norm": 0.08178096264600754,
"learning_rate": 1.1558163240020209e-05,
"loss": 1.0541,
"step": 236
},
{
"epoch": 0.46,
"grad_norm": 0.08126726001501083,
"learning_rate": 1.1496315972214076e-05,
"loss": 1.0681,
"step": 237
},
{
"epoch": 0.46,
"grad_norm": 0.08104463666677475,
"learning_rate": 1.1434410102169462e-05,
"loss": 0.9767,
"step": 238
},
{
"epoch": 0.47,
"grad_norm": 0.0746295303106308,
"learning_rate": 1.1372448054389364e-05,
"loss": 1.0586,
"step": 239
},
{
"epoch": 0.47,
"grad_norm": 0.08171354979276657,
"learning_rate": 1.1310432255576944e-05,
"loss": 1.0655,
"step": 240
},
{
"epoch": 0.47,
"grad_norm": 0.08069796115159988,
"learning_rate": 1.1248365134540489e-05,
"loss": 1.079,
"step": 241
},
{
"epoch": 0.47,
"grad_norm": 0.07922904193401337,
"learning_rate": 1.1186249122098282e-05,
"loss": 1.0371,
"step": 242
},
{
"epoch": 0.47,
"grad_norm": 0.07877922058105469,
"learning_rate": 1.1124086650983415e-05,
"loss": 1.0236,
"step": 243
},
{
"epoch": 0.48,
"grad_norm": 0.07606945931911469,
"learning_rate": 1.1061880155748497e-05,
"loss": 1.0255,
"step": 244
},
{
"epoch": 0.48,
"grad_norm": 0.08225277811288834,
"learning_rate": 1.0999632072670314e-05,
"loss": 1.0571,
"step": 245
},
{
"epoch": 0.48,
"grad_norm": 0.07907744497060776,
"learning_rate": 1.0937344839654416e-05,
"loss": 1.0745,
"step": 246
},
{
"epoch": 0.48,
"grad_norm": 0.07885382324457169,
"learning_rate": 1.087502089613963e-05,
"loss": 0.9899,
"step": 247
},
{
"epoch": 0.48,
"grad_norm": 0.08236192911863327,
"learning_rate": 1.0812662683002528e-05,
"loss": 1.046,
"step": 248
},
{
"epoch": 0.49,
"grad_norm": 0.08153583109378815,
"learning_rate": 1.075027264246183e-05,
"loss": 1.0769,
"step": 249
},
{
"epoch": 0.49,
"grad_norm": 0.0847182348370552,
"learning_rate": 1.068785321798276e-05,
"loss": 1.0695,
"step": 250
},
{
"epoch": 0.49,
"grad_norm": 0.07414229959249496,
"learning_rate": 1.062540685418133e-05,
"loss": 1.0555,
"step": 251
},
{
"epoch": 0.49,
"grad_norm": 0.07932449132204056,
"learning_rate": 1.0562935996728629e-05,
"loss": 1.0644,
"step": 252
},
{
"epoch": 0.49,
"grad_norm": 0.08247576653957367,
"learning_rate": 1.0500443092255017e-05,
"loss": 1.064,
"step": 253
},
{
"epoch": 0.5,
"grad_norm": 0.07860003411769867,
"learning_rate": 1.043793058825431e-05,
"loss": 1.0579,
"step": 254
},
{
"epoch": 0.5,
"grad_norm": 0.08330255001783371,
"learning_rate": 1.0375400932987932e-05,
"loss": 1.0218,
"step": 255
},
{
"epoch": 0.5,
"grad_norm": 0.08150562644004822,
"learning_rate": 1.0312856575389016e-05,
"loss": 1.0379,
"step": 256
},
{
"epoch": 0.5,
"eval_loss": 1.0509783029556274,
"eval_runtime": 708.357,
"eval_samples_per_second": 7.163,
"eval_steps_per_second": 1.791,
"step": 256
}
],
"logging_steps": 1,
"max_steps": 512,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 256,
"total_flos": 2.262770368118784e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}