{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.8924033522573669,
"eval_steps": 500,
"global_step": 10500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02,
"grad_norm": 0.87135249376297,
"learning_rate": 0.00019819808452978876,
"loss": 2.1425,
"step": 100
},
{
"epoch": 0.04,
"grad_norm": 0.8564028739929199,
"learning_rate": 0.0001963956283227154,
"loss": 1.8647,
"step": 200
},
{
"epoch": 0.05,
"grad_norm": 0.8958914875984192,
"learning_rate": 0.000194593172115642,
"loss": 1.8182,
"step": 300
},
{
"epoch": 0.07,
"grad_norm": 0.8007214069366455,
"learning_rate": 0.00019279071590856862,
"loss": 1.711,
"step": 400
},
{
"epoch": 0.09,
"grad_norm": 0.8793672919273376,
"learning_rate": 0.00019098825970149526,
"loss": 1.686,
"step": 500
},
{
"epoch": 0.11,
"grad_norm": 0.9486576318740845,
"learning_rate": 0.00018918580349442187,
"loss": 1.7068,
"step": 600
},
{
"epoch": 0.13,
"grad_norm": 0.8130625486373901,
"learning_rate": 0.0001873833472873485,
"loss": 1.6457,
"step": 700
},
{
"epoch": 0.14,
"grad_norm": 0.8903294801712036,
"learning_rate": 0.00018558089108027513,
"loss": 1.6285,
"step": 800
},
{
"epoch": 0.16,
"grad_norm": 0.9597026109695435,
"learning_rate": 0.0001837784348732017,
"loss": 1.6305,
"step": 900
},
{
"epoch": 0.18,
"grad_norm": 0.7700974941253662,
"learning_rate": 0.00018197597866612835,
"loss": 1.6443,
"step": 1000
},
{
"epoch": 0.2,
"grad_norm": 0.8106345534324646,
"learning_rate": 0.00018017352245905497,
"loss": 1.5963,
"step": 1100
},
{
"epoch": 0.22,
"grad_norm": 1.026309847831726,
"learning_rate": 0.00017837106625198158,
"loss": 1.5615,
"step": 1200
},
{
"epoch": 0.23,
"grad_norm": 0.8697523474693298,
"learning_rate": 0.00017656861004490822,
"loss": 1.5295,
"step": 1300
},
{
"epoch": 0.25,
"grad_norm": 1.2120341062545776,
"learning_rate": 0.00017476615383783483,
"loss": 1.4984,
"step": 1400
},
{
"epoch": 0.27,
"grad_norm": 0.7356016039848328,
"learning_rate": 0.00017296369763076145,
"loss": 1.5311,
"step": 1500
},
{
"epoch": 0.29,
"grad_norm": 0.8384151458740234,
"learning_rate": 0.00017116124142368809,
"loss": 1.5632,
"step": 1600
},
{
"epoch": 0.31,
"grad_norm": 0.8941056132316589,
"learning_rate": 0.0001693587852166147,
"loss": 1.492,
"step": 1700
},
{
"epoch": 0.32,
"grad_norm": 0.7094323039054871,
"learning_rate": 0.0001675563290095413,
"loss": 1.4425,
"step": 1800
},
{
"epoch": 0.34,
"grad_norm": 0.7246663570404053,
"learning_rate": 0.00016575387280246795,
"loss": 1.4277,
"step": 1900
},
{
"epoch": 0.36,
"grad_norm": 0.8121210932731628,
"learning_rate": 0.00016395141659539456,
"loss": 1.464,
"step": 2000
},
{
"epoch": 0.38,
"grad_norm": 0.712011456489563,
"learning_rate": 0.0001621489603883212,
"loss": 1.4779,
"step": 2100
},
{
"epoch": 0.4,
"grad_norm": 0.7419346570968628,
"learning_rate": 0.00016034650418124782,
"loss": 1.4669,
"step": 2200
},
{
"epoch": 0.41,
"grad_norm": 1.0694609880447388,
"learning_rate": 0.0001585440479741744,
"loss": 1.4499,
"step": 2300
},
{
"epoch": 0.43,
"grad_norm": 1.0339300632476807,
"learning_rate": 0.00015674159176710104,
"loss": 1.4115,
"step": 2400
},
{
"epoch": 0.45,
"grad_norm": 1.1224662065505981,
"learning_rate": 0.00015493913556002766,
"loss": 1.4353,
"step": 2500
},
{
"epoch": 0.47,
"grad_norm": 0.8455696702003479,
"learning_rate": 0.0001531366793529543,
"loss": 1.3991,
"step": 2600
},
{
"epoch": 0.49,
"grad_norm": 0.8783261179924011,
"learning_rate": 0.0001513342231458809,
"loss": 1.4174,
"step": 2700
},
{
"epoch": 0.5,
"grad_norm": 0.8644577264785767,
"learning_rate": 0.00014953176693880752,
"loss": 1.4272,
"step": 2800
},
{
"epoch": 0.52,
"grad_norm": 0.804175853729248,
"learning_rate": 0.00014772931073173416,
"loss": 1.3904,
"step": 2900
},
{
"epoch": 0.54,
"grad_norm": 0.8686081767082214,
"learning_rate": 0.00014592685452466077,
"loss": 1.3577,
"step": 3000
},
{
"epoch": 0.56,
"grad_norm": 0.8131946325302124,
"learning_rate": 0.0001441243983175874,
"loss": 1.3798,
"step": 3100
},
{
"epoch": 0.58,
"grad_norm": 0.9579694271087646,
"learning_rate": 0.00014232194211051403,
"loss": 1.3705,
"step": 3200
},
{
"epoch": 0.59,
"grad_norm": 0.7878475785255432,
"learning_rate": 0.00014051948590344064,
"loss": 1.3216,
"step": 3300
},
{
"epoch": 0.61,
"grad_norm": 0.9384462833404541,
"learning_rate": 0.00013871702969636725,
"loss": 1.3681,
"step": 3400
},
{
"epoch": 0.63,
"grad_norm": 0.899638295173645,
"learning_rate": 0.0001369145734892939,
"loss": 1.3752,
"step": 3500
},
{
"epoch": 0.65,
"grad_norm": 0.8509306907653809,
"learning_rate": 0.0001351121172822205,
"loss": 1.3253,
"step": 3600
},
{
"epoch": 0.67,
"grad_norm": 0.712924063205719,
"learning_rate": 0.00013330966107514712,
"loss": 1.3318,
"step": 3700
},
{
"epoch": 0.68,
"grad_norm": 0.8807259798049927,
"learning_rate": 0.00013150720486807373,
"loss": 1.3163,
"step": 3800
},
{
"epoch": 0.7,
"grad_norm": 0.9081091284751892,
"learning_rate": 0.00012970474866100034,
"loss": 1.3839,
"step": 3900
},
{
"epoch": 0.72,
"grad_norm": 1.0412542819976807,
"learning_rate": 0.00012790229245392698,
"loss": 1.3057,
"step": 4000
},
{
"epoch": 0.74,
"grad_norm": 0.8416357636451721,
"learning_rate": 0.0001260998362468536,
"loss": 1.2548,
"step": 4100
},
{
"epoch": 0.76,
"grad_norm": 0.8973735570907593,
"learning_rate": 0.0001242973800397802,
"loss": 1.3154,
"step": 4200
},
{
"epoch": 0.77,
"grad_norm": 0.7394294738769531,
"learning_rate": 0.00012249492383270685,
"loss": 1.3079,
"step": 4300
},
{
"epoch": 0.79,
"grad_norm": 1.1180624961853027,
"learning_rate": 0.00012069246762563346,
"loss": 1.2789,
"step": 4400
},
{
"epoch": 0.81,
"grad_norm": 0.8885756134986877,
"learning_rate": 0.00011889001141856009,
"loss": 1.2959,
"step": 4500
},
{
"epoch": 0.83,
"grad_norm": 1.1742843389511108,
"learning_rate": 0.00011708755521148672,
"loss": 1.2486,
"step": 4600
},
{
"epoch": 0.85,
"grad_norm": 0.9566686153411865,
"learning_rate": 0.00011528509900441333,
"loss": 1.303,
"step": 4700
},
{
"epoch": 0.87,
"grad_norm": 1.2613877058029175,
"learning_rate": 0.00011348264279733996,
"loss": 1.301,
"step": 4800
},
{
"epoch": 0.88,
"grad_norm": 0.9030331969261169,
"learning_rate": 0.00011168018659026658,
"loss": 1.3338,
"step": 4900
},
{
"epoch": 0.9,
"grad_norm": 1.0433690547943115,
"learning_rate": 0.00010987773038319318,
"loss": 1.3068,
"step": 5000
},
{
"epoch": 0.92,
"grad_norm": 0.8587890267372131,
"learning_rate": 0.00010807527417611981,
"loss": 1.2632,
"step": 5100
},
{
"epoch": 0.94,
"grad_norm": 1.0812350511550903,
"learning_rate": 0.00010627281796904642,
"loss": 1.278,
"step": 5200
},
{
"epoch": 0.96,
"grad_norm": 0.8623504042625427,
"learning_rate": 0.00010447036176197305,
"loss": 1.2392,
"step": 5300
},
{
"epoch": 0.97,
"grad_norm": 0.8327571749687195,
"learning_rate": 0.00010266790555489967,
"loss": 1.2652,
"step": 5400
},
{
"epoch": 0.99,
"grad_norm": 0.958329975605011,
"learning_rate": 0.00010086544934782629,
"loss": 1.286,
"step": 5500
},
{
"epoch": 1.01,
"grad_norm": 0.9664350748062134,
"learning_rate": 9.906299314075291e-05,
"loss": 1.1171,
"step": 5600
},
{
"epoch": 1.03,
"grad_norm": 0.8452981114387512,
"learning_rate": 9.726053693367954e-05,
"loss": 1.0828,
"step": 5700
},
{
"epoch": 1.05,
"grad_norm": 1.0611803531646729,
"learning_rate": 9.545808072660615e-05,
"loss": 1.1142,
"step": 5800
},
{
"epoch": 1.06,
"grad_norm": 1.0450036525726318,
"learning_rate": 9.365562451953278e-05,
"loss": 1.1217,
"step": 5900
},
{
"epoch": 1.08,
"grad_norm": 0.9196897745132446,
"learning_rate": 9.18531683124594e-05,
"loss": 1.1435,
"step": 6000
},
{
"epoch": 1.1,
"grad_norm": 1.241141676902771,
"learning_rate": 9.005071210538602e-05,
"loss": 1.1174,
"step": 6100
},
{
"epoch": 1.12,
"grad_norm": 0.8073747754096985,
"learning_rate": 8.826628046038338e-05,
"loss": 1.0501,
"step": 6200
},
{
"epoch": 1.14,
"grad_norm": 0.8413310647010803,
"learning_rate": 8.646382425331e-05,
"loss": 1.1023,
"step": 6300
},
{
"epoch": 1.15,
"grad_norm": 0.8178868889808655,
"learning_rate": 8.466136804623662e-05,
"loss": 1.0948,
"step": 6400
},
{
"epoch": 1.17,
"grad_norm": 0.9561821222305298,
"learning_rate": 8.285891183916323e-05,
"loss": 1.0936,
"step": 6500
},
{
"epoch": 1.19,
"grad_norm": 0.9246460199356079,
"learning_rate": 8.105645563208986e-05,
"loss": 1.0679,
"step": 6600
},
{
"epoch": 1.21,
"grad_norm": 0.9705007076263428,
"learning_rate": 7.925399942501648e-05,
"loss": 1.026,
"step": 6700
},
{
"epoch": 1.23,
"grad_norm": 0.9710861444473267,
"learning_rate": 7.74515432179431e-05,
"loss": 1.0543,
"step": 6800
},
{
"epoch": 1.24,
"grad_norm": 1.0675069093704224,
"learning_rate": 7.564908701086972e-05,
"loss": 1.0987,
"step": 6900
},
{
"epoch": 1.26,
"grad_norm": 0.8517453670501709,
"learning_rate": 7.384663080379635e-05,
"loss": 1.0693,
"step": 7000
},
{
"epoch": 1.28,
"grad_norm": 0.901584267616272,
"learning_rate": 7.204417459672296e-05,
"loss": 1.0685,
"step": 7100
},
{
"epoch": 1.3,
"grad_norm": 1.0663121938705444,
"learning_rate": 7.024171838964957e-05,
"loss": 1.0802,
"step": 7200
},
{
"epoch": 1.32,
"grad_norm": 1.0489306449890137,
"learning_rate": 6.84392621825762e-05,
"loss": 1.1116,
"step": 7300
},
{
"epoch": 1.33,
"grad_norm": 0.8096909523010254,
"learning_rate": 6.663680597550283e-05,
"loss": 1.0675,
"step": 7400
},
{
"epoch": 1.35,
"grad_norm": 1.0951379537582397,
"learning_rate": 6.483434976842945e-05,
"loss": 1.0631,
"step": 7500
},
{
"epoch": 1.37,
"grad_norm": 1.08359956741333,
"learning_rate": 6.303189356135607e-05,
"loss": 1.0522,
"step": 7600
},
{
"epoch": 1.39,
"grad_norm": 1.22184419631958,
"learning_rate": 6.122943735428269e-05,
"loss": 1.0878,
"step": 7700
},
{
"epoch": 1.41,
"grad_norm": 1.087251901626587,
"learning_rate": 5.9426981147209305e-05,
"loss": 1.0659,
"step": 7800
},
{
"epoch": 1.42,
"grad_norm": 1.020251750946045,
"learning_rate": 5.7624524940135925e-05,
"loss": 1.0668,
"step": 7900
},
{
"epoch": 1.44,
"grad_norm": 0.9591791033744812,
"learning_rate": 5.582206873306255e-05,
"loss": 1.0702,
"step": 8000
},
{
"epoch": 1.46,
"grad_norm": 1.0169813632965088,
"learning_rate": 5.401961252598917e-05,
"loss": 1.0375,
"step": 8100
},
{
"epoch": 1.48,
"grad_norm": 1.044224739074707,
"learning_rate": 5.22171563189158e-05,
"loss": 1.0777,
"step": 8200
},
{
"epoch": 1.5,
"grad_norm": 1.0525567531585693,
"learning_rate": 5.041470011184242e-05,
"loss": 0.9967,
"step": 8300
},
{
"epoch": 1.51,
"grad_norm": 0.9581038951873779,
"learning_rate": 4.861224390476904e-05,
"loss": 1.0595,
"step": 8400
},
{
"epoch": 1.53,
"grad_norm": 1.0464085340499878,
"learning_rate": 4.6809787697695656e-05,
"loss": 1.0423,
"step": 8500
},
{
"epoch": 1.55,
"grad_norm": 0.982803225517273,
"learning_rate": 4.5007331490622276e-05,
"loss": 1.0683,
"step": 8600
},
{
"epoch": 1.57,
"grad_norm": 1.1214386224746704,
"learning_rate": 4.3204875283548896e-05,
"loss": 1.0984,
"step": 8700
},
{
"epoch": 1.59,
"grad_norm": 1.0456256866455078,
"learning_rate": 4.1402419076475515e-05,
"loss": 1.0549,
"step": 8800
},
{
"epoch": 1.6,
"grad_norm": 1.0025187730789185,
"learning_rate": 3.959996286940214e-05,
"loss": 1.024,
"step": 8900
},
{
"epoch": 1.62,
"grad_norm": 1.2760844230651855,
"learning_rate": 3.7797506662328755e-05,
"loss": 1.0313,
"step": 9000
},
{
"epoch": 1.64,
"grad_norm": 0.9632763862609863,
"learning_rate": 3.599505045525538e-05,
"loss": 1.0263,
"step": 9100
},
{
"epoch": 1.66,
"grad_norm": 1.01961088180542,
"learning_rate": 3.4192594248182e-05,
"loss": 1.0503,
"step": 9200
},
{
"epoch": 1.68,
"grad_norm": 0.9579876065254211,
"learning_rate": 3.239013804110862e-05,
"loss": 1.03,
"step": 9300
},
{
"epoch": 1.69,
"grad_norm": 1.400282859802246,
"learning_rate": 3.058768183403524e-05,
"loss": 1.0481,
"step": 9400
},
{
"epoch": 1.71,
"grad_norm": 1.1665406227111816,
"learning_rate": 2.8785225626961863e-05,
"loss": 1.0585,
"step": 9500
},
{
"epoch": 1.73,
"grad_norm": 1.1331160068511963,
"learning_rate": 2.6982769419888486e-05,
"loss": 0.9992,
"step": 9600
},
{
"epoch": 1.75,
"grad_norm": 1.0598838329315186,
"learning_rate": 2.5180313212815106e-05,
"loss": 1.0663,
"step": 9700
},
{
"epoch": 1.77,
"grad_norm": 1.0826873779296875,
"learning_rate": 2.3377857005741726e-05,
"loss": 0.9844,
"step": 9800
},
{
"epoch": 1.78,
"grad_norm": 0.9529953002929688,
"learning_rate": 2.1575400798668345e-05,
"loss": 1.0349,
"step": 9900
},
{
"epoch": 1.8,
"grad_norm": 1.0949389934539795,
"learning_rate": 1.977294459159497e-05,
"loss": 1.0473,
"step": 10000
},
{
"epoch": 1.82,
"grad_norm": 1.0248372554779053,
"learning_rate": 1.7970488384521588e-05,
"loss": 1.0382,
"step": 10100
},
{
"epoch": 1.84,
"grad_norm": 0.9931679368019104,
"learning_rate": 1.616803217744821e-05,
"loss": 1.017,
"step": 10200
},
{
"epoch": 1.86,
"grad_norm": 0.9561355710029602,
"learning_rate": 1.436557597037483e-05,
"loss": 1.0577,
"step": 10300
},
{
"epoch": 1.87,
"grad_norm": 1.2075040340423584,
"learning_rate": 1.2563119763301454e-05,
"loss": 1.0493,
"step": 10400
},
{
"epoch": 1.89,
"grad_norm": 1.1561285257339478,
"learning_rate": 1.0760663556228073e-05,
"loss": 0.9947,
"step": 10500
}
],
"logging_steps": 100,
"max_steps": 11096,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"total_flos": 3.714097560675041e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}