imdatta0's picture
End of training
8b1b653 verified
raw
history blame
22.2 kB
{
"best_metric": 0.9216057062149048,
"best_model_checkpoint": "/home/datta0/models/lora_final/Qwen2-7B_magiccoder_reverse/checkpoint-4",
"epoch": 0.99836867862969,
"eval_steps": 4,
"global_step": 153,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0065252854812398045,
"grad_norm": 6.942051887512207,
"learning_rate": 7.5e-05,
"loss": 0.8051,
"step": 1
},
{
"epoch": 0.013050570962479609,
"grad_norm": 5.598752498626709,
"learning_rate": 0.00015,
"loss": 0.9302,
"step": 2
},
{
"epoch": 0.026101141924959218,
"grad_norm": 4.465769290924072,
"learning_rate": 0.0003,
"loss": 0.8213,
"step": 4
},
{
"epoch": 0.026101141924959218,
"eval_loss": 0.9216057062149048,
"eval_runtime": 24.7492,
"eval_samples_per_second": 19.758,
"eval_steps_per_second": 2.505,
"step": 4
},
{
"epoch": 0.03915171288743882,
"grad_norm": 3.274362564086914,
"learning_rate": 0.00029986665273697545,
"loss": 0.8625,
"step": 6
},
{
"epoch": 0.052202283849918436,
"grad_norm": 5.245540618896484,
"learning_rate": 0.0002994668480344693,
"loss": 0.9106,
"step": 8
},
{
"epoch": 0.052202283849918436,
"eval_loss": 0.9644020795822144,
"eval_runtime": 24.7265,
"eval_samples_per_second": 19.776,
"eval_steps_per_second": 2.507,
"step": 8
},
{
"epoch": 0.06525285481239804,
"grad_norm": 2.9444775581359863,
"learning_rate": 0.0002988012967306524,
"loss": 0.928,
"step": 10
},
{
"epoch": 0.07830342577487764,
"grad_norm": 3.0126848220825195,
"learning_rate": 0.000297871182151455,
"loss": 0.9506,
"step": 12
},
{
"epoch": 0.07830342577487764,
"eval_loss": 0.9630553722381592,
"eval_runtime": 24.7024,
"eval_samples_per_second": 19.796,
"eval_steps_per_second": 2.51,
"step": 12
},
{
"epoch": 0.09135399673735727,
"grad_norm": 2.676522731781006,
"learning_rate": 0.00029667815800665635,
"loss": 0.9756,
"step": 14
},
{
"epoch": 0.10440456769983687,
"grad_norm": 2.626117467880249,
"learning_rate": 0.0002952243454496488,
"loss": 0.9339,
"step": 16
},
{
"epoch": 0.10440456769983687,
"eval_loss": 0.9845271706581116,
"eval_runtime": 24.6457,
"eval_samples_per_second": 19.841,
"eval_steps_per_second": 2.516,
"step": 16
},
{
"epoch": 0.11745513866231648,
"grad_norm": 2.454320192337036,
"learning_rate": 0.0002935123293061047,
"loss": 0.9374,
"step": 18
},
{
"epoch": 0.13050570962479607,
"grad_norm": 2.6138737201690674,
"learning_rate": 0.0002915451534782506,
"loss": 1.0039,
"step": 20
},
{
"epoch": 0.13050570962479607,
"eval_loss": 0.9996753334999084,
"eval_runtime": 24.5982,
"eval_samples_per_second": 19.879,
"eval_steps_per_second": 2.521,
"step": 20
},
{
"epoch": 0.14355628058727568,
"grad_norm": 2.4134886264801025,
"learning_rate": 0.0002893263155329204,
"loss": 0.9932,
"step": 22
},
{
"epoch": 0.1566068515497553,
"grad_norm": 2.5122950077056885,
"learning_rate": 0.00028685976048300875,
"loss": 0.9095,
"step": 24
},
{
"epoch": 0.1566068515497553,
"eval_loss": 1.0116466283798218,
"eval_runtime": 24.4885,
"eval_samples_per_second": 19.969,
"eval_steps_per_second": 2.532,
"step": 24
},
{
"epoch": 0.16965742251223492,
"grad_norm": 2.41286563873291,
"learning_rate": 0.00028414987377338235,
"loss": 1.0434,
"step": 26
},
{
"epoch": 0.18270799347471453,
"grad_norm": 2.3061952590942383,
"learning_rate": 0.0002812014734837191,
"loss": 0.9241,
"step": 28
},
{
"epoch": 0.18270799347471453,
"eval_loss": 1.0198205709457397,
"eval_runtime": 73.4147,
"eval_samples_per_second": 6.661,
"eval_steps_per_second": 0.845,
"step": 28
},
{
"epoch": 0.19575856443719414,
"grad_norm": 2.5358309745788574,
"learning_rate": 0.0002780198017621379,
"loss": 1.0064,
"step": 30
},
{
"epoch": 0.20880913539967375,
"grad_norm": 2.349397897720337,
"learning_rate": 0.00027461051550485116,
"loss": 1.0582,
"step": 32
},
{
"epoch": 0.20880913539967375,
"eval_loss": 1.0290634632110596,
"eval_runtime": 57.0475,
"eval_samples_per_second": 8.572,
"eval_steps_per_second": 1.087,
"step": 32
},
{
"epoch": 0.22185970636215335,
"grad_norm": 2.219332456588745,
"learning_rate": 0.00027097967629840906,
"loss": 0.9762,
"step": 34
},
{
"epoch": 0.23491027732463296,
"grad_norm": 2.143191337585449,
"learning_rate": 0.0002671337396424204,
"loss": 0.9677,
"step": 36
},
{
"epoch": 0.23491027732463296,
"eval_loss": 1.0306977033615112,
"eval_runtime": 55.6789,
"eval_samples_per_second": 8.783,
"eval_steps_per_second": 1.114,
"step": 36
},
{
"epoch": 0.24796084828711257,
"grad_norm": 2.179919481277466,
"learning_rate": 0.00026307954347190983,
"loss": 0.9415,
"step": 38
},
{
"epoch": 0.26101141924959215,
"grad_norm": 2.126628875732422,
"learning_rate": 0.00025882429599971866,
"loss": 1.0044,
"step": 40
},
{
"epoch": 0.26101141924959215,
"eval_loss": 1.03548002243042,
"eval_runtime": 56.2682,
"eval_samples_per_second": 8.691,
"eval_steps_per_second": 1.102,
"step": 40
},
{
"epoch": 0.2740619902120718,
"grad_norm": 1.9612793922424316,
"learning_rate": 0.0002543755629005657,
"loss": 0.9929,
"step": 42
},
{
"epoch": 0.28711256117455136,
"grad_norm": 2.20817494392395,
"learning_rate": 0.0002497412538595537,
"loss": 1.0672,
"step": 44
},
{
"epoch": 0.28711256117455136,
"eval_loss": 1.038764476776123,
"eval_runtime": 56.5986,
"eval_samples_per_second": 8.64,
"eval_steps_per_second": 1.095,
"step": 44
},
{
"epoch": 0.300163132137031,
"grad_norm": 2.2024221420288086,
"learning_rate": 0.00024492960850903755,
"loss": 1.0003,
"step": 46
},
{
"epoch": 0.3132137030995106,
"grad_norm": 1.9989386796951294,
"learning_rate": 0.00023994918177885902,
"loss": 1.0368,
"step": 48
},
{
"epoch": 0.3132137030995106,
"eval_loss": 1.0401816368103027,
"eval_runtime": 55.8152,
"eval_samples_per_second": 8.761,
"eval_steps_per_second": 1.111,
"step": 48
},
{
"epoch": 0.3262642740619902,
"grad_norm": 2.487414836883545,
"learning_rate": 0.0002348088286859938,
"loss": 1.0797,
"step": 50
},
{
"epoch": 0.33931484502446985,
"grad_norm": 2.199925661087036,
"learning_rate": 0.00022951768859065402,
"loss": 0.9603,
"step": 52
},
{
"epoch": 0.33931484502446985,
"eval_loss": 1.042026400566101,
"eval_runtime": 56.1668,
"eval_samples_per_second": 8.706,
"eval_steps_per_second": 1.104,
"step": 52
},
{
"epoch": 0.3523654159869494,
"grad_norm": 2.082878589630127,
"learning_rate": 0.0002240851689468395,
"loss": 0.9746,
"step": 54
},
{
"epoch": 0.36541598694942906,
"grad_norm": 2.201341390609741,
"learning_rate": 0.00021852092857622808,
"loss": 0.9709,
"step": 56
},
{
"epoch": 0.36541598694942906,
"eval_loss": 1.0397862195968628,
"eval_runtime": 57.0586,
"eval_samples_per_second": 8.57,
"eval_steps_per_second": 1.087,
"step": 56
},
{
"epoch": 0.37846655791190864,
"grad_norm": 2.0373122692108154,
"learning_rate": 0.00021283486049514277,
"loss": 1.0489,
"step": 58
},
{
"epoch": 0.3915171288743883,
"grad_norm": 2.22078537940979,
"learning_rate": 0.00020703707432513004,
"loss": 1.0019,
"step": 60
},
{
"epoch": 0.3915171288743883,
"eval_loss": 1.0403335094451904,
"eval_runtime": 56.1745,
"eval_samples_per_second": 8.705,
"eval_steps_per_second": 1.104,
"step": 60
},
{
"epoch": 0.40456769983686786,
"grad_norm": 2.051842212677002,
"learning_rate": 0.00020113787831842152,
"loss": 0.9318,
"step": 62
},
{
"epoch": 0.4176182707993475,
"grad_norm": 2.2714943885803223,
"learning_rate": 0.0001951477610302378,
"loss": 1.0537,
"step": 64
},
{
"epoch": 0.4176182707993475,
"eval_loss": 1.0384303331375122,
"eval_runtime": 55.0642,
"eval_samples_per_second": 8.881,
"eval_steps_per_second": 1.126,
"step": 64
},
{
"epoch": 0.43066884176182707,
"grad_norm": 2.05861234664917,
"learning_rate": 0.0001890773726705198,
"loss": 1.0197,
"step": 66
},
{
"epoch": 0.4437194127243067,
"grad_norm": 2.1242425441741943,
"learning_rate": 0.00018293750616824443,
"loss": 1.0365,
"step": 68
},
{
"epoch": 0.4437194127243067,
"eval_loss": 1.0344992876052856,
"eval_runtime": 24.7472,
"eval_samples_per_second": 19.76,
"eval_steps_per_second": 2.505,
"step": 68
},
{
"epoch": 0.4567699836867863,
"grad_norm": 1.84840989112854,
"learning_rate": 0.00017673907798199052,
"loss": 1.0531,
"step": 70
},
{
"epoch": 0.4698205546492659,
"grad_norm": 2.0347371101379395,
"learning_rate": 0.000170493108690874,
"loss": 1.0,
"step": 72
},
{
"epoch": 0.4698205546492659,
"eval_loss": 1.0331792831420898,
"eval_runtime": 24.7561,
"eval_samples_per_second": 19.753,
"eval_steps_per_second": 2.504,
"step": 72
},
{
"epoch": 0.4828711256117455,
"grad_norm": 1.9134975671768188,
"learning_rate": 0.00016421070340036023,
"loss": 1.0346,
"step": 74
},
{
"epoch": 0.49592169657422513,
"grad_norm": 2.098032236099243,
"learning_rate": 0.00015790303199779193,
"loss": 1.0165,
"step": 76
},
{
"epoch": 0.49592169657422513,
"eval_loss": 1.0305981636047363,
"eval_runtime": 24.7106,
"eval_samples_per_second": 19.789,
"eval_steps_per_second": 2.509,
"step": 76
},
{
"epoch": 0.5089722675367048,
"grad_norm": 2.1426265239715576,
"learning_rate": 0.00015158130929273695,
"loss": 0.9569,
"step": 78
},
{
"epoch": 0.5220228384991843,
"grad_norm": 1.9341685771942139,
"learning_rate": 0.00014525677507746615,
"loss": 0.9778,
"step": 80
},
{
"epoch": 0.5220228384991843,
"eval_loss": 1.0271245241165161,
"eval_runtime": 24.6768,
"eval_samples_per_second": 19.816,
"eval_steps_per_second": 2.512,
"step": 80
},
{
"epoch": 0.5350734094616639,
"grad_norm": 1.9822169542312622,
"learning_rate": 0.00013894067414301314,
"loss": 1.0639,
"step": 82
},
{
"epoch": 0.5481239804241436,
"grad_norm": 1.9346858263015747,
"learning_rate": 0.0001326442362863458,
"loss": 1.0497,
"step": 84
},
{
"epoch": 0.5481239804241436,
"eval_loss": 1.0228689908981323,
"eval_runtime": 24.5949,
"eval_samples_per_second": 19.882,
"eval_steps_per_second": 2.521,
"step": 84
},
{
"epoch": 0.5611745513866232,
"grad_norm": 1.9368449449539185,
"learning_rate": 0.00012637865634419735,
"loss": 1.013,
"step": 86
},
{
"epoch": 0.5742251223491027,
"grad_norm": 1.977944016456604,
"learning_rate": 0.00012015507428905507,
"loss": 0.9652,
"step": 88
},
{
"epoch": 0.5742251223491027,
"eval_loss": 1.020308256149292,
"eval_runtime": 24.5264,
"eval_samples_per_second": 19.938,
"eval_steps_per_second": 2.528,
"step": 88
},
{
"epoch": 0.5872756933115824,
"grad_norm": 2.2710931301116943,
"learning_rate": 0.00011398455542269575,
"loss": 0.93,
"step": 90
},
{
"epoch": 0.600326264274062,
"grad_norm": 1.9686428308486938,
"learning_rate": 0.00010787807070248305,
"loss": 1.0435,
"step": 92
},
{
"epoch": 0.600326264274062,
"eval_loss": 1.0184926986694336,
"eval_runtime": 24.4479,
"eval_samples_per_second": 20.002,
"eval_steps_per_second": 2.536,
"step": 92
},
{
"epoch": 0.6133768352365416,
"grad_norm": 2.019303560256958,
"learning_rate": 0.00010184647723540557,
"loss": 0.9686,
"step": 94
},
{
"epoch": 0.6264274061990212,
"grad_norm": 1.969067096710205,
"learning_rate": 9.590049897453668e-05,
"loss": 0.9769,
"step": 96
},
{
"epoch": 0.6264274061990212,
"eval_loss": 1.0141024589538574,
"eval_runtime": 55.8487,
"eval_samples_per_second": 8.756,
"eval_steps_per_second": 1.11,
"step": 96
},
{
"epoch": 0.6394779771615008,
"grad_norm": 1.8334566354751587,
"learning_rate": 9.005070765223768e-05,
"loss": 1.0576,
"step": 98
},
{
"epoch": 0.6525285481239804,
"grad_norm": 2.123537302017212,
"learning_rate": 8.430750398400308e-05,
"loss": 1.0648,
"step": 100
},
{
"epoch": 0.6525285481239804,
"eval_loss": 1.0104012489318848,
"eval_runtime": 56.3038,
"eval_samples_per_second": 8.685,
"eval_steps_per_second": 1.101,
"step": 100
},
{
"epoch": 0.6655791190864601,
"grad_norm": 1.8945276737213135,
"learning_rate": 7.868109917636821e-05,
"loss": 0.9739,
"step": 102
},
{
"epoch": 0.6786296900489397,
"grad_norm": 1.9878089427947998,
"learning_rate": 7.318149677175675e-05,
"loss": 0.9463,
"step": 104
},
{
"epoch": 0.6786296900489397,
"eval_loss": 1.007932424545288,
"eval_runtime": 57.0911,
"eval_samples_per_second": 8.565,
"eval_steps_per_second": 1.086,
"step": 104
},
{
"epoch": 0.6916802610114192,
"grad_norm": 1.9067178964614868,
"learning_rate": 6.781847486254697e-05,
"loss": 0.963,
"step": 106
},
{
"epoch": 0.7047308319738989,
"grad_norm": 1.9966986179351807,
"learning_rate": 6.260156870598071e-05,
"loss": 0.9835,
"step": 108
},
{
"epoch": 0.7047308319738989,
"eval_loss": 1.004884958267212,
"eval_runtime": 55.902,
"eval_samples_per_second": 8.747,
"eval_steps_per_second": 1.109,
"step": 108
},
{
"epoch": 0.7177814029363785,
"grad_norm": 1.834557294845581,
"learning_rate": 5.7540053770823644e-05,
"loss": 0.9684,
"step": 110
},
{
"epoch": 0.7308319738988581,
"grad_norm": 2.008937120437622,
"learning_rate": 5.264292924592073e-05,
"loss": 0.9584,
"step": 112
},
{
"epoch": 0.7308319738988581,
"eval_loss": 1.001002550125122,
"eval_runtime": 55.5718,
"eval_samples_per_second": 8.799,
"eval_steps_per_second": 1.116,
"step": 112
},
{
"epoch": 0.7438825448613376,
"grad_norm": 1.9365612268447876,
"learning_rate": 4.791890203996634e-05,
"loss": 0.9816,
"step": 114
},
{
"epoch": 0.7569331158238173,
"grad_norm": 1.697938084602356,
"learning_rate": 4.3376371300938786e-05,
"loss": 0.9185,
"step": 116
},
{
"epoch": 0.7569331158238173,
"eval_loss": 0.9972716569900513,
"eval_runtime": 56.4675,
"eval_samples_per_second": 8.66,
"eval_steps_per_second": 1.098,
"step": 116
},
{
"epoch": 0.7699836867862969,
"grad_norm": 1.8656069040298462,
"learning_rate": 3.9023413482721426e-05,
"loss": 0.9744,
"step": 118
},
{
"epoch": 0.7830342577487766,
"grad_norm": 1.8504821062088013,
"learning_rate": 3.4867767985462507e-05,
"loss": 0.9021,
"step": 120
},
{
"epoch": 0.7830342577487766,
"eval_loss": 0.994992733001709,
"eval_runtime": 56.1712,
"eval_samples_per_second": 8.706,
"eval_steps_per_second": 1.104,
"step": 120
},
{
"epoch": 0.7960848287112561,
"grad_norm": 1.8380029201507568,
"learning_rate": 3.09168233952042e-05,
"loss": 1.0255,
"step": 122
},
{
"epoch": 0.8091353996737357,
"grad_norm": 1.8176851272583008,
"learning_rate": 2.717760434724613e-05,
"loss": 0.9684,
"step": 124
},
{
"epoch": 0.8091353996737357,
"eval_loss": 0.9929932951927185,
"eval_runtime": 56.7685,
"eval_samples_per_second": 8.614,
"eval_steps_per_second": 1.092,
"step": 124
},
{
"epoch": 0.8221859706362153,
"grad_norm": 1.7307217121124268,
"learning_rate": 2.3656759036600187e-05,
"loss": 0.9727,
"step": 126
},
{
"epoch": 0.835236541598695,
"grad_norm": 1.9544731378555298,
"learning_rate": 2.0360547397742523e-05,
"loss": 0.9461,
"step": 128
},
{
"epoch": 0.835236541598695,
"eval_loss": 0.9912956953048706,
"eval_runtime": 56.6869,
"eval_samples_per_second": 8.626,
"eval_steps_per_second": 1.094,
"step": 128
},
{
"epoch": 0.8482871125611745,
"grad_norm": 1.797264575958252,
"learning_rate": 1.7294829974678338e-05,
"loss": 0.9235,
"step": 130
},
{
"epoch": 0.8613376835236541,
"grad_norm": 1.994328498840332,
"learning_rate": 1.4465057501108546e-05,
"loss": 1.0232,
"step": 132
},
{
"epoch": 0.8613376835236541,
"eval_loss": 0.9894663691520691,
"eval_runtime": 56.1708,
"eval_samples_per_second": 8.706,
"eval_steps_per_second": 1.104,
"step": 132
},
{
"epoch": 0.8743882544861338,
"grad_norm": 1.8575083017349243,
"learning_rate": 1.1876261209224314e-05,
"loss": 0.9372,
"step": 134
},
{
"epoch": 0.8874388254486134,
"grad_norm": 1.6837760210037231,
"learning_rate": 9.533043884359615e-06,
"loss": 0.9646,
"step": 136
},
{
"epoch": 0.8874388254486134,
"eval_loss": 0.9884896874427795,
"eval_runtime": 24.7353,
"eval_samples_per_second": 19.769,
"eval_steps_per_second": 2.507,
"step": 136
},
{
"epoch": 0.9004893964110929,
"grad_norm": 1.7492249011993408,
"learning_rate": 7.439571681407053e-06,
"loss": 1.0043,
"step": 138
},
{
"epoch": 0.9135399673735726,
"grad_norm": 1.8709555864334106,
"learning_rate": 5.59956671754635e-06,
"loss": 0.9912,
"step": 140
},
{
"epoch": 0.9135399673735726,
"eval_loss": 0.9873647093772888,
"eval_runtime": 24.7216,
"eval_samples_per_second": 19.78,
"eval_steps_per_second": 2.508,
"step": 140
},
{
"epoch": 0.9265905383360522,
"grad_norm": 1.91972017288208,
"learning_rate": 4.016300454455945e-06,
"loss": 0.9987,
"step": 142
},
{
"epoch": 0.9396411092985318,
"grad_norm": 2.0142247676849365,
"learning_rate": 2.692587881773478e-06,
"loss": 0.9464,
"step": 144
},
{
"epoch": 0.9396411092985318,
"eval_loss": 0.9870482683181763,
"eval_runtime": 24.723,
"eval_samples_per_second": 19.779,
"eval_steps_per_second": 2.508,
"step": 144
},
{
"epoch": 0.9526916802610114,
"grad_norm": 1.8588862419128418,
"learning_rate": 1.6307825121469164e-06,
"loss": 0.9875,
"step": 146
},
{
"epoch": 0.965742251223491,
"grad_norm": 1.8777186870574951,
"learning_rate": 8.327721967749779e-07,
"loss": 1.0104,
"step": 148
},
{
"epoch": 0.965742251223491,
"eval_loss": 0.9868296980857849,
"eval_runtime": 24.6717,
"eval_samples_per_second": 19.82,
"eval_steps_per_second": 2.513,
"step": 148
},
{
"epoch": 0.9787928221859706,
"grad_norm": 1.91819167137146,
"learning_rate": 2.9997576887660913e-07,
"loss": 0.9288,
"step": 150
},
{
"epoch": 0.9918433931484503,
"grad_norm": 1.8686131238937378,
"learning_rate": 3.334052105728458e-08,
"loss": 0.9624,
"step": 152
},
{
"epoch": 0.9918433931484503,
"eval_loss": 0.9868960976600647,
"eval_runtime": 24.5863,
"eval_samples_per_second": 19.889,
"eval_steps_per_second": 2.522,
"step": 152
}
],
"logging_steps": 2,
"max_steps": 153,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 4,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.85963932651946e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}