{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.024681201151789386,
"eval_steps": 500,
"global_step": 120,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00020567667626491157,
"grad_norm": 12.799294471740723,
"learning_rate": 5e-06,
"loss": 1.5984,
"step": 1
},
{
"epoch": 0.00041135335252982314,
"grad_norm": 16.628677368164062,
"learning_rate": 1e-05,
"loss": 2.3592,
"step": 2
},
{
"epoch": 0.0006170300287947347,
"grad_norm": 9.23403549194336,
"learning_rate": 1.5e-05,
"loss": 1.9703,
"step": 3
},
{
"epoch": 0.0008227067050596463,
"grad_norm": 8.804163932800293,
"learning_rate": 2e-05,
"loss": 1.7174,
"step": 4
},
{
"epoch": 0.0010283833813245578,
"grad_norm": 19.202816009521484,
"learning_rate": 2.5e-05,
"loss": 2.5644,
"step": 5
},
{
"epoch": 0.0012340600575894694,
"grad_norm": null,
"learning_rate": 2.5e-05,
"loss": 1.9342,
"step": 6
},
{
"epoch": 0.001439736733854381,
"grad_norm": 7.568014621734619,
"learning_rate": 3e-05,
"loss": 1.8679,
"step": 7
},
{
"epoch": 0.0016454134101192926,
"grad_norm": 14.93575382232666,
"learning_rate": 3.5e-05,
"loss": 1.6565,
"step": 8
},
{
"epoch": 0.001851090086384204,
"grad_norm": 11.807939529418945,
"learning_rate": 4e-05,
"loss": 2.067,
"step": 9
},
{
"epoch": 0.0020567667626491155,
"grad_norm": 9.383672714233398,
"learning_rate": 4.5e-05,
"loss": 2.2929,
"step": 10
},
{
"epoch": 0.0022624434389140274,
"grad_norm": 12.13580322265625,
"learning_rate": 5e-05,
"loss": 1.902,
"step": 11
},
{
"epoch": 0.0024681201151789387,
"grad_norm": 26.6066837310791,
"learning_rate": 4.9545454545454553e-05,
"loss": 2.2335,
"step": 12
},
{
"epoch": 0.00267379679144385,
"grad_norm": 22.572980880737305,
"learning_rate": 4.909090909090909e-05,
"loss": 1.3673,
"step": 13
},
{
"epoch": 0.002879473467708762,
"grad_norm": 14.39148235321045,
"learning_rate": 4.863636363636364e-05,
"loss": 1.8704,
"step": 14
},
{
"epoch": 0.0030851501439736733,
"grad_norm": 10.3450288772583,
"learning_rate": 4.8181818181818186e-05,
"loss": 1.0219,
"step": 15
},
{
"epoch": 0.003290826820238585,
"grad_norm": 11.662192344665527,
"learning_rate": 4.772727272727273e-05,
"loss": 1.9658,
"step": 16
},
{
"epoch": 0.0034965034965034965,
"grad_norm": 9.669842720031738,
"learning_rate": 4.7272727272727275e-05,
"loss": 1.7769,
"step": 17
},
{
"epoch": 0.003702180172768408,
"grad_norm": null,
"learning_rate": 4.7272727272727275e-05,
"loss": 1.4503,
"step": 18
},
{
"epoch": 0.00390785684903332,
"grad_norm": 24.309511184692383,
"learning_rate": 4.681818181818182e-05,
"loss": 1.1466,
"step": 19
},
{
"epoch": 0.004113533525298231,
"grad_norm": 10.933331489562988,
"learning_rate": 4.636363636363636e-05,
"loss": 1.9125,
"step": 20
},
{
"epoch": 0.0043192102015631425,
"grad_norm": 16.095943450927734,
"learning_rate": 4.5909090909090914e-05,
"loss": 2.1035,
"step": 21
},
{
"epoch": 0.004524886877828055,
"grad_norm": 11.987975120544434,
"learning_rate": 4.545454545454546e-05,
"loss": 0.6702,
"step": 22
},
{
"epoch": 0.004730563554092966,
"grad_norm": 8.843574523925781,
"learning_rate": 4.5e-05,
"loss": 0.5454,
"step": 23
},
{
"epoch": 0.0049362402303578775,
"grad_norm": 8.445805549621582,
"learning_rate": 4.454545454545455e-05,
"loss": 1.7987,
"step": 24
},
{
"epoch": 0.005141916906622789,
"grad_norm": 5.443497657775879,
"learning_rate": 4.409090909090909e-05,
"loss": 0.2675,
"step": 25
},
{
"epoch": 0.0053475935828877,
"grad_norm": 5.370733737945557,
"learning_rate": 4.3636363636363636e-05,
"loss": 0.1773,
"step": 26
},
{
"epoch": 0.0055532702591526125,
"grad_norm": 12.393692016601562,
"learning_rate": 4.318181818181819e-05,
"loss": 2.0755,
"step": 27
},
{
"epoch": 0.005758946935417524,
"grad_norm": 11.817388534545898,
"learning_rate": 4.2727272727272724e-05,
"loss": 1.796,
"step": 28
},
{
"epoch": 0.005964623611682435,
"grad_norm": 7.721453666687012,
"learning_rate": 4.2272727272727275e-05,
"loss": 1.461,
"step": 29
},
{
"epoch": 0.006170300287947347,
"grad_norm": 8.523295402526855,
"learning_rate": 4.181818181818182e-05,
"loss": 1.6376,
"step": 30
},
{
"epoch": 0.006375976964212258,
"grad_norm": 14.518152236938477,
"learning_rate": 4.1363636363636364e-05,
"loss": 2.036,
"step": 31
},
{
"epoch": 0.00658165364047717,
"grad_norm": 12.425220489501953,
"learning_rate": 4.0909090909090915e-05,
"loss": 0.4348,
"step": 32
},
{
"epoch": 0.006787330316742082,
"grad_norm": 26.095151901245117,
"learning_rate": 4.045454545454546e-05,
"loss": 1.2404,
"step": 33
},
{
"epoch": 0.006993006993006993,
"grad_norm": 4.784183979034424,
"learning_rate": 4e-05,
"loss": 1.309,
"step": 34
},
{
"epoch": 0.007198683669271904,
"grad_norm": 13.188830375671387,
"learning_rate": 3.954545454545455e-05,
"loss": 1.5275,
"step": 35
},
{
"epoch": 0.007404360345536816,
"grad_norm": 9.305349349975586,
"learning_rate": 3.909090909090909e-05,
"loss": 1.8645,
"step": 36
},
{
"epoch": 0.007610037021801728,
"grad_norm": 10.391180038452148,
"learning_rate": 3.8636363636363636e-05,
"loss": 0.2851,
"step": 37
},
{
"epoch": 0.00781571369806664,
"grad_norm": 2.905449390411377,
"learning_rate": 3.818181818181819e-05,
"loss": 0.0632,
"step": 38
},
{
"epoch": 0.008021390374331552,
"grad_norm": 15.784213066101074,
"learning_rate": 3.7727272727272725e-05,
"loss": 0.7361,
"step": 39
},
{
"epoch": 0.008227067050596462,
"grad_norm": 4.363598346710205,
"learning_rate": 3.7272727272727276e-05,
"loss": 0.0486,
"step": 40
},
{
"epoch": 0.008432743726861374,
"grad_norm": 17.239139556884766,
"learning_rate": 3.681818181818182e-05,
"loss": 1.3015,
"step": 41
},
{
"epoch": 0.008638420403126285,
"grad_norm": 1.9617282152175903,
"learning_rate": 3.6363636363636364e-05,
"loss": 0.0413,
"step": 42
},
{
"epoch": 0.008844097079391197,
"grad_norm": 5.427540302276611,
"learning_rate": 3.590909090909091e-05,
"loss": 1.733,
"step": 43
},
{
"epoch": 0.00904977375565611,
"grad_norm": 41.895721435546875,
"learning_rate": 3.545454545454546e-05,
"loss": 1.7908,
"step": 44
},
{
"epoch": 0.00925545043192102,
"grad_norm": 6.50022554397583,
"learning_rate": 3.5e-05,
"loss": 2.393,
"step": 45
},
{
"epoch": 0.009461127108185932,
"grad_norm": 13.534425735473633,
"learning_rate": 3.454545454545455e-05,
"loss": 1.0052,
"step": 46
},
{
"epoch": 0.009666803784450843,
"grad_norm": 1.7201191186904907,
"learning_rate": 3.409090909090909e-05,
"loss": 0.0428,
"step": 47
},
{
"epoch": 0.009872480460715755,
"grad_norm": 18.76580810546875,
"learning_rate": 3.3636363636363636e-05,
"loss": 0.8643,
"step": 48
},
{
"epoch": 0.010078157136980667,
"grad_norm": 18.51691246032715,
"learning_rate": 3.318181818181819e-05,
"loss": 1.0954,
"step": 49
},
{
"epoch": 0.010283833813245578,
"grad_norm": 8.370597839355469,
"learning_rate": 3.272727272727273e-05,
"loss": 0.6081,
"step": 50
},
{
"epoch": 0.01048951048951049,
"grad_norm": 6.726954936981201,
"learning_rate": 3.2272727272727276e-05,
"loss": 0.0419,
"step": 51
},
{
"epoch": 0.0106951871657754,
"grad_norm": 6.499707221984863,
"learning_rate": 3.181818181818182e-05,
"loss": 1.7456,
"step": 52
},
{
"epoch": 0.010900863842040313,
"grad_norm": 17.723892211914062,
"learning_rate": 3.1363636363636365e-05,
"loss": 1.2133,
"step": 53
},
{
"epoch": 0.011106540518305225,
"grad_norm": 7.051292419433594,
"learning_rate": 3.090909090909091e-05,
"loss": 0.2732,
"step": 54
},
{
"epoch": 0.011312217194570135,
"grad_norm": 12.115779876708984,
"learning_rate": 3.0454545454545456e-05,
"loss": 2.4646,
"step": 55
},
{
"epoch": 0.011517893870835048,
"grad_norm": 5.81415319442749,
"learning_rate": 3e-05,
"loss": 1.6826,
"step": 56
},
{
"epoch": 0.011723570547099958,
"grad_norm": 13.005535125732422,
"learning_rate": 2.954545454545455e-05,
"loss": 0.8454,
"step": 57
},
{
"epoch": 0.01192924722336487,
"grad_norm": 13.358834266662598,
"learning_rate": 2.909090909090909e-05,
"loss": 0.6585,
"step": 58
},
{
"epoch": 0.012134923899629783,
"grad_norm": 7.232337474822998,
"learning_rate": 2.863636363636364e-05,
"loss": 2.0886,
"step": 59
},
{
"epoch": 0.012340600575894693,
"grad_norm": 5.909549713134766,
"learning_rate": 2.818181818181818e-05,
"loss": 2.04,
"step": 60
},
{
"epoch": 0.012546277252159605,
"grad_norm": 5.2378621101379395,
"learning_rate": 2.772727272727273e-05,
"loss": 1.3154,
"step": 61
},
{
"epoch": 0.012751953928424516,
"grad_norm": 7.897792816162109,
"learning_rate": 2.7272727272727273e-05,
"loss": 1.7168,
"step": 62
},
{
"epoch": 0.012957630604689428,
"grad_norm": 10.026203155517578,
"learning_rate": 2.681818181818182e-05,
"loss": 2.1631,
"step": 63
},
{
"epoch": 0.01316330728095434,
"grad_norm": 7.848910808563232,
"learning_rate": 2.636363636363636e-05,
"loss": 0.895,
"step": 64
},
{
"epoch": 0.013368983957219251,
"grad_norm": 8.935349464416504,
"learning_rate": 2.590909090909091e-05,
"loss": 1.3377,
"step": 65
},
{
"epoch": 0.013574660633484163,
"grad_norm": 12.838030815124512,
"learning_rate": 2.5454545454545454e-05,
"loss": 1.0923,
"step": 66
},
{
"epoch": 0.013780337309749074,
"grad_norm": 11.543920516967773,
"learning_rate": 2.5e-05,
"loss": 1.8728,
"step": 67
},
{
"epoch": 0.013986013986013986,
"grad_norm": 5.111774444580078,
"learning_rate": 2.4545454545454545e-05,
"loss": 1.4745,
"step": 68
},
{
"epoch": 0.014191690662278898,
"grad_norm": 9.102482795715332,
"learning_rate": 2.4090909090909093e-05,
"loss": 0.5785,
"step": 69
},
{
"epoch": 0.014397367338543809,
"grad_norm": 10.797809600830078,
"learning_rate": 2.3636363636363637e-05,
"loss": 1.5844,
"step": 70
},
{
"epoch": 0.014603044014808721,
"grad_norm": 6.701333999633789,
"learning_rate": 2.318181818181818e-05,
"loss": 1.6503,
"step": 71
},
{
"epoch": 0.014808720691073632,
"grad_norm": 8.514144897460938,
"learning_rate": 2.272727272727273e-05,
"loss": 2.0404,
"step": 72
},
{
"epoch": 0.015014397367338544,
"grad_norm": 4.390872001647949,
"learning_rate": 2.2272727272727274e-05,
"loss": 1.3714,
"step": 73
},
{
"epoch": 0.015220074043603456,
"grad_norm": 11.0691556930542,
"learning_rate": 2.1818181818181818e-05,
"loss": 1.9498,
"step": 74
},
{
"epoch": 0.015425750719868367,
"grad_norm": 4.954442024230957,
"learning_rate": 2.1363636363636362e-05,
"loss": 1.5526,
"step": 75
},
{
"epoch": 0.01563142739613328,
"grad_norm": 3.523308038711548,
"learning_rate": 2.090909090909091e-05,
"loss": 0.0933,
"step": 76
},
{
"epoch": 0.01583710407239819,
"grad_norm": 7.044577121734619,
"learning_rate": 2.0454545454545457e-05,
"loss": 1.8968,
"step": 77
},
{
"epoch": 0.016042780748663103,
"grad_norm": 12.184310913085938,
"learning_rate": 2e-05,
"loss": 1.9779,
"step": 78
},
{
"epoch": 0.016248457424928014,
"grad_norm": 7.611854076385498,
"learning_rate": 1.9545454545454546e-05,
"loss": 1.1828,
"step": 79
},
{
"epoch": 0.016454134101192924,
"grad_norm": 2.3979077339172363,
"learning_rate": 1.9090909090909094e-05,
"loss": 0.0637,
"step": 80
},
{
"epoch": 0.016659810777457835,
"grad_norm": 7.704205513000488,
"learning_rate": 1.8636363636363638e-05,
"loss": 0.8134,
"step": 81
},
{
"epoch": 0.01686548745372275,
"grad_norm": 5.452297210693359,
"learning_rate": 1.8181818181818182e-05,
"loss": 1.4938,
"step": 82
},
{
"epoch": 0.01707116412998766,
"grad_norm": 6.996687889099121,
"learning_rate": 1.772727272727273e-05,
"loss": 0.8019,
"step": 83
},
{
"epoch": 0.01727684080625257,
"grad_norm": 7.0274271965026855,
"learning_rate": 1.7272727272727274e-05,
"loss": 0.7567,
"step": 84
},
{
"epoch": 0.017482517482517484,
"grad_norm": 14.325960159301758,
"learning_rate": 1.6818181818181818e-05,
"loss": 1.861,
"step": 85
},
{
"epoch": 0.017688194158782394,
"grad_norm": 8.082893371582031,
"learning_rate": 1.6363636363636366e-05,
"loss": 1.7144,
"step": 86
},
{
"epoch": 0.017893870835047305,
"grad_norm": 18.079805374145508,
"learning_rate": 1.590909090909091e-05,
"loss": 1.3443,
"step": 87
},
{
"epoch": 0.01809954751131222,
"grad_norm": 7.730350971221924,
"learning_rate": 1.5454545454545454e-05,
"loss": 0.6243,
"step": 88
},
{
"epoch": 0.01830522418757713,
"grad_norm": 11.749229431152344,
"learning_rate": 1.5e-05,
"loss": 0.9159,
"step": 89
},
{
"epoch": 0.01851090086384204,
"grad_norm": 1.573517918586731,
"learning_rate": 1.4545454545454545e-05,
"loss": 0.0361,
"step": 90
},
{
"epoch": 0.01871657754010695,
"grad_norm": 12.70760440826416,
"learning_rate": 1.409090909090909e-05,
"loss": 0.6055,
"step": 91
},
{
"epoch": 0.018922254216371864,
"grad_norm": 8.807103157043457,
"learning_rate": 1.3636363636363637e-05,
"loss": 1.4647,
"step": 92
},
{
"epoch": 0.019127930892636775,
"grad_norm": 4.610854625701904,
"learning_rate": 1.318181818181818e-05,
"loss": 0.1759,
"step": 93
},
{
"epoch": 0.019333607568901685,
"grad_norm": 4.804567813873291,
"learning_rate": 1.2727272727272727e-05,
"loss": 1.5768,
"step": 94
},
{
"epoch": 0.0195392842451666,
"grad_norm": 8.914559364318848,
"learning_rate": 1.2272727272727273e-05,
"loss": 0.9042,
"step": 95
},
{
"epoch": 0.01974496092143151,
"grad_norm": 10.30044174194336,
"learning_rate": 1.1818181818181819e-05,
"loss": 1.4115,
"step": 96
},
{
"epoch": 0.01995063759769642,
"grad_norm": 1.7506386041641235,
"learning_rate": 1.1363636363636365e-05,
"loss": 0.0564,
"step": 97
},
{
"epoch": 0.020156314273961334,
"grad_norm": 15.82219123840332,
"learning_rate": 1.0909090909090909e-05,
"loss": 1.3599,
"step": 98
},
{
"epoch": 0.020361990950226245,
"grad_norm": 13.379084587097168,
"learning_rate": 1.0454545454545455e-05,
"loss": 0.6752,
"step": 99
},
{
"epoch": 0.020567667626491155,
"grad_norm": 13.464095115661621,
"learning_rate": 1e-05,
"loss": 0.71,
"step": 100
},
{
"epoch": 0.020773344302756066,
"grad_norm": 11.258218765258789,
"learning_rate": 9.545454545454547e-06,
"loss": 1.5134,
"step": 101
},
{
"epoch": 0.02097902097902098,
"grad_norm": 6.195601463317871,
"learning_rate": 9.090909090909091e-06,
"loss": 1.9024,
"step": 102
},
{
"epoch": 0.02118469765528589,
"grad_norm": 14.71764087677002,
"learning_rate": 8.636363636363637e-06,
"loss": 1.8721,
"step": 103
},
{
"epoch": 0.0213903743315508,
"grad_norm": 18.410600662231445,
"learning_rate": 8.181818181818183e-06,
"loss": 0.9641,
"step": 104
},
{
"epoch": 0.021596051007815715,
"grad_norm": 14.327963829040527,
"learning_rate": 7.727272727272727e-06,
"loss": 1.3635,
"step": 105
},
{
"epoch": 0.021801727684080625,
"grad_norm": 9.677972793579102,
"learning_rate": 7.272727272727272e-06,
"loss": 1.1723,
"step": 106
},
{
"epoch": 0.022007404360345536,
"grad_norm": 8.000631332397461,
"learning_rate": 6.818181818181818e-06,
"loss": 0.6521,
"step": 107
},
{
"epoch": 0.02221308103661045,
"grad_norm": 1.116436243057251,
"learning_rate": 6.363636363636363e-06,
"loss": 0.0357,
"step": 108
},
{
"epoch": 0.02241875771287536,
"grad_norm": 1.7491281032562256,
"learning_rate": 5.909090909090909e-06,
"loss": 0.0469,
"step": 109
},
{
"epoch": 0.02262443438914027,
"grad_norm": 1.2441469430923462,
"learning_rate": 5.4545454545454545e-06,
"loss": 0.0386,
"step": 110
},
{
"epoch": 0.02283011106540518,
"grad_norm": 18.10247039794922,
"learning_rate": 5e-06,
"loss": 1.1133,
"step": 111
},
{
"epoch": 0.023035787741670095,
"grad_norm": 8.674224853515625,
"learning_rate": 4.5454545454545455e-06,
"loss": 1.4932,
"step": 112
},
{
"epoch": 0.023241464417935006,
"grad_norm": 10.565869331359863,
"learning_rate": 4.0909090909090915e-06,
"loss": 0.2315,
"step": 113
},
{
"epoch": 0.023447141094199916,
"grad_norm": 7.652951240539551,
"learning_rate": 3.636363636363636e-06,
"loss": 1.8242,
"step": 114
},
{
"epoch": 0.02365281777046483,
"grad_norm": 5.98936653137207,
"learning_rate": 3.1818181818181817e-06,
"loss": 1.9984,
"step": 115
},
{
"epoch": 0.02385849444672974,
"grad_norm": 9.790857315063477,
"learning_rate": 2.7272727272727272e-06,
"loss": 1.9085,
"step": 116
},
{
"epoch": 0.02406417112299465,
"grad_norm": 1.22812020778656,
"learning_rate": 2.2727272727272728e-06,
"loss": 0.0346,
"step": 117
},
{
"epoch": 0.024269847799259565,
"grad_norm": 14.80505657196045,
"learning_rate": 1.818181818181818e-06,
"loss": 1.1611,
"step": 118
},
{
"epoch": 0.024475524475524476,
"grad_norm": 15.46601390838623,
"learning_rate": 1.3636363636363636e-06,
"loss": 2.3691,
"step": 119
},
{
"epoch": 0.024681201151789386,
"grad_norm": 17.55849266052246,
"learning_rate": 9.09090909090909e-07,
"loss": 0.8722,
"step": 120
}
],
"logging_steps": 1,
"max_steps": 120,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3875365448736768.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}