{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.987452948557088,
"eval_steps": 500,
"global_step": 1990,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.050188205771643665,
"grad_norm": 0.3712446093559265,
"learning_rate": 0.00019998753895176575,
"loss": 1.308,
"step": 10
},
{
"epoch": 0.10037641154328733,
"grad_norm": 0.2267504632472992,
"learning_rate": 0.0001999501589126174,
"loss": 1.0216,
"step": 20
},
{
"epoch": 0.15056461731493098,
"grad_norm": 0.21779587864875793,
"learning_rate": 0.00019988786919844436,
"loss": 0.9698,
"step": 30
},
{
"epoch": 0.20075282308657466,
"grad_norm": 0.19701333343982697,
"learning_rate": 0.00019980068533314934,
"loss": 0.9313,
"step": 40
},
{
"epoch": 0.25094102885821834,
"grad_norm": 0.20786035060882568,
"learning_rate": 0.00019968862904477935,
"loss": 0.9067,
"step": 50
},
{
"epoch": 0.30112923462986196,
"grad_norm": 0.20178885757923126,
"learning_rate": 0.00019955172826011062,
"loss": 0.8945,
"step": 60
},
{
"epoch": 0.35131744040150564,
"grad_norm": 0.21096089482307434,
"learning_rate": 0.0001993900170976888,
"loss": 0.8929,
"step": 70
},
{
"epoch": 0.4015056461731493,
"grad_norm": 0.21473677456378937,
"learning_rate": 0.00019920353585932578,
"loss": 0.8688,
"step": 80
},
{
"epoch": 0.451693851944793,
"grad_norm": 0.23557408154010773,
"learning_rate": 0.00019899233102005573,
"loss": 0.8585,
"step": 90
},
{
"epoch": 0.5018820577164367,
"grad_norm": 0.21959255635738373,
"learning_rate": 0.0001987564552165524,
"loss": 0.8615,
"step": 100
},
{
"epoch": 0.5520702634880803,
"grad_norm": 0.23273342847824097,
"learning_rate": 0.00019849596723401107,
"loss": 0.8523,
"step": 110
},
{
"epoch": 0.6022584692597239,
"grad_norm": 0.22011199593544006,
"learning_rate": 0.00019821093199149804,
"loss": 0.8588,
"step": 120
},
{
"epoch": 0.6524466750313677,
"grad_norm": 0.2598012685775757,
"learning_rate": 0.0001979014205257715,
"loss": 0.8389,
"step": 130
},
{
"epoch": 0.7026348808030113,
"grad_norm": 0.251001238822937,
"learning_rate": 0.0001975675099735774,
"loss": 0.8297,
"step": 140
},
{
"epoch": 0.7528230865746549,
"grad_norm": 0.2203661948442459,
"learning_rate": 0.00019720928355242568,
"loss": 0.8222,
"step": 150
},
{
"epoch": 0.8030112923462986,
"grad_norm": 0.2208539992570877,
"learning_rate": 0.00019682683053985072,
"loss": 0.8365,
"step": 160
},
{
"epoch": 0.8531994981179423,
"grad_norm": 0.23628230392932892,
"learning_rate": 0.00019642024625116117,
"loss": 0.8242,
"step": 170
},
{
"epoch": 0.903387703889586,
"grad_norm": 0.2504488229751587,
"learning_rate": 0.00019598963201568573,
"loss": 0.8245,
"step": 180
},
{
"epoch": 0.9535759096612296,
"grad_norm": 0.2108476608991623,
"learning_rate": 0.0001955350951515195,
"loss": 0.8144,
"step": 190
},
{
"epoch": 1.0037641154328734,
"grad_norm": 0.3357304632663727,
"learning_rate": 0.0001950567489387783,
"loss": 0.8139,
"step": 200
},
{
"epoch": 1.053952321204517,
"grad_norm": 0.2318328619003296,
"learning_rate": 0.0001945547125913667,
"loss": 0.8025,
"step": 210
},
{
"epoch": 1.1041405269761606,
"grad_norm": 0.21543821692466736,
"learning_rate": 0.00019402911122726757,
"loss": 0.7935,
"step": 220
},
{
"epoch": 1.1543287327478042,
"grad_norm": 0.22069813311100006,
"learning_rate": 0.00019348007583735983,
"loss": 0.7883,
"step": 230
},
{
"epoch": 1.2045169385194479,
"grad_norm": 0.23400938510894775,
"learning_rate": 0.00019290774325277305,
"loss": 0.7837,
"step": 240
},
{
"epoch": 1.2547051442910915,
"grad_norm": 0.21560260653495789,
"learning_rate": 0.0001923122561107861,
"loss": 0.7851,
"step": 250
},
{
"epoch": 1.3048933500627353,
"grad_norm": 0.220945805311203,
"learning_rate": 0.00019169376281927888,
"loss": 0.7804,
"step": 260
},
{
"epoch": 1.355081555834379,
"grad_norm": 0.2537701427936554,
"learning_rate": 0.00019105241751974622,
"loss": 0.7782,
"step": 270
},
{
"epoch": 1.4052697616060226,
"grad_norm": 0.22228342294692993,
"learning_rate": 0.0001903883800488824,
"loss": 0.7767,
"step": 280
},
{
"epoch": 1.4554579673776662,
"grad_norm": 0.23013311624526978,
"learning_rate": 0.00018970181589874637,
"loss": 0.7886,
"step": 290
},
{
"epoch": 1.50564617314931,
"grad_norm": 0.24365030229091644,
"learning_rate": 0.00018899289617551804,
"loss": 0.7848,
"step": 300
},
{
"epoch": 1.5558343789209537,
"grad_norm": 0.2531464993953705,
"learning_rate": 0.0001882617975568547,
"loss": 0.7769,
"step": 310
},
{
"epoch": 1.6060225846925973,
"grad_norm": 0.22617797553539276,
"learning_rate": 0.00018750870224785939,
"loss": 0.7745,
"step": 320
},
{
"epoch": 1.656210790464241,
"grad_norm": 0.2243880182504654,
"learning_rate": 0.00018673379793567146,
"loss": 0.7687,
"step": 330
},
{
"epoch": 1.7063989962358845,
"grad_norm": 0.21209578216075897,
"learning_rate": 0.0001859372777426912,
"loss": 0.7628,
"step": 340
},
{
"epoch": 1.7565872020075282,
"grad_norm": 0.21007835865020752,
"learning_rate": 0.00018511934017844948,
"loss": 0.7595,
"step": 350
},
{
"epoch": 1.8067754077791718,
"grad_norm": 0.22626900672912598,
"learning_rate": 0.00018428018909013506,
"loss": 0.7605,
"step": 360
},
{
"epoch": 1.8569636135508154,
"grad_norm": 0.22667524218559265,
"learning_rate": 0.00018342003361179176,
"loss": 0.7726,
"step": 370
},
{
"epoch": 1.9071518193224593,
"grad_norm": 0.2177441567182541,
"learning_rate": 0.00018253908811219764,
"loss": 0.7595,
"step": 380
},
{
"epoch": 1.9573400250941029,
"grad_norm": 0.22076500952243805,
"learning_rate": 0.00018163757214143992,
"loss": 0.7554,
"step": 390
},
{
"epoch": 2.0075282308657467,
"grad_norm": 0.21619777381420135,
"learning_rate": 0.00018071571037619853,
"loss": 0.7353,
"step": 400
},
{
"epoch": 2.0577164366373903,
"grad_norm": 0.22108638286590576,
"learning_rate": 0.00017977373256375194,
"loss": 0.7281,
"step": 410
},
{
"epoch": 2.107904642409034,
"grad_norm": 0.2403818517923355,
"learning_rate": 0.00017881187346471925,
"loss": 0.736,
"step": 420
},
{
"epoch": 2.1580928481806776,
"grad_norm": 0.2393644005060196,
"learning_rate": 0.00017783037279455298,
"loss": 0.724,
"step": 430
},
{
"epoch": 2.208281053952321,
"grad_norm": 0.23402653634548187,
"learning_rate": 0.00017682947516379707,
"loss": 0.7309,
"step": 440
},
{
"epoch": 2.258469259723965,
"grad_norm": 0.24220925569534302,
"learning_rate": 0.00017580943001712455,
"loss": 0.7201,
"step": 450
},
{
"epoch": 2.3086574654956085,
"grad_norm": 0.2503248155117035,
"learning_rate": 0.00017477049157117093,
"loss": 0.7226,
"step": 460
},
{
"epoch": 2.358845671267252,
"grad_norm": 0.23094403743743896,
"learning_rate": 0.0001737129187511779,
"loss": 0.7206,
"step": 470
},
{
"epoch": 2.4090338770388957,
"grad_norm": 0.23082856833934784,
"learning_rate": 0.00017263697512646394,
"loss": 0.7133,
"step": 480
},
{
"epoch": 2.4592220828105393,
"grad_norm": 0.2403910756111145,
"learning_rate": 0.00017154292884473713,
"loss": 0.7307,
"step": 490
},
{
"epoch": 2.509410288582183,
"grad_norm": 0.2515548765659332,
"learning_rate": 0.00017043105256526724,
"loss": 0.7264,
"step": 500
},
{
"epoch": 2.5595984943538266,
"grad_norm": 0.23908871412277222,
"learning_rate": 0.00016930162339093318,
"loss": 0.7258,
"step": 510
},
{
"epoch": 2.6097867001254706,
"grad_norm": 0.2339881807565689,
"learning_rate": 0.0001681549227991634,
"loss": 0.7189,
"step": 520
},
{
"epoch": 2.6599749058971143,
"grad_norm": 0.23234973847866058,
"learning_rate": 0.00016699123657178553,
"loss": 0.7144,
"step": 530
},
{
"epoch": 2.710163111668758,
"grad_norm": 0.252946674823761,
"learning_rate": 0.00016581085472380376,
"loss": 0.7199,
"step": 540
},
{
"epoch": 2.7603513174404015,
"grad_norm": 0.23885370790958405,
"learning_rate": 0.00016461407143112097,
"loss": 0.7107,
"step": 550
},
{
"epoch": 2.810539523212045,
"grad_norm": 0.24114787578582764,
"learning_rate": 0.00016340118495722388,
"loss": 0.7129,
"step": 560
},
{
"epoch": 2.8607277289836888,
"grad_norm": 0.24572765827178955,
"learning_rate": 0.00016217249757884955,
"loss": 0.7158,
"step": 570
},
{
"epoch": 2.9109159347553324,
"grad_norm": 0.24029052257537842,
"learning_rate": 0.0001609283155106517,
"loss": 0.7084,
"step": 580
},
{
"epoch": 2.961104140526976,
"grad_norm": 0.24108637869358063,
"learning_rate": 0.00015966894882888562,
"loss": 0.7125,
"step": 590
},
{
"epoch": 3.0112923462986196,
"grad_norm": 0.2422133982181549,
"learning_rate": 0.00015839471139413066,
"loss": 0.6978,
"step": 600
},
{
"epoch": 3.0614805520702637,
"grad_norm": 0.2545720338821411,
"learning_rate": 0.0001571059207730695,
"loss": 0.6779,
"step": 610
},
{
"epoch": 3.1116687578419073,
"grad_norm": 0.2578783333301544,
"learning_rate": 0.00015580289815934401,
"loss": 0.673,
"step": 620
},
{
"epoch": 3.161856963613551,
"grad_norm": 0.2702922224998474,
"learning_rate": 0.00015448596829350706,
"loss": 0.686,
"step": 630
},
{
"epoch": 3.2120451693851946,
"grad_norm": 0.26222941279411316,
"learning_rate": 0.00015315545938209015,
"loss": 0.6853,
"step": 640
},
{
"epoch": 3.262233375156838,
"grad_norm": 0.25867560505867004,
"learning_rate": 0.00015181170301580777,
"loss": 0.677,
"step": 650
},
{
"epoch": 3.312421580928482,
"grad_norm": 0.2750966548919678,
"learning_rate": 0.00015045503408691775,
"loss": 0.6758,
"step": 660
},
{
"epoch": 3.3626097867001254,
"grad_norm": 0.2567848861217499,
"learning_rate": 0.00014908579070575936,
"loss": 0.6708,
"step": 670
},
{
"epoch": 3.412797992471769,
"grad_norm": 0.26190003752708435,
"learning_rate": 0.00014770431411648897,
"loss": 0.677,
"step": 680
},
{
"epoch": 3.4629861982434127,
"grad_norm": 0.26486852765083313,
"learning_rate": 0.0001463109486120348,
"loss": 0.6785,
"step": 690
},
{
"epoch": 3.5131744040150563,
"grad_norm": 0.26697248220443726,
"learning_rate": 0.00014490604144829202,
"loss": 0.6791,
"step": 700
},
{
"epoch": 3.5633626097867,
"grad_norm": 0.26674166321754456,
"learning_rate": 0.00014348994275757931,
"loss": 0.6775,
"step": 710
},
{
"epoch": 3.6135508155583436,
"grad_norm": 0.2583613395690918,
"learning_rate": 0.00014206300546137842,
"loss": 0.6722,
"step": 720
},
{
"epoch": 3.663739021329987,
"grad_norm": 0.2743168771266937,
"learning_rate": 0.00014062558518237892,
"loss": 0.6777,
"step": 730
},
{
"epoch": 3.7139272271016313,
"grad_norm": 0.2537378668785095,
"learning_rate": 0.00013917804015584932,
"loss": 0.6775,
"step": 740
},
{
"epoch": 3.764115432873275,
"grad_norm": 0.27333304286003113,
"learning_rate": 0.00013772073114035762,
"loss": 0.6797,
"step": 750
},
{
"epoch": 3.8143036386449185,
"grad_norm": 0.26115766167640686,
"learning_rate": 0.00013625402132786248,
"loss": 0.6687,
"step": 760
},
{
"epoch": 3.864491844416562,
"grad_norm": 0.2621854543685913,
"learning_rate": 0.00013477827625319824,
"loss": 0.6634,
"step": 770
},
{
"epoch": 3.9146800501882058,
"grad_norm": 0.25681644678115845,
"learning_rate": 0.00013329386370297615,
"loss": 0.6676,
"step": 780
},
{
"epoch": 3.9648682559598494,
"grad_norm": 0.2630254626274109,
"learning_rate": 0.00013180115362392382,
"loss": 0.6819,
"step": 790
},
{
"epoch": 4.015056461731493,
"grad_norm": 0.2596668004989624,
"learning_rate": 0.00013030051803068727,
"loss": 0.6562,
"step": 800
},
{
"epoch": 4.065244667503137,
"grad_norm": 0.2807883620262146,
"learning_rate": 0.00012879233091311667,
"loss": 0.6343,
"step": 810
},
{
"epoch": 4.115432873274781,
"grad_norm": 0.3002206087112427,
"learning_rate": 0.00012727696814306033,
"loss": 0.6426,
"step": 820
},
{
"epoch": 4.165621079046424,
"grad_norm": 0.278054803609848,
"learning_rate": 0.0001257548073806897,
"loss": 0.6434,
"step": 830
},
{
"epoch": 4.215809284818068,
"grad_norm": 0.28684940934181213,
"learning_rate": 0.00012422622798037832,
"loss": 0.64,
"step": 840
},
{
"epoch": 4.265997490589712,
"grad_norm": 0.2862164378166199,
"learning_rate": 0.000122691610896159,
"loss": 0.6413,
"step": 850
},
{
"epoch": 4.316185696361355,
"grad_norm": 0.2956394553184509,
"learning_rate": 0.00012115133858678191,
"loss": 0.6344,
"step": 860
},
{
"epoch": 4.366373902132999,
"grad_norm": 0.28408849239349365,
"learning_rate": 0.00011960579492039783,
"loss": 0.6368,
"step": 870
},
{
"epoch": 4.416562107904642,
"grad_norm": 0.2809561789035797,
"learning_rate": 0.00011805536507889021,
"loss": 0.6336,
"step": 880
},
{
"epoch": 4.466750313676286,
"grad_norm": 0.27648741006851196,
"learning_rate": 0.00011650043546187995,
"loss": 0.6357,
"step": 890
},
{
"epoch": 4.51693851944793,
"grad_norm": 0.28754714131355286,
"learning_rate": 0.0001149413935904261,
"loss": 0.6341,
"step": 900
},
{
"epoch": 4.567126725219573,
"grad_norm": 0.2936854958534241,
"learning_rate": 0.00011337862801044792,
"loss": 0.6292,
"step": 910
},
{
"epoch": 4.617314930991217,
"grad_norm": 0.27858176827430725,
"learning_rate": 0.00011181252819589081,
"loss": 0.6351,
"step": 920
},
{
"epoch": 4.6675031367628605,
"grad_norm": 0.2897019684314728,
"learning_rate": 0.00011024348445166133,
"loss": 0.6369,
"step": 930
},
{
"epoch": 4.717691342534504,
"grad_norm": 0.28682589530944824,
"learning_rate": 0.00010867188781635512,
"loss": 0.6375,
"step": 940
},
{
"epoch": 4.767879548306148,
"grad_norm": 0.28193414211273193,
"learning_rate": 0.0001070981299648016,
"loss": 0.6337,
"step": 950
},
{
"epoch": 4.818067754077791,
"grad_norm": 0.28822091221809387,
"learning_rate": 0.00010552260311045082,
"loss": 0.6378,
"step": 960
},
{
"epoch": 4.868255959849435,
"grad_norm": 0.28457361459732056,
"learning_rate": 0.00010394569990762529,
"loss": 0.6368,
"step": 970
},
{
"epoch": 4.918444165621079,
"grad_norm": 0.2925203740596771,
"learning_rate": 0.00010236781335366239,
"loss": 0.6287,
"step": 980
},
{
"epoch": 4.968632371392722,
"grad_norm": 0.2838154435157776,
"learning_rate": 0.00010078933669097135,
"loss": 0.6305,
"step": 990
},
{
"epoch": 5.018820577164367,
"grad_norm": 0.29948368668556213,
"learning_rate": 9.92106633090287e-05,
"loss": 0.6216,
"step": 1000
},
{
"epoch": 5.06900878293601,
"grad_norm": 0.30535656213760376,
"learning_rate": 9.763218664633763e-05,
"loss": 0.5997,
"step": 1010
},
{
"epoch": 5.119196988707654,
"grad_norm": 0.2984197735786438,
"learning_rate": 9.605430009237474e-05,
"loss": 0.604,
"step": 1020
},
{
"epoch": 5.169385194479298,
"grad_norm": 0.31448280811309814,
"learning_rate": 9.447739688954919e-05,
"loss": 0.599,
"step": 1030
},
{
"epoch": 5.219573400250941,
"grad_norm": 0.3126201927661896,
"learning_rate": 9.29018700351984e-05,
"loss": 0.6064,
"step": 1040
},
{
"epoch": 5.269761606022585,
"grad_norm": 0.3049900233745575,
"learning_rate": 9.132811218364495e-05,
"loss": 0.6023,
"step": 1050
},
{
"epoch": 5.3199498117942285,
"grad_norm": 0.3015764653682709,
"learning_rate": 8.975651554833869e-05,
"loss": 0.6023,
"step": 1060
},
{
"epoch": 5.370138017565872,
"grad_norm": 0.31510215997695923,
"learning_rate": 8.818747180410921e-05,
"loss": 0.6072,
"step": 1070
},
{
"epoch": 5.420326223337516,
"grad_norm": 0.31331363320350647,
"learning_rate": 8.66213719895521e-05,
"loss": 0.603,
"step": 1080
},
{
"epoch": 5.470514429109159,
"grad_norm": 0.311443030834198,
"learning_rate": 8.505860640957391e-05,
"loss": 0.6034,
"step": 1090
},
{
"epoch": 5.520702634880803,
"grad_norm": 0.3126680254936218,
"learning_rate": 8.349956453812009e-05,
"loss": 0.5954,
"step": 1100
},
{
"epoch": 5.570890840652447,
"grad_norm": 0.32074013352394104,
"learning_rate": 8.194463492110981e-05,
"loss": 0.5997,
"step": 1110
},
{
"epoch": 5.62107904642409,
"grad_norm": 0.31394365429878235,
"learning_rate": 8.03942050796022e-05,
"loss": 0.6075,
"step": 1120
},
{
"epoch": 5.671267252195734,
"grad_norm": 0.3085944950580597,
"learning_rate": 7.88486614132181e-05,
"loss": 0.5993,
"step": 1130
},
{
"epoch": 5.7214554579673775,
"grad_norm": 0.3151126503944397,
"learning_rate": 7.730838910384097e-05,
"loss": 0.6067,
"step": 1140
},
{
"epoch": 5.771643663739021,
"grad_norm": 0.31070196628570557,
"learning_rate": 7.57737720196217e-05,
"loss": 0.6039,
"step": 1150
},
{
"epoch": 5.821831869510665,
"grad_norm": 0.31582969427108765,
"learning_rate": 7.424519261931036e-05,
"loss": 0.6012,
"step": 1160
},
{
"epoch": 5.872020075282308,
"grad_norm": 0.31882044672966003,
"learning_rate": 7.27230318569397e-05,
"loss": 0.6035,
"step": 1170
},
{
"epoch": 5.922208281053952,
"grad_norm": 0.31374436616897583,
"learning_rate": 7.120766908688336e-05,
"loss": 0.6084,
"step": 1180
},
{
"epoch": 5.972396486825596,
"grad_norm": 0.3210514485836029,
"learning_rate": 6.969948196931272e-05,
"loss": 0.6034,
"step": 1190
},
{
"epoch": 6.022584692597239,
"grad_norm": 0.3218853175640106,
"learning_rate": 6.819884637607619e-05,
"loss": 0.5889,
"step": 1200
},
{
"epoch": 6.072772898368883,
"grad_norm": 0.32491976022720337,
"learning_rate": 6.670613629702391e-05,
"loss": 0.576,
"step": 1210
},
{
"epoch": 6.122961104140527,
"grad_norm": 0.3358321487903595,
"learning_rate": 6.522172374680177e-05,
"loss": 0.5708,
"step": 1220
},
{
"epoch": 6.173149309912171,
"grad_norm": 0.31775182485580444,
"learning_rate": 6.374597867213756e-05,
"loss": 0.5743,
"step": 1230
},
{
"epoch": 6.223337515683815,
"grad_norm": 0.3289986550807953,
"learning_rate": 6.22792688596424e-05,
"loss": 0.5853,
"step": 1240
},
{
"epoch": 6.273525721455458,
"grad_norm": 0.33586037158966064,
"learning_rate": 6.0821959844150687e-05,
"loss": 0.5799,
"step": 1250
},
{
"epoch": 6.323713927227102,
"grad_norm": 0.33577895164489746,
"learning_rate": 5.9374414817621114e-05,
"loss": 0.5675,
"step": 1260
},
{
"epoch": 6.3739021329987455,
"grad_norm": 0.33007678389549255,
"learning_rate": 5.7936994538621605e-05,
"loss": 0.5764,
"step": 1270
},
{
"epoch": 6.424090338770389,
"grad_norm": 0.3328823149204254,
"learning_rate": 5.651005724242071e-05,
"loss": 0.5747,
"step": 1280
},
{
"epoch": 6.474278544542033,
"grad_norm": 0.33794859051704407,
"learning_rate": 5.509395855170798e-05,
"loss": 0.5762,
"step": 1290
},
{
"epoch": 6.524466750313676,
"grad_norm": 0.33616700768470764,
"learning_rate": 5.368905138796523e-05,
"loss": 0.5754,
"step": 1300
},
{
"epoch": 6.57465495608532,
"grad_norm": 0.3314683437347412,
"learning_rate": 5.229568588351108e-05,
"loss": 0.5827,
"step": 1310
},
{
"epoch": 6.624843161856964,
"grad_norm": 0.32283809781074524,
"learning_rate": 5.0914209294240644e-05,
"loss": 0.5762,
"step": 1320
},
{
"epoch": 6.675031367628607,
"grad_norm": 0.33403000235557556,
"learning_rate": 4.9544965913082264e-05,
"loss": 0.5759,
"step": 1330
},
{
"epoch": 6.725219573400251,
"grad_norm": 0.32813191413879395,
"learning_rate": 4.818829698419225e-05,
"loss": 0.5808,
"step": 1340
},
{
"epoch": 6.7754077791718945,
"grad_norm": 0.3342324495315552,
"learning_rate": 4.684454061790987e-05,
"loss": 0.5722,
"step": 1350
},
{
"epoch": 6.825595984943538,
"grad_norm": 0.3277010917663574,
"learning_rate": 4.5514031706492986e-05,
"loss": 0.5729,
"step": 1360
},
{
"epoch": 6.875784190715182,
"grad_norm": 0.32855984568595886,
"learning_rate": 4.4197101840655995e-05,
"loss": 0.5776,
"step": 1370
},
{
"epoch": 6.925972396486825,
"grad_norm": 0.3375394344329834,
"learning_rate": 4.289407922693053e-05,
"loss": 0.5702,
"step": 1380
},
{
"epoch": 6.976160602258469,
"grad_norm": 0.33724990487098694,
"learning_rate": 4.1605288605869365e-05,
"loss": 0.5703,
"step": 1390
},
{
"epoch": 7.026348808030113,
"grad_norm": 0.33817237615585327,
"learning_rate": 4.033105117111441e-05,
"loss": 0.563,
"step": 1400
},
{
"epoch": 7.076537013801756,
"grad_norm": 0.3434535264968872,
"learning_rate": 3.907168448934836e-05,
"loss": 0.5571,
"step": 1410
},
{
"epoch": 7.1267252195734,
"grad_norm": 0.34801870584487915,
"learning_rate": 3.7827502421150496e-05,
"loss": 0.562,
"step": 1420
},
{
"epoch": 7.1769134253450435,
"grad_norm": 0.35552722215652466,
"learning_rate": 3.659881504277613e-05,
"loss": 0.5527,
"step": 1430
},
{
"epoch": 7.227101631116687,
"grad_norm": 0.3546360731124878,
"learning_rate": 3.538592856887901e-05,
"loss": 0.5594,
"step": 1440
},
{
"epoch": 7.277289836888332,
"grad_norm": 0.34311702847480774,
"learning_rate": 3.4189145276196245e-05,
"loss": 0.5573,
"step": 1450
},
{
"epoch": 7.327478042659975,
"grad_norm": 0.3503047525882721,
"learning_rate": 3.3008763428214505e-05,
"loss": 0.5642,
"step": 1460
},
{
"epoch": 7.377666248431619,
"grad_norm": 0.3464205861091614,
"learning_rate": 3.1845077200836636e-05,
"loss": 0.5615,
"step": 1470
},
{
"epoch": 7.4278544542032625,
"grad_norm": 0.35482051968574524,
"learning_rate": 3.0698376609066825e-05,
"loss": 0.5527,
"step": 1480
},
{
"epoch": 7.478042659974906,
"grad_norm": 0.3588634729385376,
"learning_rate": 2.9568947434732775e-05,
"loss": 0.556,
"step": 1490
},
{
"epoch": 7.52823086574655,
"grad_norm": 0.3532968759536743,
"learning_rate": 2.8457071155262884e-05,
"loss": 0.5586,
"step": 1500
},
{
"epoch": 7.578419071518193,
"grad_norm": 0.3441388010978699,
"learning_rate": 2.736302487353609e-05,
"loss": 0.5461,
"step": 1510
},
{
"epoch": 7.628607277289837,
"grad_norm": 0.36395809054374695,
"learning_rate": 2.628708124882212e-05,
"loss": 0.5544,
"step": 1520
},
{
"epoch": 7.678795483061481,
"grad_norm": 0.3574591279029846,
"learning_rate": 2.5229508428829096e-05,
"loss": 0.5584,
"step": 1530
},
{
"epoch": 7.728983688833124,
"grad_norm": 0.35188260674476624,
"learning_rate": 2.4190569982875467e-05,
"loss": 0.5566,
"step": 1540
},
{
"epoch": 7.779171894604768,
"grad_norm": 0.34741711616516113,
"learning_rate": 2.3170524836202933e-05,
"loss": 0.5525,
"step": 1550
},
{
"epoch": 7.8293601003764115,
"grad_norm": 0.35913023352622986,
"learning_rate": 2.216962720544703e-05,
"loss": 0.5491,
"step": 1560
},
{
"epoch": 7.879548306148055,
"grad_norm": 0.3487934470176697,
"learning_rate": 2.1188126535280773e-05,
"loss": 0.558,
"step": 1570
},
{
"epoch": 7.929736511919699,
"grad_norm": 0.3519488573074341,
"learning_rate": 2.022626743624807e-05,
"loss": 0.5575,
"step": 1580
},
{
"epoch": 7.979924717691342,
"grad_norm": 0.35680004954338074,
"learning_rate": 1.9284289623801477e-05,
"loss": 0.5559,
"step": 1590
},
{
"epoch": 8.030112923462987,
"grad_norm": 0.3475489914417267,
"learning_rate": 1.8362427858560093e-05,
"loss": 0.5461,
"step": 1600
},
{
"epoch": 8.08030112923463,
"grad_norm": 0.3541754484176636,
"learning_rate": 1.74609118878024e-05,
"loss": 0.5395,
"step": 1610
},
{
"epoch": 8.130489335006274,
"grad_norm": 0.3458302319049835,
"learning_rate": 1.657996638820826e-05,
"loss": 0.5428,
"step": 1620
},
{
"epoch": 8.180677540777918,
"grad_norm": 0.35417988896369934,
"learning_rate": 1.5719810909864942e-05,
"loss": 0.5395,
"step": 1630
},
{
"epoch": 8.230865746549561,
"grad_norm": 0.35355257987976074,
"learning_rate": 1.4880659821550546e-05,
"loss": 0.5527,
"step": 1640
},
{
"epoch": 8.281053952321205,
"grad_norm": 0.35250890254974365,
"learning_rate": 1.4062722257308803e-05,
"loss": 0.5501,
"step": 1650
},
{
"epoch": 8.331242158092849,
"grad_norm": 0.34818190336227417,
"learning_rate": 1.3266202064328548e-05,
"loss": 0.5432,
"step": 1660
},
{
"epoch": 8.381430363864492,
"grad_norm": 0.36963459849357605,
"learning_rate": 1.2491297752140641e-05,
"loss": 0.5448,
"step": 1670
},
{
"epoch": 8.431618569636136,
"grad_norm": 0.35220593214035034,
"learning_rate": 1.1738202443145308e-05,
"loss": 0.5434,
"step": 1680
},
{
"epoch": 8.48180677540778,
"grad_norm": 0.3520500063896179,
"learning_rate": 1.1007103824481979e-05,
"loss": 0.5458,
"step": 1690
},
{
"epoch": 8.531994981179423,
"grad_norm": 0.36262160539627075,
"learning_rate": 1.029818410125365e-05,
"loss": 0.5428,
"step": 1700
},
{
"epoch": 8.582183186951067,
"grad_norm": 0.3580245077610016,
"learning_rate": 9.611619951117657e-06,
"loss": 0.5427,
"step": 1710
},
{
"epoch": 8.63237139272271,
"grad_norm": 0.35791924595832825,
"learning_rate": 8.94758248025378e-06,
"loss": 0.5523,
"step": 1720
},
{
"epoch": 8.682559598494354,
"grad_norm": 0.35621368885040283,
"learning_rate": 8.306237180721121e-06,
"loss": 0.5403,
"step": 1730
},
{
"epoch": 8.732747804265998,
"grad_norm": 0.3615633547306061,
"learning_rate": 7.687743889213938e-06,
"loss": 0.5455,
"step": 1740
},
{
"epoch": 8.782936010037641,
"grad_norm": 0.35723286867141724,
"learning_rate": 7.0922567472269444e-06,
"loss": 0.5449,
"step": 1750
},
{
"epoch": 8.833124215809285,
"grad_norm": 0.35941046476364136,
"learning_rate": 6.519924162640167e-06,
"loss": 0.5396,
"step": 1760
},
{
"epoch": 8.883312421580928,
"grad_norm": 0.36941203474998474,
"learning_rate": 5.9708887727324525e-06,
"loss": 0.5466,
"step": 1770
},
{
"epoch": 8.933500627352572,
"grad_norm": 0.3527214527130127,
"learning_rate": 5.445287408633304e-06,
"loss": 0.5469,
"step": 1780
},
{
"epoch": 8.983688833124216,
"grad_norm": 0.3579261004924774,
"learning_rate": 4.943251061221721e-06,
"loss": 0.5369,
"step": 1790
},
{
"epoch": 9.03387703889586,
"grad_norm": 0.3588533103466034,
"learning_rate": 4.464904848480523e-06,
"loss": 0.5482,
"step": 1800
},
{
"epoch": 9.084065244667503,
"grad_norm": 0.3596334457397461,
"learning_rate": 4.0103679843142895e-06,
"loss": 0.5402,
"step": 1810
},
{
"epoch": 9.134253450439147,
"grad_norm": 0.35277649760246277,
"learning_rate": 3.5797537488388323e-06,
"loss": 0.5431,
"step": 1820
},
{
"epoch": 9.18444165621079,
"grad_norm": 0.35417917370796204,
"learning_rate": 3.1731694601492833e-06,
"loss": 0.5352,
"step": 1830
},
{
"epoch": 9.234629861982434,
"grad_norm": 0.36016353964805603,
"learning_rate": 2.7907164475743043e-06,
"loss": 0.5395,
"step": 1840
},
{
"epoch": 9.284818067754077,
"grad_norm": 0.36541038751602173,
"learning_rate": 2.4324900264226403e-06,
"loss": 0.5348,
"step": 1850
},
{
"epoch": 9.335006273525721,
"grad_norm": 0.36023426055908203,
"learning_rate": 2.098579474228546e-06,
"loss": 0.5324,
"step": 1860
},
{
"epoch": 9.385194479297365,
"grad_norm": 0.3567328155040741,
"learning_rate": 1.7890680085019595e-06,
"loss": 0.5341,
"step": 1870
},
{
"epoch": 9.435382685069008,
"grad_norm": 0.3682873547077179,
"learning_rate": 1.5040327659889608e-06,
"loss": 0.5382,
"step": 1880
},
{
"epoch": 9.485570890840652,
"grad_norm": 0.36713671684265137,
"learning_rate": 1.2435447834476255e-06,
"loss": 0.537,
"step": 1890
},
{
"epoch": 9.535759096612296,
"grad_norm": 0.36034858226776123,
"learning_rate": 1.0076689799442873e-06,
"loss": 0.5435,
"step": 1900
},
{
"epoch": 9.58594730238394,
"grad_norm": 0.3527128994464874,
"learning_rate": 7.964641406742135e-07,
"loss": 0.5464,
"step": 1910
},
{
"epoch": 9.636135508155583,
"grad_norm": 0.3527335226535797,
"learning_rate": 6.099829023112235e-07,
"loss": 0.5396,
"step": 1920
},
{
"epoch": 9.686323713927226,
"grad_norm": 0.36540141701698303,
"learning_rate": 4.482717398894165e-07,
"loss": 0.5424,
"step": 1930
},
{
"epoch": 9.73651191969887,
"grad_norm": 0.35210534930229187,
"learning_rate": 3.1137095522068007e-07,
"loss": 0.5456,
"step": 1940
},
{
"epoch": 9.786700125470514,
"grad_norm": 0.3526809811592102,
"learning_rate": 1.9931466685065847e-07,
"loss": 0.5394,
"step": 1950
},
{
"epoch": 9.836888331242157,
"grad_norm": 0.36059680581092834,
"learning_rate": 1.1213080155564326e-07,
"loss": 0.5359,
"step": 1960
},
{
"epoch": 9.887076537013801,
"grad_norm": 0.36295098066329956,
"learning_rate": 4.9841087382618276e-08,
"loss": 0.5404,
"step": 1970
},
{
"epoch": 9.937264742785445,
"grad_norm": 0.35113370418548584,
"learning_rate": 1.2461048234269079e-08,
"loss": 0.5373,
"step": 1980
},
{
"epoch": 9.987452948557088,
"grad_norm": 0.3604467511177063,
"learning_rate": 0.0,
"loss": 0.5361,
"step": 1990
},
{
"epoch": 9.987452948557088,
"step": 1990,
"total_flos": 1.0444655785672704e+18,
"train_loss": 0.65177170523447,
"train_runtime": 44465.981,
"train_samples_per_second": 1.432,
"train_steps_per_second": 0.045
}
],
"logging_steps": 10,
"max_steps": 1990,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"total_flos": 1.0444655785672704e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}