|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.987452948557088, |
|
"eval_steps": 500, |
|
"global_step": 1990, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.050188205771643665, |
|
"grad_norm": 0.3712446093559265, |
|
"learning_rate": 0.00019998753895176575, |
|
"loss": 1.308, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.10037641154328733, |
|
"grad_norm": 0.2267504632472992, |
|
"learning_rate": 0.0001999501589126174, |
|
"loss": 1.0216, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.15056461731493098, |
|
"grad_norm": 0.21779587864875793, |
|
"learning_rate": 0.00019988786919844436, |
|
"loss": 0.9698, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.20075282308657466, |
|
"grad_norm": 0.19701333343982697, |
|
"learning_rate": 0.00019980068533314934, |
|
"loss": 0.9313, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.25094102885821834, |
|
"grad_norm": 0.20786035060882568, |
|
"learning_rate": 0.00019968862904477935, |
|
"loss": 0.9067, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.30112923462986196, |
|
"grad_norm": 0.20178885757923126, |
|
"learning_rate": 0.00019955172826011062, |
|
"loss": 0.8945, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.35131744040150564, |
|
"grad_norm": 0.21096089482307434, |
|
"learning_rate": 0.0001993900170976888, |
|
"loss": 0.8929, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.4015056461731493, |
|
"grad_norm": 0.21473677456378937, |
|
"learning_rate": 0.00019920353585932578, |
|
"loss": 0.8688, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.451693851944793, |
|
"grad_norm": 0.23557408154010773, |
|
"learning_rate": 0.00019899233102005573, |
|
"loss": 0.8585, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5018820577164367, |
|
"grad_norm": 0.21959255635738373, |
|
"learning_rate": 0.0001987564552165524, |
|
"loss": 0.8615, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.5520702634880803, |
|
"grad_norm": 0.23273342847824097, |
|
"learning_rate": 0.00019849596723401107, |
|
"loss": 0.8523, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.6022584692597239, |
|
"grad_norm": 0.22011199593544006, |
|
"learning_rate": 0.00019821093199149804, |
|
"loss": 0.8588, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.6524466750313677, |
|
"grad_norm": 0.2598012685775757, |
|
"learning_rate": 0.0001979014205257715, |
|
"loss": 0.8389, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.7026348808030113, |
|
"grad_norm": 0.251001238822937, |
|
"learning_rate": 0.0001975675099735774, |
|
"loss": 0.8297, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.7528230865746549, |
|
"grad_norm": 0.2203661948442459, |
|
"learning_rate": 0.00019720928355242568, |
|
"loss": 0.8222, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.8030112923462986, |
|
"grad_norm": 0.2208539992570877, |
|
"learning_rate": 0.00019682683053985072, |
|
"loss": 0.8365, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.8531994981179423, |
|
"grad_norm": 0.23628230392932892, |
|
"learning_rate": 0.00019642024625116117, |
|
"loss": 0.8242, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.903387703889586, |
|
"grad_norm": 0.2504488229751587, |
|
"learning_rate": 0.00019598963201568573, |
|
"loss": 0.8245, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.9535759096612296, |
|
"grad_norm": 0.2108476608991623, |
|
"learning_rate": 0.0001955350951515195, |
|
"loss": 0.8144, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.0037641154328734, |
|
"grad_norm": 0.3357304632663727, |
|
"learning_rate": 0.0001950567489387783, |
|
"loss": 0.8139, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.053952321204517, |
|
"grad_norm": 0.2318328619003296, |
|
"learning_rate": 0.0001945547125913667, |
|
"loss": 0.8025, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.1041405269761606, |
|
"grad_norm": 0.21543821692466736, |
|
"learning_rate": 0.00019402911122726757, |
|
"loss": 0.7935, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.1543287327478042, |
|
"grad_norm": 0.22069813311100006, |
|
"learning_rate": 0.00019348007583735983, |
|
"loss": 0.7883, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.2045169385194479, |
|
"grad_norm": 0.23400938510894775, |
|
"learning_rate": 0.00019290774325277305, |
|
"loss": 0.7837, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.2547051442910915, |
|
"grad_norm": 0.21560260653495789, |
|
"learning_rate": 0.0001923122561107861, |
|
"loss": 0.7851, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.3048933500627353, |
|
"grad_norm": 0.220945805311203, |
|
"learning_rate": 0.00019169376281927888, |
|
"loss": 0.7804, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.355081555834379, |
|
"grad_norm": 0.2537701427936554, |
|
"learning_rate": 0.00019105241751974622, |
|
"loss": 0.7782, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.4052697616060226, |
|
"grad_norm": 0.22228342294692993, |
|
"learning_rate": 0.0001903883800488824, |
|
"loss": 0.7767, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.4554579673776662, |
|
"grad_norm": 0.23013311624526978, |
|
"learning_rate": 0.00018970181589874637, |
|
"loss": 0.7886, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.50564617314931, |
|
"grad_norm": 0.24365030229091644, |
|
"learning_rate": 0.00018899289617551804, |
|
"loss": 0.7848, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.5558343789209537, |
|
"grad_norm": 0.2531464993953705, |
|
"learning_rate": 0.0001882617975568547, |
|
"loss": 0.7769, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.6060225846925973, |
|
"grad_norm": 0.22617797553539276, |
|
"learning_rate": 0.00018750870224785939, |
|
"loss": 0.7745, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.656210790464241, |
|
"grad_norm": 0.2243880182504654, |
|
"learning_rate": 0.00018673379793567146, |
|
"loss": 0.7687, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.7063989962358845, |
|
"grad_norm": 0.21209578216075897, |
|
"learning_rate": 0.0001859372777426912, |
|
"loss": 0.7628, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.7565872020075282, |
|
"grad_norm": 0.21007835865020752, |
|
"learning_rate": 0.00018511934017844948, |
|
"loss": 0.7595, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.8067754077791718, |
|
"grad_norm": 0.22626900672912598, |
|
"learning_rate": 0.00018428018909013506, |
|
"loss": 0.7605, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.8569636135508154, |
|
"grad_norm": 0.22667524218559265, |
|
"learning_rate": 0.00018342003361179176, |
|
"loss": 0.7726, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.9071518193224593, |
|
"grad_norm": 0.2177441567182541, |
|
"learning_rate": 0.00018253908811219764, |
|
"loss": 0.7595, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.9573400250941029, |
|
"grad_norm": 0.22076500952243805, |
|
"learning_rate": 0.00018163757214143992, |
|
"loss": 0.7554, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.0075282308657467, |
|
"grad_norm": 0.21619777381420135, |
|
"learning_rate": 0.00018071571037619853, |
|
"loss": 0.7353, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.0577164366373903, |
|
"grad_norm": 0.22108638286590576, |
|
"learning_rate": 0.00017977373256375194, |
|
"loss": 0.7281, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.107904642409034, |
|
"grad_norm": 0.2403818517923355, |
|
"learning_rate": 0.00017881187346471925, |
|
"loss": 0.736, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.1580928481806776, |
|
"grad_norm": 0.2393644005060196, |
|
"learning_rate": 0.00017783037279455298, |
|
"loss": 0.724, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.208281053952321, |
|
"grad_norm": 0.23402653634548187, |
|
"learning_rate": 0.00017682947516379707, |
|
"loss": 0.7309, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.258469259723965, |
|
"grad_norm": 0.24220925569534302, |
|
"learning_rate": 0.00017580943001712455, |
|
"loss": 0.7201, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.3086574654956085, |
|
"grad_norm": 0.2503248155117035, |
|
"learning_rate": 0.00017477049157117093, |
|
"loss": 0.7226, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.358845671267252, |
|
"grad_norm": 0.23094403743743896, |
|
"learning_rate": 0.0001737129187511779, |
|
"loss": 0.7206, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.4090338770388957, |
|
"grad_norm": 0.23082856833934784, |
|
"learning_rate": 0.00017263697512646394, |
|
"loss": 0.7133, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.4592220828105393, |
|
"grad_norm": 0.2403910756111145, |
|
"learning_rate": 0.00017154292884473713, |
|
"loss": 0.7307, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.509410288582183, |
|
"grad_norm": 0.2515548765659332, |
|
"learning_rate": 0.00017043105256526724, |
|
"loss": 0.7264, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.5595984943538266, |
|
"grad_norm": 0.23908871412277222, |
|
"learning_rate": 0.00016930162339093318, |
|
"loss": 0.7258, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.6097867001254706, |
|
"grad_norm": 0.2339881807565689, |
|
"learning_rate": 0.0001681549227991634, |
|
"loss": 0.7189, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.6599749058971143, |
|
"grad_norm": 0.23234973847866058, |
|
"learning_rate": 0.00016699123657178553, |
|
"loss": 0.7144, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.710163111668758, |
|
"grad_norm": 0.252946674823761, |
|
"learning_rate": 0.00016581085472380376, |
|
"loss": 0.7199, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.7603513174404015, |
|
"grad_norm": 0.23885370790958405, |
|
"learning_rate": 0.00016461407143112097, |
|
"loss": 0.7107, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.810539523212045, |
|
"grad_norm": 0.24114787578582764, |
|
"learning_rate": 0.00016340118495722388, |
|
"loss": 0.7129, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.8607277289836888, |
|
"grad_norm": 0.24572765827178955, |
|
"learning_rate": 0.00016217249757884955, |
|
"loss": 0.7158, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.9109159347553324, |
|
"grad_norm": 0.24029052257537842, |
|
"learning_rate": 0.0001609283155106517, |
|
"loss": 0.7084, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.961104140526976, |
|
"grad_norm": 0.24108637869358063, |
|
"learning_rate": 0.00015966894882888562, |
|
"loss": 0.7125, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 3.0112923462986196, |
|
"grad_norm": 0.2422133982181549, |
|
"learning_rate": 0.00015839471139413066, |
|
"loss": 0.6978, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.0614805520702637, |
|
"grad_norm": 0.2545720338821411, |
|
"learning_rate": 0.0001571059207730695, |
|
"loss": 0.6779, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 3.1116687578419073, |
|
"grad_norm": 0.2578783333301544, |
|
"learning_rate": 0.00015580289815934401, |
|
"loss": 0.673, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 3.161856963613551, |
|
"grad_norm": 0.2702922224998474, |
|
"learning_rate": 0.00015448596829350706, |
|
"loss": 0.686, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 3.2120451693851946, |
|
"grad_norm": 0.26222941279411316, |
|
"learning_rate": 0.00015315545938209015, |
|
"loss": 0.6853, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 3.262233375156838, |
|
"grad_norm": 0.25867560505867004, |
|
"learning_rate": 0.00015181170301580777, |
|
"loss": 0.677, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 3.312421580928482, |
|
"grad_norm": 0.2750966548919678, |
|
"learning_rate": 0.00015045503408691775, |
|
"loss": 0.6758, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 3.3626097867001254, |
|
"grad_norm": 0.2567848861217499, |
|
"learning_rate": 0.00014908579070575936, |
|
"loss": 0.6708, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 3.412797992471769, |
|
"grad_norm": 0.26190003752708435, |
|
"learning_rate": 0.00014770431411648897, |
|
"loss": 0.677, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 3.4629861982434127, |
|
"grad_norm": 0.26486852765083313, |
|
"learning_rate": 0.0001463109486120348, |
|
"loss": 0.6785, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 3.5131744040150563, |
|
"grad_norm": 0.26697248220443726, |
|
"learning_rate": 0.00014490604144829202, |
|
"loss": 0.6791, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.5633626097867, |
|
"grad_norm": 0.26674166321754456, |
|
"learning_rate": 0.00014348994275757931, |
|
"loss": 0.6775, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 3.6135508155583436, |
|
"grad_norm": 0.2583613395690918, |
|
"learning_rate": 0.00014206300546137842, |
|
"loss": 0.6722, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 3.663739021329987, |
|
"grad_norm": 0.2743168771266937, |
|
"learning_rate": 0.00014062558518237892, |
|
"loss": 0.6777, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 3.7139272271016313, |
|
"grad_norm": 0.2537378668785095, |
|
"learning_rate": 0.00013917804015584932, |
|
"loss": 0.6775, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 3.764115432873275, |
|
"grad_norm": 0.27333304286003113, |
|
"learning_rate": 0.00013772073114035762, |
|
"loss": 0.6797, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 3.8143036386449185, |
|
"grad_norm": 0.26115766167640686, |
|
"learning_rate": 0.00013625402132786248, |
|
"loss": 0.6687, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 3.864491844416562, |
|
"grad_norm": 0.2621854543685913, |
|
"learning_rate": 0.00013477827625319824, |
|
"loss": 0.6634, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 3.9146800501882058, |
|
"grad_norm": 0.25681644678115845, |
|
"learning_rate": 0.00013329386370297615, |
|
"loss": 0.6676, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 3.9648682559598494, |
|
"grad_norm": 0.2630254626274109, |
|
"learning_rate": 0.00013180115362392382, |
|
"loss": 0.6819, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 4.015056461731493, |
|
"grad_norm": 0.2596668004989624, |
|
"learning_rate": 0.00013030051803068727, |
|
"loss": 0.6562, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 4.065244667503137, |
|
"grad_norm": 0.2807883620262146, |
|
"learning_rate": 0.00012879233091311667, |
|
"loss": 0.6343, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 4.115432873274781, |
|
"grad_norm": 0.3002206087112427, |
|
"learning_rate": 0.00012727696814306033, |
|
"loss": 0.6426, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 4.165621079046424, |
|
"grad_norm": 0.278054803609848, |
|
"learning_rate": 0.0001257548073806897, |
|
"loss": 0.6434, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 4.215809284818068, |
|
"grad_norm": 0.28684940934181213, |
|
"learning_rate": 0.00012422622798037832, |
|
"loss": 0.64, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 4.265997490589712, |
|
"grad_norm": 0.2862164378166199, |
|
"learning_rate": 0.000122691610896159, |
|
"loss": 0.6413, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 4.316185696361355, |
|
"grad_norm": 0.2956394553184509, |
|
"learning_rate": 0.00012115133858678191, |
|
"loss": 0.6344, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 4.366373902132999, |
|
"grad_norm": 0.28408849239349365, |
|
"learning_rate": 0.00011960579492039783, |
|
"loss": 0.6368, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 4.416562107904642, |
|
"grad_norm": 0.2809561789035797, |
|
"learning_rate": 0.00011805536507889021, |
|
"loss": 0.6336, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 4.466750313676286, |
|
"grad_norm": 0.27648741006851196, |
|
"learning_rate": 0.00011650043546187995, |
|
"loss": 0.6357, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 4.51693851944793, |
|
"grad_norm": 0.28754714131355286, |
|
"learning_rate": 0.0001149413935904261, |
|
"loss": 0.6341, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 4.567126725219573, |
|
"grad_norm": 0.2936854958534241, |
|
"learning_rate": 0.00011337862801044792, |
|
"loss": 0.6292, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 4.617314930991217, |
|
"grad_norm": 0.27858176827430725, |
|
"learning_rate": 0.00011181252819589081, |
|
"loss": 0.6351, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 4.6675031367628605, |
|
"grad_norm": 0.2897019684314728, |
|
"learning_rate": 0.00011024348445166133, |
|
"loss": 0.6369, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 4.717691342534504, |
|
"grad_norm": 0.28682589530944824, |
|
"learning_rate": 0.00010867188781635512, |
|
"loss": 0.6375, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 4.767879548306148, |
|
"grad_norm": 0.28193414211273193, |
|
"learning_rate": 0.0001070981299648016, |
|
"loss": 0.6337, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 4.818067754077791, |
|
"grad_norm": 0.28822091221809387, |
|
"learning_rate": 0.00010552260311045082, |
|
"loss": 0.6378, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 4.868255959849435, |
|
"grad_norm": 0.28457361459732056, |
|
"learning_rate": 0.00010394569990762529, |
|
"loss": 0.6368, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 4.918444165621079, |
|
"grad_norm": 0.2925203740596771, |
|
"learning_rate": 0.00010236781335366239, |
|
"loss": 0.6287, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 4.968632371392722, |
|
"grad_norm": 0.2838154435157776, |
|
"learning_rate": 0.00010078933669097135, |
|
"loss": 0.6305, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 5.018820577164367, |
|
"grad_norm": 0.29948368668556213, |
|
"learning_rate": 9.92106633090287e-05, |
|
"loss": 0.6216, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 5.06900878293601, |
|
"grad_norm": 0.30535656213760376, |
|
"learning_rate": 9.763218664633763e-05, |
|
"loss": 0.5997, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 5.119196988707654, |
|
"grad_norm": 0.2984197735786438, |
|
"learning_rate": 9.605430009237474e-05, |
|
"loss": 0.604, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 5.169385194479298, |
|
"grad_norm": 0.31448280811309814, |
|
"learning_rate": 9.447739688954919e-05, |
|
"loss": 0.599, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 5.219573400250941, |
|
"grad_norm": 0.3126201927661896, |
|
"learning_rate": 9.29018700351984e-05, |
|
"loss": 0.6064, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 5.269761606022585, |
|
"grad_norm": 0.3049900233745575, |
|
"learning_rate": 9.132811218364495e-05, |
|
"loss": 0.6023, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 5.3199498117942285, |
|
"grad_norm": 0.3015764653682709, |
|
"learning_rate": 8.975651554833869e-05, |
|
"loss": 0.6023, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 5.370138017565872, |
|
"grad_norm": 0.31510215997695923, |
|
"learning_rate": 8.818747180410921e-05, |
|
"loss": 0.6072, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 5.420326223337516, |
|
"grad_norm": 0.31331363320350647, |
|
"learning_rate": 8.66213719895521e-05, |
|
"loss": 0.603, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 5.470514429109159, |
|
"grad_norm": 0.311443030834198, |
|
"learning_rate": 8.505860640957391e-05, |
|
"loss": 0.6034, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 5.520702634880803, |
|
"grad_norm": 0.3126680254936218, |
|
"learning_rate": 8.349956453812009e-05, |
|
"loss": 0.5954, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 5.570890840652447, |
|
"grad_norm": 0.32074013352394104, |
|
"learning_rate": 8.194463492110981e-05, |
|
"loss": 0.5997, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 5.62107904642409, |
|
"grad_norm": 0.31394365429878235, |
|
"learning_rate": 8.03942050796022e-05, |
|
"loss": 0.6075, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 5.671267252195734, |
|
"grad_norm": 0.3085944950580597, |
|
"learning_rate": 7.88486614132181e-05, |
|
"loss": 0.5993, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 5.7214554579673775, |
|
"grad_norm": 0.3151126503944397, |
|
"learning_rate": 7.730838910384097e-05, |
|
"loss": 0.6067, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 5.771643663739021, |
|
"grad_norm": 0.31070196628570557, |
|
"learning_rate": 7.57737720196217e-05, |
|
"loss": 0.6039, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 5.821831869510665, |
|
"grad_norm": 0.31582969427108765, |
|
"learning_rate": 7.424519261931036e-05, |
|
"loss": 0.6012, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 5.872020075282308, |
|
"grad_norm": 0.31882044672966003, |
|
"learning_rate": 7.27230318569397e-05, |
|
"loss": 0.6035, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 5.922208281053952, |
|
"grad_norm": 0.31374436616897583, |
|
"learning_rate": 7.120766908688336e-05, |
|
"loss": 0.6084, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 5.972396486825596, |
|
"grad_norm": 0.3210514485836029, |
|
"learning_rate": 6.969948196931272e-05, |
|
"loss": 0.6034, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 6.022584692597239, |
|
"grad_norm": 0.3218853175640106, |
|
"learning_rate": 6.819884637607619e-05, |
|
"loss": 0.5889, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 6.072772898368883, |
|
"grad_norm": 0.32491976022720337, |
|
"learning_rate": 6.670613629702391e-05, |
|
"loss": 0.576, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 6.122961104140527, |
|
"grad_norm": 0.3358321487903595, |
|
"learning_rate": 6.522172374680177e-05, |
|
"loss": 0.5708, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 6.173149309912171, |
|
"grad_norm": 0.31775182485580444, |
|
"learning_rate": 6.374597867213756e-05, |
|
"loss": 0.5743, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 6.223337515683815, |
|
"grad_norm": 0.3289986550807953, |
|
"learning_rate": 6.22792688596424e-05, |
|
"loss": 0.5853, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 6.273525721455458, |
|
"grad_norm": 0.33586037158966064, |
|
"learning_rate": 6.0821959844150687e-05, |
|
"loss": 0.5799, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 6.323713927227102, |
|
"grad_norm": 0.33577895164489746, |
|
"learning_rate": 5.9374414817621114e-05, |
|
"loss": 0.5675, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 6.3739021329987455, |
|
"grad_norm": 0.33007678389549255, |
|
"learning_rate": 5.7936994538621605e-05, |
|
"loss": 0.5764, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 6.424090338770389, |
|
"grad_norm": 0.3328823149204254, |
|
"learning_rate": 5.651005724242071e-05, |
|
"loss": 0.5747, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 6.474278544542033, |
|
"grad_norm": 0.33794859051704407, |
|
"learning_rate": 5.509395855170798e-05, |
|
"loss": 0.5762, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 6.524466750313676, |
|
"grad_norm": 0.33616700768470764, |
|
"learning_rate": 5.368905138796523e-05, |
|
"loss": 0.5754, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 6.57465495608532, |
|
"grad_norm": 0.3314683437347412, |
|
"learning_rate": 5.229568588351108e-05, |
|
"loss": 0.5827, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 6.624843161856964, |
|
"grad_norm": 0.32283809781074524, |
|
"learning_rate": 5.0914209294240644e-05, |
|
"loss": 0.5762, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 6.675031367628607, |
|
"grad_norm": 0.33403000235557556, |
|
"learning_rate": 4.9544965913082264e-05, |
|
"loss": 0.5759, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 6.725219573400251, |
|
"grad_norm": 0.32813191413879395, |
|
"learning_rate": 4.818829698419225e-05, |
|
"loss": 0.5808, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 6.7754077791718945, |
|
"grad_norm": 0.3342324495315552, |
|
"learning_rate": 4.684454061790987e-05, |
|
"loss": 0.5722, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 6.825595984943538, |
|
"grad_norm": 0.3277010917663574, |
|
"learning_rate": 4.5514031706492986e-05, |
|
"loss": 0.5729, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 6.875784190715182, |
|
"grad_norm": 0.32855984568595886, |
|
"learning_rate": 4.4197101840655995e-05, |
|
"loss": 0.5776, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 6.925972396486825, |
|
"grad_norm": 0.3375394344329834, |
|
"learning_rate": 4.289407922693053e-05, |
|
"loss": 0.5702, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 6.976160602258469, |
|
"grad_norm": 0.33724990487098694, |
|
"learning_rate": 4.1605288605869365e-05, |
|
"loss": 0.5703, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 7.026348808030113, |
|
"grad_norm": 0.33817237615585327, |
|
"learning_rate": 4.033105117111441e-05, |
|
"loss": 0.563, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 7.076537013801756, |
|
"grad_norm": 0.3434535264968872, |
|
"learning_rate": 3.907168448934836e-05, |
|
"loss": 0.5571, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 7.1267252195734, |
|
"grad_norm": 0.34801870584487915, |
|
"learning_rate": 3.7827502421150496e-05, |
|
"loss": 0.562, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 7.1769134253450435, |
|
"grad_norm": 0.35552722215652466, |
|
"learning_rate": 3.659881504277613e-05, |
|
"loss": 0.5527, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 7.227101631116687, |
|
"grad_norm": 0.3546360731124878, |
|
"learning_rate": 3.538592856887901e-05, |
|
"loss": 0.5594, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 7.277289836888332, |
|
"grad_norm": 0.34311702847480774, |
|
"learning_rate": 3.4189145276196245e-05, |
|
"loss": 0.5573, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 7.327478042659975, |
|
"grad_norm": 0.3503047525882721, |
|
"learning_rate": 3.3008763428214505e-05, |
|
"loss": 0.5642, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 7.377666248431619, |
|
"grad_norm": 0.3464205861091614, |
|
"learning_rate": 3.1845077200836636e-05, |
|
"loss": 0.5615, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 7.4278544542032625, |
|
"grad_norm": 0.35482051968574524, |
|
"learning_rate": 3.0698376609066825e-05, |
|
"loss": 0.5527, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 7.478042659974906, |
|
"grad_norm": 0.3588634729385376, |
|
"learning_rate": 2.9568947434732775e-05, |
|
"loss": 0.556, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 7.52823086574655, |
|
"grad_norm": 0.3532968759536743, |
|
"learning_rate": 2.8457071155262884e-05, |
|
"loss": 0.5586, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 7.578419071518193, |
|
"grad_norm": 0.3441388010978699, |
|
"learning_rate": 2.736302487353609e-05, |
|
"loss": 0.5461, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 7.628607277289837, |
|
"grad_norm": 0.36395809054374695, |
|
"learning_rate": 2.628708124882212e-05, |
|
"loss": 0.5544, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 7.678795483061481, |
|
"grad_norm": 0.3574591279029846, |
|
"learning_rate": 2.5229508428829096e-05, |
|
"loss": 0.5584, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 7.728983688833124, |
|
"grad_norm": 0.35188260674476624, |
|
"learning_rate": 2.4190569982875467e-05, |
|
"loss": 0.5566, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 7.779171894604768, |
|
"grad_norm": 0.34741711616516113, |
|
"learning_rate": 2.3170524836202933e-05, |
|
"loss": 0.5525, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 7.8293601003764115, |
|
"grad_norm": 0.35913023352622986, |
|
"learning_rate": 2.216962720544703e-05, |
|
"loss": 0.5491, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 7.879548306148055, |
|
"grad_norm": 0.3487934470176697, |
|
"learning_rate": 2.1188126535280773e-05, |
|
"loss": 0.558, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 7.929736511919699, |
|
"grad_norm": 0.3519488573074341, |
|
"learning_rate": 2.022626743624807e-05, |
|
"loss": 0.5575, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 7.979924717691342, |
|
"grad_norm": 0.35680004954338074, |
|
"learning_rate": 1.9284289623801477e-05, |
|
"loss": 0.5559, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 8.030112923462987, |
|
"grad_norm": 0.3475489914417267, |
|
"learning_rate": 1.8362427858560093e-05, |
|
"loss": 0.5461, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 8.08030112923463, |
|
"grad_norm": 0.3541754484176636, |
|
"learning_rate": 1.74609118878024e-05, |
|
"loss": 0.5395, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 8.130489335006274, |
|
"grad_norm": 0.3458302319049835, |
|
"learning_rate": 1.657996638820826e-05, |
|
"loss": 0.5428, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 8.180677540777918, |
|
"grad_norm": 0.35417988896369934, |
|
"learning_rate": 1.5719810909864942e-05, |
|
"loss": 0.5395, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 8.230865746549561, |
|
"grad_norm": 0.35355257987976074, |
|
"learning_rate": 1.4880659821550546e-05, |
|
"loss": 0.5527, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 8.281053952321205, |
|
"grad_norm": 0.35250890254974365, |
|
"learning_rate": 1.4062722257308803e-05, |
|
"loss": 0.5501, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 8.331242158092849, |
|
"grad_norm": 0.34818190336227417, |
|
"learning_rate": 1.3266202064328548e-05, |
|
"loss": 0.5432, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 8.381430363864492, |
|
"grad_norm": 0.36963459849357605, |
|
"learning_rate": 1.2491297752140641e-05, |
|
"loss": 0.5448, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 8.431618569636136, |
|
"grad_norm": 0.35220593214035034, |
|
"learning_rate": 1.1738202443145308e-05, |
|
"loss": 0.5434, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 8.48180677540778, |
|
"grad_norm": 0.3520500063896179, |
|
"learning_rate": 1.1007103824481979e-05, |
|
"loss": 0.5458, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 8.531994981179423, |
|
"grad_norm": 0.36262160539627075, |
|
"learning_rate": 1.029818410125365e-05, |
|
"loss": 0.5428, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 8.582183186951067, |
|
"grad_norm": 0.3580245077610016, |
|
"learning_rate": 9.611619951117657e-06, |
|
"loss": 0.5427, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 8.63237139272271, |
|
"grad_norm": 0.35791924595832825, |
|
"learning_rate": 8.94758248025378e-06, |
|
"loss": 0.5523, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 8.682559598494354, |
|
"grad_norm": 0.35621368885040283, |
|
"learning_rate": 8.306237180721121e-06, |
|
"loss": 0.5403, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 8.732747804265998, |
|
"grad_norm": 0.3615633547306061, |
|
"learning_rate": 7.687743889213938e-06, |
|
"loss": 0.5455, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 8.782936010037641, |
|
"grad_norm": 0.35723286867141724, |
|
"learning_rate": 7.0922567472269444e-06, |
|
"loss": 0.5449, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 8.833124215809285, |
|
"grad_norm": 0.35941046476364136, |
|
"learning_rate": 6.519924162640167e-06, |
|
"loss": 0.5396, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 8.883312421580928, |
|
"grad_norm": 0.36941203474998474, |
|
"learning_rate": 5.9708887727324525e-06, |
|
"loss": 0.5466, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 8.933500627352572, |
|
"grad_norm": 0.3527214527130127, |
|
"learning_rate": 5.445287408633304e-06, |
|
"loss": 0.5469, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 8.983688833124216, |
|
"grad_norm": 0.3579261004924774, |
|
"learning_rate": 4.943251061221721e-06, |
|
"loss": 0.5369, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 9.03387703889586, |
|
"grad_norm": 0.3588533103466034, |
|
"learning_rate": 4.464904848480523e-06, |
|
"loss": 0.5482, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 9.084065244667503, |
|
"grad_norm": 0.3596334457397461, |
|
"learning_rate": 4.0103679843142895e-06, |
|
"loss": 0.5402, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 9.134253450439147, |
|
"grad_norm": 0.35277649760246277, |
|
"learning_rate": 3.5797537488388323e-06, |
|
"loss": 0.5431, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 9.18444165621079, |
|
"grad_norm": 0.35417917370796204, |
|
"learning_rate": 3.1731694601492833e-06, |
|
"loss": 0.5352, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 9.234629861982434, |
|
"grad_norm": 0.36016353964805603, |
|
"learning_rate": 2.7907164475743043e-06, |
|
"loss": 0.5395, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 9.284818067754077, |
|
"grad_norm": 0.36541038751602173, |
|
"learning_rate": 2.4324900264226403e-06, |
|
"loss": 0.5348, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 9.335006273525721, |
|
"grad_norm": 0.36023426055908203, |
|
"learning_rate": 2.098579474228546e-06, |
|
"loss": 0.5324, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 9.385194479297365, |
|
"grad_norm": 0.3567328155040741, |
|
"learning_rate": 1.7890680085019595e-06, |
|
"loss": 0.5341, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 9.435382685069008, |
|
"grad_norm": 0.3682873547077179, |
|
"learning_rate": 1.5040327659889608e-06, |
|
"loss": 0.5382, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 9.485570890840652, |
|
"grad_norm": 0.36713671684265137, |
|
"learning_rate": 1.2435447834476255e-06, |
|
"loss": 0.537, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 9.535759096612296, |
|
"grad_norm": 0.36034858226776123, |
|
"learning_rate": 1.0076689799442873e-06, |
|
"loss": 0.5435, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 9.58594730238394, |
|
"grad_norm": 0.3527128994464874, |
|
"learning_rate": 7.964641406742135e-07, |
|
"loss": 0.5464, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 9.636135508155583, |
|
"grad_norm": 0.3527335226535797, |
|
"learning_rate": 6.099829023112235e-07, |
|
"loss": 0.5396, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 9.686323713927226, |
|
"grad_norm": 0.36540141701698303, |
|
"learning_rate": 4.482717398894165e-07, |
|
"loss": 0.5424, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 9.73651191969887, |
|
"grad_norm": 0.35210534930229187, |
|
"learning_rate": 3.1137095522068007e-07, |
|
"loss": 0.5456, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 9.786700125470514, |
|
"grad_norm": 0.3526809811592102, |
|
"learning_rate": 1.9931466685065847e-07, |
|
"loss": 0.5394, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 9.836888331242157, |
|
"grad_norm": 0.36059680581092834, |
|
"learning_rate": 1.1213080155564326e-07, |
|
"loss": 0.5359, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 9.887076537013801, |
|
"grad_norm": 0.36295098066329956, |
|
"learning_rate": 4.9841087382618276e-08, |
|
"loss": 0.5404, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 9.937264742785445, |
|
"grad_norm": 0.35113370418548584, |
|
"learning_rate": 1.2461048234269079e-08, |
|
"loss": 0.5373, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 9.987452948557088, |
|
"grad_norm": 0.3604467511177063, |
|
"learning_rate": 0.0, |
|
"loss": 0.5361, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 9.987452948557088, |
|
"step": 1990, |
|
"total_flos": 1.0444655785672704e+18, |
|
"train_loss": 0.65177170523447, |
|
"train_runtime": 44465.981, |
|
"train_samples_per_second": 1.432, |
|
"train_steps_per_second": 0.045 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1990, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"total_flos": 1.0444655785672704e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|