|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9997069597069597, |
|
"eval_steps": 100, |
|
"global_step": 1706, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.005860805860805861, |
|
"grad_norm": 2.521568536758423, |
|
"learning_rate": 0.00019941383352872216, |
|
"loss": 2.3132, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.011721611721611722, |
|
"grad_norm": 2.104935884475708, |
|
"learning_rate": 0.00019882766705744433, |
|
"loss": 0.9444, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.017582417582417582, |
|
"grad_norm": 1.2950881719589233, |
|
"learning_rate": 0.00019824150058616647, |
|
"loss": 0.357, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.023443223443223443, |
|
"grad_norm": 1.1862170696258545, |
|
"learning_rate": 0.00019765533411488865, |
|
"loss": 0.2105, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.029304029304029304, |
|
"grad_norm": 0.5247148871421814, |
|
"learning_rate": 0.0001970691676436108, |
|
"loss": 0.1087, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.035164835164835165, |
|
"grad_norm": 1.2535285949707031, |
|
"learning_rate": 0.00019648300117233296, |
|
"loss": 0.1185, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.041025641025641026, |
|
"grad_norm": 0.5407606959342957, |
|
"learning_rate": 0.0001958968347010551, |
|
"loss": 0.0775, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.046886446886446886, |
|
"grad_norm": 1.137266993522644, |
|
"learning_rate": 0.00019531066822977726, |
|
"loss": 0.0728, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.05274725274725275, |
|
"grad_norm": 0.5244052410125732, |
|
"learning_rate": 0.00019472450175849943, |
|
"loss": 0.056, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.05860805860805861, |
|
"grad_norm": 0.4511496126651764, |
|
"learning_rate": 0.00019413833528722157, |
|
"loss": 0.0648, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.06446886446886448, |
|
"grad_norm": 0.33913975954055786, |
|
"learning_rate": 0.00019355216881594375, |
|
"loss": 0.0479, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.07032967032967033, |
|
"grad_norm": 0.354777991771698, |
|
"learning_rate": 0.0001929660023446659, |
|
"loss": 0.0758, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0761904761904762, |
|
"grad_norm": 0.3968910276889801, |
|
"learning_rate": 0.00019237983587338807, |
|
"loss": 0.0776, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.08205128205128205, |
|
"grad_norm": 0.5061652660369873, |
|
"learning_rate": 0.0001917936694021102, |
|
"loss": 0.0349, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.08791208791208792, |
|
"grad_norm": 0.24455250799655914, |
|
"learning_rate": 0.00019120750293083236, |
|
"loss": 0.0369, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.09377289377289377, |
|
"grad_norm": 0.3959537446498871, |
|
"learning_rate": 0.00019062133645955453, |
|
"loss": 0.0513, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.09963369963369964, |
|
"grad_norm": 0.4534469544887543, |
|
"learning_rate": 0.00019003516998827668, |
|
"loss": 0.0459, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.1054945054945055, |
|
"grad_norm": 0.30694451928138733, |
|
"learning_rate": 0.00018944900351699885, |
|
"loss": 0.0377, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.11135531135531136, |
|
"grad_norm": 0.15078052878379822, |
|
"learning_rate": 0.000188862837045721, |
|
"loss": 0.0297, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.11721611721611722, |
|
"grad_norm": 0.3153330981731415, |
|
"learning_rate": 0.00018827667057444317, |
|
"loss": 0.0301, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.11721611721611722, |
|
"eval_loss": 0.028654273599386215, |
|
"eval_runtime": 4.8016, |
|
"eval_samples_per_second": 5.831, |
|
"eval_steps_per_second": 2.916, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.12307692307692308, |
|
"grad_norm": 0.5777396559715271, |
|
"learning_rate": 0.0001876905041031653, |
|
"loss": 0.0515, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.12893772893772895, |
|
"grad_norm": 0.3677718937397003, |
|
"learning_rate": 0.00018710433763188746, |
|
"loss": 0.0315, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.1347985347985348, |
|
"grad_norm": 0.30658507347106934, |
|
"learning_rate": 0.0001865181711606096, |
|
"loss": 0.0248, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.14065934065934066, |
|
"grad_norm": 0.21649648249149323, |
|
"learning_rate": 0.00018593200468933178, |
|
"loss": 0.0352, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.14652014652014653, |
|
"grad_norm": 0.2877885103225708, |
|
"learning_rate": 0.00018534583821805395, |
|
"loss": 0.0456, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.1523809523809524, |
|
"grad_norm": 0.1782904863357544, |
|
"learning_rate": 0.0001847596717467761, |
|
"loss": 0.0257, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.15824175824175823, |
|
"grad_norm": 0.1211300641298294, |
|
"learning_rate": 0.00018417350527549827, |
|
"loss": 0.0437, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.1641025641025641, |
|
"grad_norm": 0.298053503036499, |
|
"learning_rate": 0.0001835873388042204, |
|
"loss": 0.036, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.16996336996336997, |
|
"grad_norm": 0.2280658483505249, |
|
"learning_rate": 0.00018300117233294256, |
|
"loss": 0.0331, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.17582417582417584, |
|
"grad_norm": 0.14896267652511597, |
|
"learning_rate": 0.0001824150058616647, |
|
"loss": 0.0239, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.18168498168498168, |
|
"grad_norm": 0.1769961267709732, |
|
"learning_rate": 0.00018182883939038688, |
|
"loss": 0.0375, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.18754578754578755, |
|
"grad_norm": 0.19849297404289246, |
|
"learning_rate": 0.00018124267291910902, |
|
"loss": 0.0357, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.1934065934065934, |
|
"grad_norm": 0.2294420450925827, |
|
"learning_rate": 0.0001806565064478312, |
|
"loss": 0.0504, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.19926739926739928, |
|
"grad_norm": 0.10744224488735199, |
|
"learning_rate": 0.00018007033997655337, |
|
"loss": 0.0209, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.20512820512820512, |
|
"grad_norm": 0.06066066771745682, |
|
"learning_rate": 0.00017948417350527551, |
|
"loss": 0.0175, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.210989010989011, |
|
"grad_norm": 0.5421485304832458, |
|
"learning_rate": 0.00017889800703399766, |
|
"loss": 0.0398, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.21684981684981686, |
|
"grad_norm": 0.1306767612695694, |
|
"learning_rate": 0.0001783118405627198, |
|
"loss": 0.0258, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.22271062271062272, |
|
"grad_norm": 0.11510124802589417, |
|
"learning_rate": 0.00017772567409144198, |
|
"loss": 0.0253, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.22857142857142856, |
|
"grad_norm": 0.20577751100063324, |
|
"learning_rate": 0.00017713950762016412, |
|
"loss": 0.0277, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.23443223443223443, |
|
"grad_norm": 0.20588932931423187, |
|
"learning_rate": 0.0001765533411488863, |
|
"loss": 0.0296, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.23443223443223443, |
|
"eval_loss": 0.019306689500808716, |
|
"eval_runtime": 4.7344, |
|
"eval_samples_per_second": 5.914, |
|
"eval_steps_per_second": 2.957, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2402930402930403, |
|
"grad_norm": 0.13713975250720978, |
|
"learning_rate": 0.00017596717467760847, |
|
"loss": 0.0372, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.24615384615384617, |
|
"grad_norm": 0.14788508415222168, |
|
"learning_rate": 0.00017538100820633061, |
|
"loss": 0.033, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.252014652014652, |
|
"grad_norm": 0.2372630536556244, |
|
"learning_rate": 0.00017479484173505276, |
|
"loss": 0.0281, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.2578754578754579, |
|
"grad_norm": 0.2357954978942871, |
|
"learning_rate": 0.0001742086752637749, |
|
"loss": 0.0295, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.26373626373626374, |
|
"grad_norm": 0.2537606358528137, |
|
"learning_rate": 0.00017362250879249708, |
|
"loss": 0.036, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.2695970695970696, |
|
"grad_norm": 0.2222289741039276, |
|
"learning_rate": 0.00017303634232121922, |
|
"loss": 0.0402, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.2754578754578755, |
|
"grad_norm": 0.19696177542209625, |
|
"learning_rate": 0.0001724501758499414, |
|
"loss": 0.025, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.2813186813186813, |
|
"grad_norm": 0.08915109932422638, |
|
"learning_rate": 0.00017186400937866357, |
|
"loss": 0.0292, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.28717948717948716, |
|
"grad_norm": 0.39625948667526245, |
|
"learning_rate": 0.00017127784290738572, |
|
"loss": 0.0324, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.29304029304029305, |
|
"grad_norm": 0.05654177442193031, |
|
"learning_rate": 0.00017069167643610786, |
|
"loss": 0.0384, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.2989010989010989, |
|
"grad_norm": 0.23707075417041779, |
|
"learning_rate": 0.00017010550996483, |
|
"loss": 0.0311, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.3047619047619048, |
|
"grad_norm": 0.2619571387767792, |
|
"learning_rate": 0.00016951934349355218, |
|
"loss": 0.0275, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.31062271062271063, |
|
"grad_norm": 0.11028550565242767, |
|
"learning_rate": 0.00016893317702227432, |
|
"loss": 0.0194, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.31648351648351647, |
|
"grad_norm": 0.26710912585258484, |
|
"learning_rate": 0.0001683470105509965, |
|
"loss": 0.0318, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.32234432234432236, |
|
"grad_norm": 0.20064710080623627, |
|
"learning_rate": 0.00016776084407971864, |
|
"loss": 0.0517, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.3282051282051282, |
|
"grad_norm": 0.06760745495557785, |
|
"learning_rate": 0.00016717467760844082, |
|
"loss": 0.0223, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.33406593406593404, |
|
"grad_norm": 0.14518442749977112, |
|
"learning_rate": 0.00016658851113716296, |
|
"loss": 0.0216, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.33992673992673994, |
|
"grad_norm": 0.35427016019821167, |
|
"learning_rate": 0.0001660023446658851, |
|
"loss": 0.0268, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.3457875457875458, |
|
"grad_norm": 0.14693213999271393, |
|
"learning_rate": 0.00016541617819460728, |
|
"loss": 0.0246, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.3516483516483517, |
|
"grad_norm": 0.2351713478565216, |
|
"learning_rate": 0.00016483001172332943, |
|
"loss": 0.0399, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3516483516483517, |
|
"eval_loss": 0.02264154888689518, |
|
"eval_runtime": 4.8013, |
|
"eval_samples_per_second": 5.832, |
|
"eval_steps_per_second": 2.916, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3575091575091575, |
|
"grad_norm": 0.16985514760017395, |
|
"learning_rate": 0.0001642438452520516, |
|
"loss": 0.0243, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.36336996336996336, |
|
"grad_norm": 0.199642613530159, |
|
"learning_rate": 0.00016365767878077374, |
|
"loss": 0.0329, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.36923076923076925, |
|
"grad_norm": 0.05538804084062576, |
|
"learning_rate": 0.00016307151230949592, |
|
"loss": 0.0474, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.3750915750915751, |
|
"grad_norm": 0.1542443037033081, |
|
"learning_rate": 0.00016248534583821806, |
|
"loss": 0.0232, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.38095238095238093, |
|
"grad_norm": 0.16808335483074188, |
|
"learning_rate": 0.0001618991793669402, |
|
"loss": 0.0272, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.3868131868131868, |
|
"grad_norm": 0.20816853642463684, |
|
"learning_rate": 0.00016131301289566238, |
|
"loss": 0.0375, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.39267399267399267, |
|
"grad_norm": 0.10338038206100464, |
|
"learning_rate": 0.00016072684642438453, |
|
"loss": 0.0233, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.39853479853479856, |
|
"grad_norm": 0.06522126495838165, |
|
"learning_rate": 0.0001601406799531067, |
|
"loss": 0.0288, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.4043956043956044, |
|
"grad_norm": 0.11830403655767441, |
|
"learning_rate": 0.00015955451348182884, |
|
"loss": 0.0164, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.41025641025641024, |
|
"grad_norm": 0.2777006924152374, |
|
"learning_rate": 0.00015896834701055102, |
|
"loss": 0.0345, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.41611721611721614, |
|
"grad_norm": 0.1277918815612793, |
|
"learning_rate": 0.00015838218053927316, |
|
"loss": 0.0229, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.421978021978022, |
|
"grad_norm": 0.09861145913600922, |
|
"learning_rate": 0.0001577960140679953, |
|
"loss": 0.0181, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.4278388278388278, |
|
"grad_norm": 0.08698171377182007, |
|
"learning_rate": 0.00015720984759671748, |
|
"loss": 0.0365, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.4336996336996337, |
|
"grad_norm": 0.23488883674144745, |
|
"learning_rate": 0.00015662368112543963, |
|
"loss": 0.0352, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.43956043956043955, |
|
"grad_norm": 0.05140375718474388, |
|
"learning_rate": 0.0001560375146541618, |
|
"loss": 0.0235, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.44542124542124545, |
|
"grad_norm": 0.15759135782718658, |
|
"learning_rate": 0.00015545134818288395, |
|
"loss": 0.0236, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.4512820512820513, |
|
"grad_norm": 0.07523085922002792, |
|
"learning_rate": 0.00015486518171160612, |
|
"loss": 0.0218, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.45714285714285713, |
|
"grad_norm": 0.1892630159854889, |
|
"learning_rate": 0.00015427901524032826, |
|
"loss": 0.0264, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.463003663003663, |
|
"grad_norm": 0.12749487161636353, |
|
"learning_rate": 0.0001536928487690504, |
|
"loss": 0.0414, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.46886446886446886, |
|
"grad_norm": 0.11519593745470047, |
|
"learning_rate": 0.00015310668229777258, |
|
"loss": 0.0173, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.46886446886446886, |
|
"eval_loss": 0.01765686459839344, |
|
"eval_runtime": 4.8182, |
|
"eval_samples_per_second": 5.811, |
|
"eval_steps_per_second": 2.906, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.4747252747252747, |
|
"grad_norm": 0.14240577816963196, |
|
"learning_rate": 0.00015252051582649473, |
|
"loss": 0.0214, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.4805860805860806, |
|
"grad_norm": 0.11283282935619354, |
|
"learning_rate": 0.0001519343493552169, |
|
"loss": 0.0266, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.48644688644688644, |
|
"grad_norm": 0.16279707849025726, |
|
"learning_rate": 0.00015134818288393905, |
|
"loss": 0.0274, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.49230769230769234, |
|
"grad_norm": 0.07319923490285873, |
|
"learning_rate": 0.00015076201641266122, |
|
"loss": 0.0216, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.4981684981684982, |
|
"grad_norm": 0.13532328605651855, |
|
"learning_rate": 0.00015017584994138336, |
|
"loss": 0.0308, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.504029304029304, |
|
"grad_norm": 0.12896127998828888, |
|
"learning_rate": 0.0001495896834701055, |
|
"loss": 0.0129, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.5098901098901099, |
|
"grad_norm": 0.03718201071023941, |
|
"learning_rate": 0.00014900351699882766, |
|
"loss": 0.0181, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.5157509157509158, |
|
"grad_norm": 0.05926808714866638, |
|
"learning_rate": 0.00014841735052754983, |
|
"loss": 0.0146, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.5216117216117216, |
|
"grad_norm": 0.09976931661367416, |
|
"learning_rate": 0.00014783118405627197, |
|
"loss": 0.0231, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.5274725274725275, |
|
"grad_norm": 0.18460391461849213, |
|
"learning_rate": 0.00014724501758499415, |
|
"loss": 0.02, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 0.07457377016544342, |
|
"learning_rate": 0.00014665885111371632, |
|
"loss": 0.0209, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.5391941391941392, |
|
"grad_norm": 0.03992030769586563, |
|
"learning_rate": 0.00014607268464243847, |
|
"loss": 0.0251, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.545054945054945, |
|
"grad_norm": 0.196414053440094, |
|
"learning_rate": 0.0001454865181711606, |
|
"loss": 0.0344, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.550915750915751, |
|
"grad_norm": 0.19978556036949158, |
|
"learning_rate": 0.00014490035169988276, |
|
"loss": 0.0159, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.5567765567765568, |
|
"grad_norm": 0.11389517784118652, |
|
"learning_rate": 0.00014431418522860493, |
|
"loss": 0.0197, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.5626373626373626, |
|
"grad_norm": 0.07047716528177261, |
|
"learning_rate": 0.00014372801875732708, |
|
"loss": 0.0138, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.5684981684981685, |
|
"grad_norm": 0.10614708811044693, |
|
"learning_rate": 0.00014314185228604925, |
|
"loss": 0.0208, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.5743589743589743, |
|
"grad_norm": 0.20569799840450287, |
|
"learning_rate": 0.00014255568581477142, |
|
"loss": 0.0203, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.5802197802197803, |
|
"grad_norm": 0.19309553503990173, |
|
"learning_rate": 0.00014196951934349357, |
|
"loss": 0.0282, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.5860805860805861, |
|
"grad_norm": 0.07542768865823746, |
|
"learning_rate": 0.0001413833528722157, |
|
"loss": 0.0173, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5860805860805861, |
|
"eval_loss": 0.022126102820038795, |
|
"eval_runtime": 4.8755, |
|
"eval_samples_per_second": 5.743, |
|
"eval_steps_per_second": 2.872, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.591941391941392, |
|
"grad_norm": 0.1205630674958229, |
|
"learning_rate": 0.00014079718640093786, |
|
"loss": 0.0322, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.5978021978021978, |
|
"grad_norm": 0.13761042058467865, |
|
"learning_rate": 0.00014021101992966003, |
|
"loss": 0.0203, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.6036630036630036, |
|
"grad_norm": 0.08595598489046097, |
|
"learning_rate": 0.00013962485345838218, |
|
"loss": 0.0145, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.6095238095238096, |
|
"grad_norm": 0.11087319999933243, |
|
"learning_rate": 0.00013903868698710435, |
|
"loss": 0.0218, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"grad_norm": 0.14962054789066315, |
|
"learning_rate": 0.00013845252051582652, |
|
"loss": 0.0322, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.6212454212454213, |
|
"grad_norm": 0.07573894411325455, |
|
"learning_rate": 0.00013786635404454867, |
|
"loss": 0.0275, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.6271062271062271, |
|
"grad_norm": 0.069780133664608, |
|
"learning_rate": 0.0001372801875732708, |
|
"loss": 0.0235, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.6329670329670329, |
|
"grad_norm": 0.07833613455295563, |
|
"learning_rate": 0.00013669402110199296, |
|
"loss": 0.0344, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.6388278388278388, |
|
"grad_norm": 0.07331829518079758, |
|
"learning_rate": 0.00013610785463071513, |
|
"loss": 0.0135, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.6446886446886447, |
|
"grad_norm": 0.22369089722633362, |
|
"learning_rate": 0.00013552168815943728, |
|
"loss": 0.0222, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.6505494505494506, |
|
"grad_norm": 0.1465146392583847, |
|
"learning_rate": 0.00013493552168815945, |
|
"loss": 0.0307, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.6564102564102564, |
|
"grad_norm": 0.06348715722560883, |
|
"learning_rate": 0.00013434935521688162, |
|
"loss": 0.0358, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.6622710622710622, |
|
"grad_norm": 0.09298256784677505, |
|
"learning_rate": 0.00013376318874560377, |
|
"loss": 0.0224, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.6681318681318681, |
|
"grad_norm": 0.18280836939811707, |
|
"learning_rate": 0.00013317702227432591, |
|
"loss": 0.0263, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.673992673992674, |
|
"grad_norm": 0.07080171257257462, |
|
"learning_rate": 0.00013259085580304806, |
|
"loss": 0.0192, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.6798534798534799, |
|
"grad_norm": 0.11019092798233032, |
|
"learning_rate": 0.00013200468933177023, |
|
"loss": 0.0211, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.6857142857142857, |
|
"grad_norm": 0.13162659108638763, |
|
"learning_rate": 0.00013141852286049238, |
|
"loss": 0.0284, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.6915750915750916, |
|
"grad_norm": 0.19205588102340698, |
|
"learning_rate": 0.00013083235638921455, |
|
"loss": 0.0245, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.6974358974358974, |
|
"grad_norm": 0.0971173569560051, |
|
"learning_rate": 0.0001302461899179367, |
|
"loss": 0.0216, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.7032967032967034, |
|
"grad_norm": 0.2179749310016632, |
|
"learning_rate": 0.00012966002344665887, |
|
"loss": 0.0268, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.7032967032967034, |
|
"eval_loss": 0.01850169710814953, |
|
"eval_runtime": 4.6688, |
|
"eval_samples_per_second": 5.997, |
|
"eval_steps_per_second": 2.999, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.7091575091575092, |
|
"grad_norm": 0.23079490661621094, |
|
"learning_rate": 0.00012907385697538101, |
|
"loss": 0.0269, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.715018315018315, |
|
"grad_norm": 0.08325810730457306, |
|
"learning_rate": 0.00012848769050410316, |
|
"loss": 0.0211, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.7208791208791209, |
|
"grad_norm": 0.05983910337090492, |
|
"learning_rate": 0.00012790152403282533, |
|
"loss": 0.0217, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.7267399267399267, |
|
"grad_norm": 0.13831888139247894, |
|
"learning_rate": 0.00012731535756154748, |
|
"loss": 0.0142, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.7326007326007326, |
|
"grad_norm": 0.12296965718269348, |
|
"learning_rate": 0.00012672919109026965, |
|
"loss": 0.0253, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.7384615384615385, |
|
"grad_norm": 0.13777951896190643, |
|
"learning_rate": 0.0001261430246189918, |
|
"loss": 0.0214, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.7443223443223443, |
|
"grad_norm": 0.12136834859848022, |
|
"learning_rate": 0.00012555685814771397, |
|
"loss": 0.0244, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.7501831501831502, |
|
"grad_norm": 0.050576552748680115, |
|
"learning_rate": 0.00012497069167643612, |
|
"loss": 0.0137, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.756043956043956, |
|
"grad_norm": 0.22222141921520233, |
|
"learning_rate": 0.00012438452520515826, |
|
"loss": 0.0254, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.7619047619047619, |
|
"grad_norm": 0.06815624237060547, |
|
"learning_rate": 0.00012379835873388043, |
|
"loss": 0.0231, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.7677655677655678, |
|
"grad_norm": 0.19518345594406128, |
|
"learning_rate": 0.00012321219226260258, |
|
"loss": 0.0218, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.7736263736263737, |
|
"grad_norm": 0.06349798291921616, |
|
"learning_rate": 0.00012262602579132475, |
|
"loss": 0.0265, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.7794871794871795, |
|
"grad_norm": 0.09031341969966888, |
|
"learning_rate": 0.00012203985932004688, |
|
"loss": 0.0299, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.7853479853479853, |
|
"grad_norm": 0.060232892632484436, |
|
"learning_rate": 0.00012145369284876906, |
|
"loss": 0.0227, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.7912087912087912, |
|
"grad_norm": 0.23972396552562714, |
|
"learning_rate": 0.00012086752637749122, |
|
"loss": 0.0233, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.7970695970695971, |
|
"grad_norm": 0.06141636520624161, |
|
"learning_rate": 0.00012028135990621336, |
|
"loss": 0.017, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.802930402930403, |
|
"grad_norm": 0.05603253096342087, |
|
"learning_rate": 0.00011969519343493553, |
|
"loss": 0.025, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.8087912087912088, |
|
"grad_norm": 0.06840907782316208, |
|
"learning_rate": 0.0001191090269636577, |
|
"loss": 0.0164, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.8146520146520146, |
|
"grad_norm": 0.1270790845155716, |
|
"learning_rate": 0.00011852286049237984, |
|
"loss": 0.0237, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.8205128205128205, |
|
"grad_norm": 0.03222518041729927, |
|
"learning_rate": 0.00011793669402110198, |
|
"loss": 0.0169, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.8205128205128205, |
|
"eval_loss": 0.026108432561159134, |
|
"eval_runtime": 4.6865, |
|
"eval_samples_per_second": 5.975, |
|
"eval_steps_per_second": 2.987, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.8263736263736263, |
|
"grad_norm": 0.06273896247148514, |
|
"learning_rate": 0.00011735052754982416, |
|
"loss": 0.0245, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.8322344322344323, |
|
"grad_norm": 0.1422451138496399, |
|
"learning_rate": 0.00011676436107854632, |
|
"loss": 0.0218, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.8380952380952381, |
|
"grad_norm": 0.07166247069835663, |
|
"learning_rate": 0.00011617819460726846, |
|
"loss": 0.0259, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.843956043956044, |
|
"grad_norm": 0.13188450038433075, |
|
"learning_rate": 0.00011559202813599064, |
|
"loss": 0.0185, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.8498168498168498, |
|
"grad_norm": 0.11839079111814499, |
|
"learning_rate": 0.0001150058616647128, |
|
"loss": 0.0196, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.8556776556776556, |
|
"grad_norm": 0.09421879053115845, |
|
"learning_rate": 0.00011441969519343494, |
|
"loss": 0.0207, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.8615384615384616, |
|
"grad_norm": 0.11722107976675034, |
|
"learning_rate": 0.00011383352872215709, |
|
"loss": 0.0286, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.8673992673992674, |
|
"grad_norm": 0.07790110260248184, |
|
"learning_rate": 0.00011324736225087926, |
|
"loss": 0.0157, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.8732600732600733, |
|
"grad_norm": 0.11153840273618698, |
|
"learning_rate": 0.00011266119577960142, |
|
"loss": 0.0184, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.8791208791208791, |
|
"grad_norm": 0.07105362415313721, |
|
"learning_rate": 0.00011207502930832356, |
|
"loss": 0.0193, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.884981684981685, |
|
"grad_norm": 0.11616308242082596, |
|
"learning_rate": 0.00011148886283704571, |
|
"loss": 0.0219, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.8908424908424909, |
|
"grad_norm": 0.10045047104358673, |
|
"learning_rate": 0.0001109026963657679, |
|
"loss": 0.0177, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.8967032967032967, |
|
"grad_norm": 0.07033990323543549, |
|
"learning_rate": 0.00011031652989449004, |
|
"loss": 0.0227, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.9025641025641026, |
|
"grad_norm": 0.07648850232362747, |
|
"learning_rate": 0.00010973036342321219, |
|
"loss": 0.023, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.9084249084249084, |
|
"grad_norm": 0.05392804369330406, |
|
"learning_rate": 0.00010914419695193436, |
|
"loss": 0.0136, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.9142857142857143, |
|
"grad_norm": 0.17311276495456696, |
|
"learning_rate": 0.00010855803048065652, |
|
"loss": 0.0257, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.9201465201465201, |
|
"grad_norm": 0.07022574543952942, |
|
"learning_rate": 0.00010797186400937866, |
|
"loss": 0.0282, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.926007326007326, |
|
"grad_norm": 0.15858297049999237, |
|
"learning_rate": 0.00010738569753810081, |
|
"loss": 0.0219, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.9318681318681319, |
|
"grad_norm": 0.06796769052743912, |
|
"learning_rate": 0.00010679953106682298, |
|
"loss": 0.0288, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.9377289377289377, |
|
"grad_norm": 0.11868051439523697, |
|
"learning_rate": 0.00010621336459554514, |
|
"loss": 0.0248, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.9377289377289377, |
|
"eval_loss": 0.019950957968831062, |
|
"eval_runtime": 4.8291, |
|
"eval_samples_per_second": 5.798, |
|
"eval_steps_per_second": 2.899, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.9435897435897436, |
|
"grad_norm": 0.19787561893463135, |
|
"learning_rate": 0.00010562719812426729, |
|
"loss": 0.0249, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.9494505494505494, |
|
"grad_norm": 0.06437662243843079, |
|
"learning_rate": 0.00010504103165298946, |
|
"loss": 0.0141, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.9553113553113554, |
|
"grad_norm": 0.09178975969552994, |
|
"learning_rate": 0.00010445486518171162, |
|
"loss": 0.0218, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.9611721611721612, |
|
"grad_norm": 0.09567834436893463, |
|
"learning_rate": 0.00010386869871043376, |
|
"loss": 0.0229, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.967032967032967, |
|
"grad_norm": 0.039594829082489014, |
|
"learning_rate": 0.00010328253223915591, |
|
"loss": 0.0186, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.9728937728937729, |
|
"grad_norm": 0.18495650589466095, |
|
"learning_rate": 0.00010269636576787808, |
|
"loss": 0.0237, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.9787545787545787, |
|
"grad_norm": 0.1861388385295868, |
|
"learning_rate": 0.00010211019929660024, |
|
"loss": 0.0367, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.9846153846153847, |
|
"grad_norm": 0.05491223558783531, |
|
"learning_rate": 0.00010152403282532239, |
|
"loss": 0.015, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.9904761904761905, |
|
"grad_norm": 0.04110349714756012, |
|
"learning_rate": 0.00010093786635404456, |
|
"loss": 0.0258, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.9963369963369964, |
|
"grad_norm": 0.07649147510528564, |
|
"learning_rate": 0.00010035169988276672, |
|
"loss": 0.0235, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.0021978021978022, |
|
"grad_norm": 0.11078579723834991, |
|
"learning_rate": 9.976553341148887e-05, |
|
"loss": 0.0204, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 1.008058608058608, |
|
"grad_norm": 0.08302613347768784, |
|
"learning_rate": 9.917936694021102e-05, |
|
"loss": 0.0188, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.0139194139194139, |
|
"grad_norm": 0.19045108556747437, |
|
"learning_rate": 9.859320046893318e-05, |
|
"loss": 0.0226, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 1.0197802197802197, |
|
"grad_norm": 0.04657626897096634, |
|
"learning_rate": 9.800703399765534e-05, |
|
"loss": 0.0205, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.0256410256410255, |
|
"grad_norm": 0.10237232595682144, |
|
"learning_rate": 9.742086752637749e-05, |
|
"loss": 0.0259, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 1.0315018315018314, |
|
"grad_norm": 0.1746947020292282, |
|
"learning_rate": 9.683470105509965e-05, |
|
"loss": 0.0229, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.0373626373626375, |
|
"grad_norm": 0.1032433807849884, |
|
"learning_rate": 9.624853458382182e-05, |
|
"loss": 0.0277, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 1.0432234432234433, |
|
"grad_norm": 0.08428288996219635, |
|
"learning_rate": 9.566236811254397e-05, |
|
"loss": 0.0161, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.0490842490842491, |
|
"grad_norm": 0.16661523282527924, |
|
"learning_rate": 9.507620164126613e-05, |
|
"loss": 0.0212, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 1.054945054945055, |
|
"grad_norm": 0.08473166078329086, |
|
"learning_rate": 9.449003516998827e-05, |
|
"loss": 0.0285, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.054945054945055, |
|
"eval_loss": 0.018958253785967827, |
|
"eval_runtime": 4.8179, |
|
"eval_samples_per_second": 5.812, |
|
"eval_steps_per_second": 2.906, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.0608058608058608, |
|
"grad_norm": 0.04149739816784859, |
|
"learning_rate": 9.390386869871044e-05, |
|
"loss": 0.0138, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 1.0666666666666667, |
|
"grad_norm": 0.05616866052150726, |
|
"learning_rate": 9.331770222743259e-05, |
|
"loss": 0.0194, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.0725274725274725, |
|
"grad_norm": 0.16730394959449768, |
|
"learning_rate": 9.273153575615475e-05, |
|
"loss": 0.0359, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 1.0783882783882783, |
|
"grad_norm": 0.06336849927902222, |
|
"learning_rate": 9.214536928487691e-05, |
|
"loss": 0.0147, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.0842490842490842, |
|
"grad_norm": 0.0882687047123909, |
|
"learning_rate": 9.155920281359907e-05, |
|
"loss": 0.015, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 1.09010989010989, |
|
"grad_norm": 0.05476200208067894, |
|
"learning_rate": 9.097303634232123e-05, |
|
"loss": 0.019, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.095970695970696, |
|
"grad_norm": 0.05358652025461197, |
|
"learning_rate": 9.038686987104337e-05, |
|
"loss": 0.0174, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 1.101831501831502, |
|
"grad_norm": 0.24095569550991058, |
|
"learning_rate": 8.980070339976554e-05, |
|
"loss": 0.0293, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.1076923076923078, |
|
"grad_norm": 0.06653840094804764, |
|
"learning_rate": 8.921453692848769e-05, |
|
"loss": 0.0133, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 1.1135531135531136, |
|
"grad_norm": 0.08839567750692368, |
|
"learning_rate": 8.862837045720985e-05, |
|
"loss": 0.0208, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.1194139194139194, |
|
"grad_norm": 0.032916922122240067, |
|
"learning_rate": 8.804220398593201e-05, |
|
"loss": 0.0267, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 1.1252747252747253, |
|
"grad_norm": 0.11819420754909515, |
|
"learning_rate": 8.745603751465417e-05, |
|
"loss": 0.0384, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.1311355311355311, |
|
"grad_norm": 0.06757565587759018, |
|
"learning_rate": 8.686987104337633e-05, |
|
"loss": 0.0135, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 1.136996336996337, |
|
"grad_norm": 0.0970228835940361, |
|
"learning_rate": 8.628370457209847e-05, |
|
"loss": 0.0166, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.1428571428571428, |
|
"grad_norm": 0.13436350226402283, |
|
"learning_rate": 8.569753810082065e-05, |
|
"loss": 0.016, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 1.1487179487179486, |
|
"grad_norm": 0.10584839433431625, |
|
"learning_rate": 8.511137162954279e-05, |
|
"loss": 0.0177, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.1545787545787545, |
|
"grad_norm": 0.21206024289131165, |
|
"learning_rate": 8.452520515826495e-05, |
|
"loss": 0.0325, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 1.1604395604395605, |
|
"grad_norm": 0.04815613850951195, |
|
"learning_rate": 8.393903868698711e-05, |
|
"loss": 0.0137, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.1663003663003664, |
|
"grad_norm": 0.07466138154268265, |
|
"learning_rate": 8.335287221570927e-05, |
|
"loss": 0.0173, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 1.1721611721611722, |
|
"grad_norm": 0.09366811066865921, |
|
"learning_rate": 8.276670574443143e-05, |
|
"loss": 0.025, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.1721611721611722, |
|
"eval_loss": 0.019907595589756966, |
|
"eval_runtime": 4.819, |
|
"eval_samples_per_second": 5.81, |
|
"eval_steps_per_second": 2.905, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.178021978021978, |
|
"grad_norm": 0.08527784794569016, |
|
"learning_rate": 8.218053927315357e-05, |
|
"loss": 0.0208, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 1.183882783882784, |
|
"grad_norm": 0.08328138291835785, |
|
"learning_rate": 8.159437280187575e-05, |
|
"loss": 0.0216, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.1897435897435897, |
|
"grad_norm": 0.05000188946723938, |
|
"learning_rate": 8.100820633059789e-05, |
|
"loss": 0.0211, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 1.1956043956043956, |
|
"grad_norm": 0.028807902708649635, |
|
"learning_rate": 8.042203985932005e-05, |
|
"loss": 0.0096, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.2014652014652014, |
|
"grad_norm": 0.20507606863975525, |
|
"learning_rate": 7.983587338804221e-05, |
|
"loss": 0.0222, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 1.2073260073260073, |
|
"grad_norm": 0.04885656014084816, |
|
"learning_rate": 7.924970691676437e-05, |
|
"loss": 0.0215, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.213186813186813, |
|
"grad_norm": 0.047489382326602936, |
|
"learning_rate": 7.866354044548652e-05, |
|
"loss": 0.0178, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 1.2190476190476192, |
|
"grad_norm": 0.05971779301762581, |
|
"learning_rate": 7.807737397420867e-05, |
|
"loss": 0.0176, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.224908424908425, |
|
"grad_norm": 0.04695736616849899, |
|
"learning_rate": 7.749120750293083e-05, |
|
"loss": 0.0148, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 1.2307692307692308, |
|
"grad_norm": 0.08131909370422363, |
|
"learning_rate": 7.690504103165299e-05, |
|
"loss": 0.0201, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.2366300366300367, |
|
"grad_norm": 0.06954577565193176, |
|
"learning_rate": 7.631887456037515e-05, |
|
"loss": 0.0149, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 1.2424908424908425, |
|
"grad_norm": 0.054430391639471054, |
|
"learning_rate": 7.57327080890973e-05, |
|
"loss": 0.0078, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.2483516483516484, |
|
"grad_norm": 0.12231959402561188, |
|
"learning_rate": 7.514654161781947e-05, |
|
"loss": 0.0258, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 1.2542124542124542, |
|
"grad_norm": 0.04983118548989296, |
|
"learning_rate": 7.456037514654162e-05, |
|
"loss": 0.0189, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.26007326007326, |
|
"grad_norm": 0.11981873214244843, |
|
"learning_rate": 7.397420867526378e-05, |
|
"loss": 0.0156, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 1.2659340659340659, |
|
"grad_norm": 0.03829724341630936, |
|
"learning_rate": 7.338804220398593e-05, |
|
"loss": 0.0162, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.2717948717948717, |
|
"grad_norm": 0.1572490632534027, |
|
"learning_rate": 7.28018757327081e-05, |
|
"loss": 0.0188, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 1.2776556776556776, |
|
"grad_norm": 0.122514508664608, |
|
"learning_rate": 7.221570926143025e-05, |
|
"loss": 0.0229, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.2835164835164834, |
|
"grad_norm": 0.06537042558193207, |
|
"learning_rate": 7.16295427901524e-05, |
|
"loss": 0.0222, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 1.2893772893772895, |
|
"grad_norm": 0.1269371509552002, |
|
"learning_rate": 7.104337631887457e-05, |
|
"loss": 0.0272, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.2893772893772895, |
|
"eval_loss": 0.015397748909890652, |
|
"eval_runtime": 4.7526, |
|
"eval_samples_per_second": 5.892, |
|
"eval_steps_per_second": 2.946, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.2952380952380953, |
|
"grad_norm": 0.07507819682359695, |
|
"learning_rate": 7.045720984759672e-05, |
|
"loss": 0.0256, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 1.3010989010989011, |
|
"grad_norm": 0.04198193550109863, |
|
"learning_rate": 6.987104337631888e-05, |
|
"loss": 0.0102, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.306959706959707, |
|
"grad_norm": 0.053751297295093536, |
|
"learning_rate": 6.928487690504104e-05, |
|
"loss": 0.0141, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 1.3128205128205128, |
|
"grad_norm": 0.12076237797737122, |
|
"learning_rate": 6.86987104337632e-05, |
|
"loss": 0.0165, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.3186813186813187, |
|
"grad_norm": 0.0769004300236702, |
|
"learning_rate": 6.811254396248535e-05, |
|
"loss": 0.0191, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 1.3245421245421245, |
|
"grad_norm": 0.08021704852581024, |
|
"learning_rate": 6.75263774912075e-05, |
|
"loss": 0.0253, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.3304029304029303, |
|
"grad_norm": 0.09786754846572876, |
|
"learning_rate": 6.694021101992967e-05, |
|
"loss": 0.0191, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 1.3362637362637364, |
|
"grad_norm": 0.06878714263439178, |
|
"learning_rate": 6.635404454865182e-05, |
|
"loss": 0.0326, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.3421245421245422, |
|
"grad_norm": 0.11297193914651871, |
|
"learning_rate": 6.576787807737398e-05, |
|
"loss": 0.0185, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 1.347985347985348, |
|
"grad_norm": 0.10731342434883118, |
|
"learning_rate": 6.518171160609614e-05, |
|
"loss": 0.0168, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.353846153846154, |
|
"grad_norm": 0.08888328820466995, |
|
"learning_rate": 6.45955451348183e-05, |
|
"loss": 0.0182, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 1.3597069597069598, |
|
"grad_norm": 0.1666301190853119, |
|
"learning_rate": 6.400937866354045e-05, |
|
"loss": 0.0254, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.3655677655677656, |
|
"grad_norm": 0.1334419697523117, |
|
"learning_rate": 6.34232121922626e-05, |
|
"loss": 0.0215, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 1.3714285714285714, |
|
"grad_norm": 0.05616243556141853, |
|
"learning_rate": 6.283704572098477e-05, |
|
"loss": 0.0156, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.3772893772893773, |
|
"grad_norm": 0.1660885214805603, |
|
"learning_rate": 6.225087924970692e-05, |
|
"loss": 0.0241, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 1.3831501831501831, |
|
"grad_norm": 0.09245380759239197, |
|
"learning_rate": 6.166471277842908e-05, |
|
"loss": 0.0222, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.389010989010989, |
|
"grad_norm": 0.08635041117668152, |
|
"learning_rate": 6.107854630715122e-05, |
|
"loss": 0.0203, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 1.3948717948717948, |
|
"grad_norm": 0.07752135396003723, |
|
"learning_rate": 6.049237983587339e-05, |
|
"loss": 0.0216, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.4007326007326006, |
|
"grad_norm": 0.10603225976228714, |
|
"learning_rate": 5.990621336459554e-05, |
|
"loss": 0.0205, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 1.4065934065934065, |
|
"grad_norm": 0.04343140870332718, |
|
"learning_rate": 5.932004689331771e-05, |
|
"loss": 0.0105, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.4065934065934065, |
|
"eval_loss": 0.015952473506331444, |
|
"eval_runtime": 4.7686, |
|
"eval_samples_per_second": 5.872, |
|
"eval_steps_per_second": 2.936, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.4124542124542123, |
|
"grad_norm": 0.12541887164115906, |
|
"learning_rate": 5.873388042203987e-05, |
|
"loss": 0.0217, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 1.4183150183150184, |
|
"grad_norm": 0.05417335778474808, |
|
"learning_rate": 5.814771395076202e-05, |
|
"loss": 0.0205, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.4241758241758242, |
|
"grad_norm": 0.04461506009101868, |
|
"learning_rate": 5.756154747948418e-05, |
|
"loss": 0.0211, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 1.43003663003663, |
|
"grad_norm": 0.09423286467790604, |
|
"learning_rate": 5.697538100820633e-05, |
|
"loss": 0.0238, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.435897435897436, |
|
"grad_norm": 0.050094570964574814, |
|
"learning_rate": 5.638921453692849e-05, |
|
"loss": 0.0163, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 1.4417582417582417, |
|
"grad_norm": 0.13104532659053802, |
|
"learning_rate": 5.580304806565064e-05, |
|
"loss": 0.0219, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.4476190476190476, |
|
"grad_norm": 0.08238503336906433, |
|
"learning_rate": 5.52168815943728e-05, |
|
"loss": 0.0126, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 1.4534798534798534, |
|
"grad_norm": 0.1029452383518219, |
|
"learning_rate": 5.463071512309497e-05, |
|
"loss": 0.0247, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.4593406593406593, |
|
"grad_norm": 0.05564792454242706, |
|
"learning_rate": 5.404454865181712e-05, |
|
"loss": 0.0212, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 1.4652014652014653, |
|
"grad_norm": 0.08589282631874084, |
|
"learning_rate": 5.345838218053928e-05, |
|
"loss": 0.0184, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.4710622710622712, |
|
"grad_norm": 0.15644195675849915, |
|
"learning_rate": 5.287221570926143e-05, |
|
"loss": 0.0165, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 1.476923076923077, |
|
"grad_norm": 0.11301274597644806, |
|
"learning_rate": 5.228604923798359e-05, |
|
"loss": 0.0322, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.4827838827838828, |
|
"grad_norm": 0.044553741812705994, |
|
"learning_rate": 5.1699882766705743e-05, |
|
"loss": 0.0183, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 1.4886446886446887, |
|
"grad_norm": 0.06141185760498047, |
|
"learning_rate": 5.11137162954279e-05, |
|
"loss": 0.0102, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.4945054945054945, |
|
"grad_norm": 0.08107537031173706, |
|
"learning_rate": 5.0527549824150055e-05, |
|
"loss": 0.0215, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 1.5003663003663004, |
|
"grad_norm": 0.06017793342471123, |
|
"learning_rate": 4.9941383352872214e-05, |
|
"loss": 0.0176, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.5062271062271062, |
|
"grad_norm": 0.051033902913331985, |
|
"learning_rate": 4.9355216881594373e-05, |
|
"loss": 0.0149, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 1.512087912087912, |
|
"grad_norm": 0.124452143907547, |
|
"learning_rate": 4.876905041031653e-05, |
|
"loss": 0.0209, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.5179487179487179, |
|
"grad_norm": 0.1616523712873459, |
|
"learning_rate": 4.8182883939038685e-05, |
|
"loss": 0.0192, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 1.5238095238095237, |
|
"grad_norm": 0.07067764550447464, |
|
"learning_rate": 4.759671746776085e-05, |
|
"loss": 0.0279, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.5238095238095237, |
|
"eval_loss": 0.017229218035936356, |
|
"eval_runtime": 4.6702, |
|
"eval_samples_per_second": 5.996, |
|
"eval_steps_per_second": 2.998, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.5296703296703296, |
|
"grad_norm": 0.06332267820835114, |
|
"learning_rate": 4.7010550996483003e-05, |
|
"loss": 0.0253, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 1.5355311355311354, |
|
"grad_norm": 0.08032066375017166, |
|
"learning_rate": 4.642438452520516e-05, |
|
"loss": 0.0128, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.5413919413919412, |
|
"grad_norm": 0.11456907540559769, |
|
"learning_rate": 4.5838218053927315e-05, |
|
"loss": 0.0127, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 1.5472527472527473, |
|
"grad_norm": 0.19653138518333435, |
|
"learning_rate": 4.5252051582649474e-05, |
|
"loss": 0.0236, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.5531135531135531, |
|
"grad_norm": 0.08195839822292328, |
|
"learning_rate": 4.4665885111371633e-05, |
|
"loss": 0.0173, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 1.558974358974359, |
|
"grad_norm": 0.11376089602708817, |
|
"learning_rate": 4.4079718640093786e-05, |
|
"loss": 0.0115, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.5648351648351648, |
|
"grad_norm": 0.055264201015233994, |
|
"learning_rate": 4.3493552168815945e-05, |
|
"loss": 0.0198, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 1.5706959706959707, |
|
"grad_norm": 0.13038881123065948, |
|
"learning_rate": 4.2907385697538104e-05, |
|
"loss": 0.0231, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.5765567765567765, |
|
"grad_norm": 0.0317939892411232, |
|
"learning_rate": 4.2321219226260263e-05, |
|
"loss": 0.0083, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 1.5824175824175826, |
|
"grad_norm": 0.151336207985878, |
|
"learning_rate": 4.1735052754982416e-05, |
|
"loss": 0.0225, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.5882783882783884, |
|
"grad_norm": 0.07817093282938004, |
|
"learning_rate": 4.1148886283704575e-05, |
|
"loss": 0.0226, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 1.5941391941391942, |
|
"grad_norm": 0.1341279149055481, |
|
"learning_rate": 4.056271981242673e-05, |
|
"loss": 0.0263, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.06353727728128433, |
|
"learning_rate": 3.997655334114889e-05, |
|
"loss": 0.0198, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 1.605860805860806, |
|
"grad_norm": 0.11177901178598404, |
|
"learning_rate": 3.9390386869871046e-05, |
|
"loss": 0.0172, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.6117216117216118, |
|
"grad_norm": 0.047024596482515335, |
|
"learning_rate": 3.88042203985932e-05, |
|
"loss": 0.0207, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 1.6175824175824176, |
|
"grad_norm": 0.04343528300523758, |
|
"learning_rate": 3.8218053927315364e-05, |
|
"loss": 0.0214, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.6234432234432234, |
|
"grad_norm": 0.08330193161964417, |
|
"learning_rate": 3.763188745603752e-05, |
|
"loss": 0.0286, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 1.6293040293040293, |
|
"grad_norm": 0.0811009556055069, |
|
"learning_rate": 3.7045720984759676e-05, |
|
"loss": 0.0148, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.6351648351648351, |
|
"grad_norm": 0.1049441322684288, |
|
"learning_rate": 3.645955451348183e-05, |
|
"loss": 0.0184, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 1.641025641025641, |
|
"grad_norm": 0.11944428086280823, |
|
"learning_rate": 3.587338804220399e-05, |
|
"loss": 0.0122, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.641025641025641, |
|
"eval_loss": 0.017561230808496475, |
|
"eval_runtime": 4.685, |
|
"eval_samples_per_second": 5.977, |
|
"eval_steps_per_second": 2.988, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.6468864468864468, |
|
"grad_norm": 0.14023366570472717, |
|
"learning_rate": 3.528722157092615e-05, |
|
"loss": 0.0178, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 1.6527472527472526, |
|
"grad_norm": 0.14057691395282745, |
|
"learning_rate": 3.47010550996483e-05, |
|
"loss": 0.0268, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.6586080586080585, |
|
"grad_norm": 0.1253061443567276, |
|
"learning_rate": 3.411488862837046e-05, |
|
"loss": 0.0266, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 1.6644688644688643, |
|
"grad_norm": 0.03431854769587517, |
|
"learning_rate": 3.352872215709262e-05, |
|
"loss": 0.02, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.6703296703296702, |
|
"grad_norm": 0.13929079473018646, |
|
"learning_rate": 3.294255568581478e-05, |
|
"loss": 0.0226, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 1.6761904761904762, |
|
"grad_norm": 0.06429693102836609, |
|
"learning_rate": 3.235638921453693e-05, |
|
"loss": 0.0225, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.682051282051282, |
|
"grad_norm": 0.029311953112483025, |
|
"learning_rate": 3.177022274325909e-05, |
|
"loss": 0.0161, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 1.687912087912088, |
|
"grad_norm": 0.04346455633640289, |
|
"learning_rate": 3.118405627198124e-05, |
|
"loss": 0.0155, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.6937728937728938, |
|
"grad_norm": 0.09009824693202972, |
|
"learning_rate": 3.05978898007034e-05, |
|
"loss": 0.0153, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 1.6996336996336996, |
|
"grad_norm": 0.071926549077034, |
|
"learning_rate": 3.0011723329425556e-05, |
|
"loss": 0.0136, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.7054945054945057, |
|
"grad_norm": 0.06461833417415619, |
|
"learning_rate": 2.9425556858147718e-05, |
|
"loss": 0.0237, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 1.7113553113553115, |
|
"grad_norm": 0.039929524064064026, |
|
"learning_rate": 2.8839390386869874e-05, |
|
"loss": 0.0187, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.7172161172161173, |
|
"grad_norm": 0.0534372515976429, |
|
"learning_rate": 2.825322391559203e-05, |
|
"loss": 0.0192, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 1.7230769230769232, |
|
"grad_norm": 0.1467376947402954, |
|
"learning_rate": 2.7667057444314186e-05, |
|
"loss": 0.0203, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.728937728937729, |
|
"grad_norm": 0.0830003172159195, |
|
"learning_rate": 2.7080890973036345e-05, |
|
"loss": 0.0188, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 1.7347985347985349, |
|
"grad_norm": 0.07220768928527832, |
|
"learning_rate": 2.64947245017585e-05, |
|
"loss": 0.0118, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.7406593406593407, |
|
"grad_norm": 0.0751115009188652, |
|
"learning_rate": 2.5908558030480656e-05, |
|
"loss": 0.0156, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 1.7465201465201465, |
|
"grad_norm": 0.07690921425819397, |
|
"learning_rate": 2.5322391559202812e-05, |
|
"loss": 0.0347, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.7523809523809524, |
|
"grad_norm": 0.05416159704327583, |
|
"learning_rate": 2.473622508792497e-05, |
|
"loss": 0.0167, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 1.7582417582417582, |
|
"grad_norm": 0.0676250010728836, |
|
"learning_rate": 2.4150058616647127e-05, |
|
"loss": 0.0205, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.7582417582417582, |
|
"eval_loss": 0.019522378221154213, |
|
"eval_runtime": 4.6942, |
|
"eval_samples_per_second": 5.965, |
|
"eval_steps_per_second": 2.982, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.764102564102564, |
|
"grad_norm": 0.08909754455089569, |
|
"learning_rate": 2.3563892145369286e-05, |
|
"loss": 0.0249, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 1.76996336996337, |
|
"grad_norm": 0.042161233723163605, |
|
"learning_rate": 2.2977725674091442e-05, |
|
"loss": 0.0117, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.7758241758241757, |
|
"grad_norm": 0.07136218249797821, |
|
"learning_rate": 2.23915592028136e-05, |
|
"loss": 0.0212, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 1.7816849816849816, |
|
"grad_norm": 0.14128735661506653, |
|
"learning_rate": 2.1805392731535757e-05, |
|
"loss": 0.0189, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.7875457875457874, |
|
"grad_norm": 0.05959760770201683, |
|
"learning_rate": 2.1219226260257916e-05, |
|
"loss": 0.0119, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 1.7934065934065933, |
|
"grad_norm": 0.038479190319776535, |
|
"learning_rate": 2.0633059788980072e-05, |
|
"loss": 0.013, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.7992673992673993, |
|
"grad_norm": 0.09512809664011002, |
|
"learning_rate": 2.0046893317702228e-05, |
|
"loss": 0.0148, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 1.8051282051282052, |
|
"grad_norm": 0.14848454296588898, |
|
"learning_rate": 1.9460726846424384e-05, |
|
"loss": 0.019, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.810989010989011, |
|
"grad_norm": 0.10240516811609268, |
|
"learning_rate": 1.8874560375146543e-05, |
|
"loss": 0.017, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 1.8168498168498168, |
|
"grad_norm": 0.09345954656600952, |
|
"learning_rate": 1.82883939038687e-05, |
|
"loss": 0.0237, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.8227106227106227, |
|
"grad_norm": 0.03815275430679321, |
|
"learning_rate": 1.7702227432590858e-05, |
|
"loss": 0.0188, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 1.8285714285714287, |
|
"grad_norm": 0.027827398851513863, |
|
"learning_rate": 1.7116060961313014e-05, |
|
"loss": 0.0183, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.8344322344322346, |
|
"grad_norm": 0.08811303228139877, |
|
"learning_rate": 1.6529894490035173e-05, |
|
"loss": 0.0191, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 1.8402930402930404, |
|
"grad_norm": 0.03119056299328804, |
|
"learning_rate": 1.594372801875733e-05, |
|
"loss": 0.0183, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.8461538461538463, |
|
"grad_norm": 0.09752997010946274, |
|
"learning_rate": 1.5357561547479485e-05, |
|
"loss": 0.0161, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 1.852014652014652, |
|
"grad_norm": 0.0855243131518364, |
|
"learning_rate": 1.477139507620164e-05, |
|
"loss": 0.015, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.857875457875458, |
|
"grad_norm": 0.08388842642307281, |
|
"learning_rate": 1.41852286049238e-05, |
|
"loss": 0.0148, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 1.8637362637362638, |
|
"grad_norm": 0.10147551447153091, |
|
"learning_rate": 1.3599062133645957e-05, |
|
"loss": 0.0154, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.8695970695970696, |
|
"grad_norm": 0.0457012839615345, |
|
"learning_rate": 1.3012895662368113e-05, |
|
"loss": 0.0186, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 1.8754578754578755, |
|
"grad_norm": 0.03654688224196434, |
|
"learning_rate": 1.242672919109027e-05, |
|
"loss": 0.0321, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.8754578754578755, |
|
"eval_loss": 0.017443044111132622, |
|
"eval_runtime": 4.7108, |
|
"eval_samples_per_second": 5.944, |
|
"eval_steps_per_second": 2.972, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.8813186813186813, |
|
"grad_norm": 0.07887323200702667, |
|
"learning_rate": 1.1840562719812428e-05, |
|
"loss": 0.0142, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 1.8871794871794871, |
|
"grad_norm": 0.11328335106372833, |
|
"learning_rate": 1.1254396248534585e-05, |
|
"loss": 0.025, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 1.893040293040293, |
|
"grad_norm": 0.09318089485168457, |
|
"learning_rate": 1.0668229777256741e-05, |
|
"loss": 0.0204, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 1.8989010989010988, |
|
"grad_norm": 0.06992164254188538, |
|
"learning_rate": 1.0082063305978899e-05, |
|
"loss": 0.0135, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.9047619047619047, |
|
"grad_norm": 0.14927181601524353, |
|
"learning_rate": 9.495896834701056e-06, |
|
"loss": 0.0245, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 1.9106227106227105, |
|
"grad_norm": 0.11408836394548416, |
|
"learning_rate": 8.909730363423214e-06, |
|
"loss": 0.0161, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 1.9164835164835163, |
|
"grad_norm": 0.06911155581474304, |
|
"learning_rate": 8.32356389214537e-06, |
|
"loss": 0.0154, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 1.9223443223443224, |
|
"grad_norm": 0.11621779948472977, |
|
"learning_rate": 7.737397420867527e-06, |
|
"loss": 0.0144, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 1.9282051282051282, |
|
"grad_norm": 0.046058397740125656, |
|
"learning_rate": 7.151230949589684e-06, |
|
"loss": 0.0093, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 1.934065934065934, |
|
"grad_norm": 0.11228576302528381, |
|
"learning_rate": 6.565064478311841e-06, |
|
"loss": 0.022, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.93992673992674, |
|
"grad_norm": 0.1315338909626007, |
|
"learning_rate": 5.978898007033998e-06, |
|
"loss": 0.0193, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 1.9457875457875458, |
|
"grad_norm": 0.040056392550468445, |
|
"learning_rate": 5.3927315357561546e-06, |
|
"loss": 0.0132, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 1.9516483516483516, |
|
"grad_norm": 0.10741738229990005, |
|
"learning_rate": 4.806565064478312e-06, |
|
"loss": 0.0352, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 1.9575091575091577, |
|
"grad_norm": 0.059029560536146164, |
|
"learning_rate": 4.220398593200469e-06, |
|
"loss": 0.019, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 1.9633699633699635, |
|
"grad_norm": 0.06789711117744446, |
|
"learning_rate": 3.6342321219226262e-06, |
|
"loss": 0.0189, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 1.9692307692307693, |
|
"grad_norm": 0.02918117679655552, |
|
"learning_rate": 3.0480656506447833e-06, |
|
"loss": 0.0259, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 1.9750915750915752, |
|
"grad_norm": 0.08073403686285019, |
|
"learning_rate": 2.4618991793669404e-06, |
|
"loss": 0.0286, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 1.980952380952381, |
|
"grad_norm": 0.1617717295885086, |
|
"learning_rate": 1.8757327080890972e-06, |
|
"loss": 0.0191, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 1.9868131868131869, |
|
"grad_norm": 0.06613462418317795, |
|
"learning_rate": 1.2895662368112545e-06, |
|
"loss": 0.0128, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 1.9926739926739927, |
|
"grad_norm": 0.08398256450891495, |
|
"learning_rate": 7.033997655334116e-07, |
|
"loss": 0.0118, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.9926739926739927, |
|
"eval_loss": 0.018166696652770042, |
|
"eval_runtime": 4.6959, |
|
"eval_samples_per_second": 5.963, |
|
"eval_steps_per_second": 2.981, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.9985347985347985, |
|
"grad_norm": 0.0773661658167839, |
|
"learning_rate": 1.1723329425556858e-07, |
|
"loss": 0.0146, |
|
"step": 1705 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1706, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.764499308335456e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|