{
  "best_metric": 3.8748562335968018,
  "best_model_checkpoint": "/kaggle/working/checkpoint-669",
  "epoch": 50.0,
  "eval_steps": 500,
  "global_step": 11150,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.0,
      "eval_loss": 3.875725507736206,
      "eval_mean_perplexity": 366.3703887939453,
      "eval_perplexities": [
        335.80645751953125,
        377.47796630859375,
        475.9695129394531,
        276.7535705566406,
        294.2838134765625,
        264.73394775390625,
        394.9538269042969,
        325.2862854003906,
        525.1426391601562,
        393.2958679199219
      ],
      "eval_runtime": 2.3638,
      "eval_samples_per_second": 4.231,
      "eval_steps_per_second": 0.846,
      "step": 223
    },
    {
      "epoch": 2.0,
      "eval_loss": 3.877554416656494,
      "eval_mean_perplexity": 373.7104461669922,
      "eval_perplexities": [
        356.83929443359375,
        425.78094482421875,
        495.7661437988281,
        287.1143798828125,
        270.86737060546875,
        256.76934814453125,
        391.27044677734375,
        345.6097412109375,
        498.4945373535156,
        408.5922546386719
      ],
      "eval_runtime": 2.2583,
      "eval_samples_per_second": 4.428,
      "eval_steps_per_second": 0.886,
      "step": 446
    },
    {
      "epoch": 3.0,
      "eval_loss": 3.8748562335968018,
      "eval_mean_perplexity": 378.68709869384764,
      "eval_perplexities": [
        404.7926940917969,
        400.53558349609375,
        487.69427490234375,
        229.20298767089844,
        278.294189453125,
        259.6055908203125,
        387.93963623046875,
        365.1226501464844,
        508.79669189453125,
        464.8866882324219
      ],
      "eval_runtime": 2.4538,
      "eval_samples_per_second": 4.075,
      "eval_steps_per_second": 0.815,
      "step": 669
    },
    {
      "epoch": 4.0,
      "eval_loss": 3.889125108718872,
      "eval_mean_perplexity": 395.6389587402344,
      "eval_perplexities": [
        386.92559814453125,
        444.32073974609375,
        480.409912109375,
        278.6746520996094,
        279.8287353515625,
        286.8155212402344,
        426.2447204589844,
        352.7798767089844,
        545.115234375,
        475.27459716796875
      ],
      "eval_runtime": 2.2494,
      "eval_samples_per_second": 4.446,
      "eval_steps_per_second": 0.889,
      "step": 892
    },
    {
      "epoch": 5.0,
      "eval_loss": 3.9018893241882324,
      "eval_mean_perplexity": 418.4004302978516,
      "eval_perplexities": [
        426.7615051269531,
        474.9723815917969,
        516.2012329101562,
        285.6946105957031,
        314.9617919921875,
        305.4250793457031,
        411.5309143066406,
        412.8928527832031,
        537.16943359375,
        498.3945007324219
      ],
      "eval_runtime": 2.2536,
      "eval_samples_per_second": 4.437,
      "eval_steps_per_second": 0.887,
      "step": 1115
    },
    {
      "epoch": 6.0,
      "eval_loss": 3.9105305671691895,
      "eval_mean_perplexity": 423.3804229736328,
      "eval_perplexities": [
        421.5846862792969,
        457.750244140625,
        521.7881469726562,
        293.4595642089844,
        283.9613037109375,
        287.46807861328125,
        451.5904846191406,
        458.28509521484375,
        608.7252197265625,
        449.19140625
      ],
      "eval_runtime": 2.261,
      "eval_samples_per_second": 4.423,
      "eval_steps_per_second": 0.885,
      "step": 1338
    },
    {
      "epoch": 7.0,
      "eval_loss": 3.910905122756958,
      "eval_mean_perplexity": 437.6560882568359,
      "eval_perplexities": [
        459.6704406738281,
        471.1756591796875,
        508.8789367675781,
        278.22161865234375,
        305.4996337890625,
        335.0756530761719,
        449.5406494140625,
        402.190673828125,
        618.786376953125,
        547.521240234375
      ],
      "eval_runtime": 2.4541,
      "eval_samples_per_second": 4.075,
      "eval_steps_per_second": 0.815,
      "step": 1561
    },
    {
      "epoch": 8.0,
      "eval_loss": 3.9314892292022705,
      "eval_mean_perplexity": 451.49346618652345,
      "eval_perplexities": [
        432.6178283691406,
        494.5645446777344,
        518.4781494140625,
        289.39727783203125,
        317.63031005859375,
        327.0579528808594,
        491.4974670410156,
        445.9523620605469,
        667.2817993164062,
        530.4569702148438
      ],
      "eval_runtime": 3.1442,
      "eval_samples_per_second": 3.18,
      "eval_steps_per_second": 0.636,
      "step": 1784
    },
    {
      "epoch": 9.0,
      "eval_loss": 3.9446353912353516,
      "eval_mean_perplexity": 451.08067626953124,
      "eval_perplexities": [
        452.6179504394531,
        445.2970275878906,
        576.3162841796875,
        262.3009033203125,
        326.6243591308594,
        311.2583923339844,
        440.389404296875,
        448.5811767578125,
        677.4688110351562,
        569.9524536132812
      ],
      "eval_runtime": 2.2555,
      "eval_samples_per_second": 4.434,
      "eval_steps_per_second": 0.887,
      "step": 2007
    },
    {
      "epoch": 10.0,
      "eval_loss": 3.979356288909912,
      "eval_mean_perplexity": 466.0081390380859,
      "eval_perplexities": [
        479.5843505859375,
        542.103759765625,
        561.8331298828125,
        283.5645751953125,
        342.3445739746094,
        315.0624694824219,
        494.7655334472656,
        397.63812255859375,
        683.822265625,
        559.3626098632812
      ],
      "eval_runtime": 2.255,
      "eval_samples_per_second": 4.435,
      "eval_steps_per_second": 0.887,
      "step": 2230
    },
    {
      "epoch": 11.0,
      "eval_loss": 3.985069751739502,
      "eval_mean_perplexity": 471.23480529785155,
      "eval_perplexities": [
        442.65118408203125,
        578.3666381835938,
        530.2559204101562,
        292.97174072265625,
        325.9568786621094,
        331.6105651855469,
        466.1777648925781,
        453.5052490234375,
        693.37646484375,
        597.4756469726562
      ],
      "eval_runtime": 2.2568,
      "eval_samples_per_second": 4.431,
      "eval_steps_per_second": 0.886,
      "step": 2453
    },
    {
      "epoch": 12.0,
      "eval_loss": 4.011897087097168,
      "eval_mean_perplexity": 481.7092010498047,
      "eval_perplexities": [
        490.259765625,
        590.09716796875,
        524.1170043945312,
        292.7325134277344,
        362.09210205078125,
        320.8348388671875,
        483.432861328125,
        492.00469970703125,
        676.3094482421875,
        585.2116088867188
      ],
      "eval_runtime": 2.2714,
      "eval_samples_per_second": 4.402,
      "eval_steps_per_second": 0.88,
      "step": 2676
    },
    {
      "epoch": 13.0,
      "eval_loss": 4.028537750244141,
      "eval_mean_perplexity": 491.83470153808594,
      "eval_perplexities": [
        434.7430725097656,
        579.150390625,
        531.6643676757812,
        323.02447509765625,
        381.5349426269531,
        329.8095397949219,
        569.4808349609375,
        454.2198486328125,
        723.43798828125,
        591.2815551757812
      ],
      "eval_runtime": 2.268,
      "eval_samples_per_second": 4.409,
      "eval_steps_per_second": 0.882,
      "step": 2899
    },
    {
      "epoch": 14.0,
      "eval_loss": 4.046214580535889,
      "eval_mean_perplexity": 488.7431976318359,
      "eval_perplexities": [
        465.3315734863281,
        577.7365112304688,
        532.9083251953125,
        298.2411193847656,
        371.57415771484375,
        335.2172546386719,
        547.2592163085938,
        480.532470703125,
        732.7112426757812,
        545.9201049804688
      ],
      "eval_runtime": 2.2446,
      "eval_samples_per_second": 4.455,
      "eval_steps_per_second": 0.891,
      "step": 3122
    },
    {
      "epoch": 15.0,
      "eval_loss": 4.064830780029297,
      "eval_mean_perplexity": 493.6821044921875,
      "eval_perplexities": [
        508.1759948730469,
        588.5740356445312,
        494.7876892089844,
        310.3844299316406,
        357.8840026855469,
        349.0634460449219,
        574.1718139648438,
        489.5400085449219,
        730.5029907226562,
        533.7366333007812
      ],
      "eval_runtime": 2.2496,
      "eval_samples_per_second": 4.445,
      "eval_steps_per_second": 0.889,
      "step": 3345
    },
    {
      "epoch": 16.0,
      "eval_loss": 4.0865068435668945,
      "eval_mean_perplexity": 527.1822723388672,
      "eval_perplexities": [
        516.6371459960938,
        603.1266479492188,
        558.2211303710938,
        320.877685546875,
        440.86993408203125,
        384.0972595214844,
        586.449951171875,
        441.53643798828125,
        805.31201171875,
        614.6945190429688
      ],
      "eval_runtime": 2.2935,
      "eval_samples_per_second": 4.36,
      "eval_steps_per_second": 0.872,
      "step": 3568
    },
    {
      "epoch": 17.0,
      "eval_loss": 4.112179756164551,
      "eval_mean_perplexity": 517.5738647460937,
      "eval_perplexities": [
        484.16485595703125,
        623.9596557617188,
        544.3837890625,
        327.7882995605469,
        432.2279052734375,
        325.2957458496094,
        587.2503051757812,
        500.35589599609375,
        768.452392578125,
        581.8598022460938
      ],
      "eval_runtime": 2.6951,
      "eval_samples_per_second": 3.71,
      "eval_steps_per_second": 0.742,
      "step": 3791
    },
    {
      "epoch": 18.0,
      "eval_loss": 4.133630275726318,
      "eval_mean_perplexity": 532.7662231445313,
      "eval_perplexities": [
        459.6676025390625,
        644.7689819335938,
        559.6870727539062,
        313.52886962890625,
        407.5067443847656,
        358.4698486328125,
        635.401123046875,
        511.8723449707031,
        728.2614135742188,
        708.4982299804688
      ],
      "eval_runtime": 2.2702,
      "eval_samples_per_second": 4.405,
      "eval_steps_per_second": 0.881,
      "step": 4014
    },
    {
      "epoch": 19.0,
      "eval_loss": 4.14639949798584,
      "eval_mean_perplexity": 546.8367309570312,
      "eval_perplexities": [
        513.874267578125,
        633.9168090820312,
        623.620849609375,
        317.8645324707031,
        417.3320007324219,
        351.769287109375,
        557.8528442382812,
        530.8079223632812,
        834.164306640625,
        687.1644897460938
      ],
      "eval_runtime": 2.28,
      "eval_samples_per_second": 4.386,
      "eval_steps_per_second": 0.877,
      "step": 4237
    },
    {
      "epoch": 20.0,
      "eval_loss": 4.1683173179626465,
      "eval_mean_perplexity": 530.1545104980469,
      "eval_perplexities": [
        518.4563598632812,
        616.3810424804688,
        584.39404296875,
        357.87530517578125,
        424.8235778808594,
        363.0649719238281,
        572.78369140625,
        489.98748779296875,
        762.8850708007812,
        610.8935546875
      ],
      "eval_runtime": 2.5117,
      "eval_samples_per_second": 3.981,
      "eval_steps_per_second": 0.796,
      "step": 4460
    },
    {
      "epoch": 21.0,
      "eval_loss": 4.197434425354004,
      "eval_mean_perplexity": 552.9715087890625,
      "eval_perplexities": [
        496.9158630371094,
        622.5189208984375,
        620.3365478515625,
        315.29119873046875,
        508.8257751464844,
        386.271728515625,
        597.5241088867188,
        540.0391845703125,
        771.0007934570312,
        670.990966796875
      ],
      "eval_runtime": 2.2574,
      "eval_samples_per_second": 4.43,
      "eval_steps_per_second": 0.886,
      "step": 4683
    },
    {
      "epoch": 22.0,
      "eval_loss": 4.2334303855896,
      "eval_mean_perplexity": 591.004769897461,
      "eval_perplexities": [
        542.1947631835938,
        699.9281616210938,
        655.9802856445312,
        351.9259948730469,
        517.2227783203125,
        364.0205078125,
        738.9130859375,
        537.7424926757812,
        836.2189331054688,
        665.9006958007812
      ],
      "eval_runtime": 2.2478,
      "eval_samples_per_second": 4.449,
      "eval_steps_per_second": 0.89,
      "step": 4906
    },
    {
      "epoch": 22.42,
      "grad_norm": 1.9367769956588745,
      "learning_rate": 2.7578475336322873e-05,
      "loss": 3.112,
      "step": 5000
    },
    {
      "epoch": 23.0,
      "eval_loss": 4.230788707733154,
      "eval_mean_perplexity": 571.9811889648438,
      "eval_perplexities": [
        529.1312255859375,
        677.629638671875,
        639.4375,
        351.9476623535156,
        471.0734558105469,
        388.02935791015625,
        630.8019409179688,
        550.748046875,
        854.4290771484375,
        626.583984375
      ],
      "eval_runtime": 2.3322,
      "eval_samples_per_second": 4.288,
      "eval_steps_per_second": 0.858,
      "step": 5129
    },
    {
      "epoch": 24.0,
      "eval_loss": 4.264792442321777,
      "eval_mean_perplexity": 588.0643798828125,
      "eval_perplexities": [
        532.3706665039062,
        650.5995483398438,
        640.096435546875,
        367.4383850097656,
        510.7347106933594,
        402.79925537109375,
        740.1875610351562,
        580.5010375976562,
        837.0123901367188,
        618.90380859375
      ],
      "eval_runtime": 2.414,
      "eval_samples_per_second": 4.142,
      "eval_steps_per_second": 0.828,
      "step": 5352
    },
    {
      "epoch": 25.0,
      "eval_loss": 4.280625820159912,
      "eval_mean_perplexity": 568.3704132080078,
      "eval_perplexities": [
        493.6273193359375,
        687.0193481445312,
        617.8013305664062,
        356.9241943359375,
        462.1817321777344,
        377.38800048828125,
        662.7510375976562,
        553.1047973632812,
        769.9375,
        702.9688720703125
      ],
      "eval_runtime": 2.2617,
      "eval_samples_per_second": 4.421,
      "eval_steps_per_second": 0.884,
      "step": 5575
    },
    {
      "epoch": 26.0,
      "eval_loss": 4.299654006958008,
      "eval_mean_perplexity": 585.3176055908203,
      "eval_perplexities": [
        507.7140808105469,
        719.7584228515625,
        610.5079956054688,
        355.23577880859375,
        474.7140197753906,
        343.5024719238281,
        644.6328125,
        599.3197631835938,
        868.6974487304688,
        729.09326171875
      ],
      "eval_runtime": 2.2752,
      "eval_samples_per_second": 4.395,
      "eval_steps_per_second": 0.879,
      "step": 5798
    },
    {
      "epoch": 27.0,
      "eval_loss": 4.309383869171143,
      "eval_mean_perplexity": 572.4837829589844,
      "eval_perplexities": [
        532.3777465820312,
        701.5357055664062,
        574.2848510742188,
        366.85614013671875,
        481.5206298828125,
        389.685546875,
        579.2503662109375,
        525.9729614257812,
        906.0205078125,
        667.3333740234375
      ],
      "eval_runtime": 2.4436,
      "eval_samples_per_second": 4.092,
      "eval_steps_per_second": 0.818,
      "step": 6021
    },
    {
      "epoch": 28.0,
      "eval_loss": 4.338665962219238,
      "eval_mean_perplexity": 617.5483917236328,
      "eval_perplexities": [
        530.51416015625,
        770.9551391601562,
        616.0216674804688,
        386.1516418457031,
        501.05426025390625,
        419.87841796875,
        735.8825073242188,
        594.3335571289062,
        911.9882202148438,
        708.704345703125
      ],
      "eval_runtime": 2.2597,
      "eval_samples_per_second": 4.425,
      "eval_steps_per_second": 0.885,
      "step": 6244
    },
    {
      "epoch": 29.0,
      "eval_loss": 4.354172229766846,
      "eval_mean_perplexity": 609.77958984375,
      "eval_perplexities": [
        571.1878662109375,
        744.3867797851562,
        607.4262084960938,
        374.7521057128906,
        501.3298034667969,
        401.453369140625,
        706.0652465820312,
        600.0166625976562,
        898.824951171875,
        692.3529052734375
      ],
      "eval_runtime": 2.4521,
      "eval_samples_per_second": 4.078,
      "eval_steps_per_second": 0.816,
      "step": 6467
    },
    {
      "epoch": 30.0,
      "eval_loss": 4.371840000152588,
      "eval_mean_perplexity": 639.5644592285156,
      "eval_perplexities": [
        518.4427490234375,
        768.5755615234375,
        643.536376953125,
        434.2029724121094,
        553.5997924804688,
        401.9341735839844,
        738.0169677734375,
        627.0206909179688,
        923.0927734375,
        787.2225341796875
      ],
      "eval_runtime": 2.2759,
      "eval_samples_per_second": 4.394,
      "eval_steps_per_second": 0.879,
      "step": 6690
    },
    {
      "epoch": 31.0,
      "eval_loss": 4.397173881530762,
      "eval_mean_perplexity": 666.7775909423829,
      "eval_perplexities": [
        549.1520385742188,
        851.6005249023438,
        712.2999267578125,
        395.8175048828125,
        540.697021484375,
        451.8854064941406,
        723.3524780273438,
        630.9887084960938,
        1021.4713745117188,
        790.5109252929688
      ],
      "eval_runtime": 2.2555,
      "eval_samples_per_second": 4.434,
      "eval_steps_per_second": 0.887,
      "step": 6913
    },
    {
      "epoch": 32.0,
      "eval_loss": 4.415882110595703,
      "eval_mean_perplexity": 660.8353820800781,
      "eval_perplexities": [
        512.7852783203125,
        785.50439453125,
        668.9815063476562,
        457.69830322265625,
        539.0671997070312,
        459.60736083984375,
        751.9481201171875,
        625.0054321289062,
        999.4456787109375,
        808.310546875
      ],
      "eval_runtime": 2.2544,
      "eval_samples_per_second": 4.436,
      "eval_steps_per_second": 0.887,
      "step": 7136
    },
    {
      "epoch": 33.0,
      "eval_loss": 4.42371940612793,
      "eval_mean_perplexity": 662.7573822021484,
      "eval_perplexities": [
        559.4247436523438,
        747.0921020507812,
        678.9531860351562,
        437.0193786621094,
        544.0770874023438,
        431.26568603515625,
        756.7093505859375,
        626.279052734375,
        1022.3748168945312,
        824.37841796875
      ],
      "eval_runtime": 2.2901,
      "eval_samples_per_second": 4.367,
      "eval_steps_per_second": 0.873,
      "step": 7359
    },
    {
      "epoch": 34.0,
      "eval_loss": 4.434357166290283,
      "eval_mean_perplexity": 650.7052612304688,
      "eval_perplexities": [
        549.6817626953125,
        811.8685302734375,
        639.683349609375,
        410.87249755859375,
        510.8334045410156,
        438.6333923339844,
        779.5153198242188,
        584.6727294921875,
        1027.2081298828125,
        754.08349609375
      ],
      "eval_runtime": 2.3566,
      "eval_samples_per_second": 4.243,
      "eval_steps_per_second": 0.849,
      "step": 7582
    },
    {
      "epoch": 35.0,
      "eval_loss": 4.456236839294434,
      "eval_mean_perplexity": 683.213168334961,
      "eval_perplexities": [
        584.2586059570312,
        837.6990966796875,
        639.265869140625,
        431.7611389160156,
        558.5822143554688,
        446.50067138671875,
        781.8605346679688,
        649.52392578125,
        1031.22900390625,
        871.4506225585938
      ],
      "eval_runtime": 2.2758,
      "eval_samples_per_second": 4.394,
      "eval_steps_per_second": 0.879,
      "step": 7805
    },
    {
      "epoch": 36.0,
      "eval_loss": 4.476213455200195,
      "eval_mean_perplexity": 673.9735595703125,
      "eval_perplexities": [
        570.093505859375,
        876.3085327148438,
        637.6167602539062,
        410.8495788574219,
        565.6197509765625,
        431.1340637207031,
        784.1497192382812,
        635.3842163085938,
        1020.8198852539062,
        807.7595825195312
      ],
      "eval_runtime": 2.2666,
      "eval_samples_per_second": 4.412,
      "eval_steps_per_second": 0.882,
      "step": 8028
    },
    {
      "epoch": 37.0,
      "eval_loss": 4.485024452209473,
      "eval_mean_perplexity": 678.1103912353516,
      "eval_perplexities": [
        524.7376708984375,
        897.5795288085938,
        604.8717651367188,
        421.68621826171875,
        562.4842529296875,
        447.8833312988281,
        743.5353393554688,
        641.0125122070312,
        1038.24658203125,
        899.0667114257812
      ],
      "eval_runtime": 2.2714,
      "eval_samples_per_second": 4.403,
      "eval_steps_per_second": 0.881,
      "step": 8251
    },
    {
      "epoch": 38.0,
      "eval_loss": 4.500662326812744,
      "eval_mean_perplexity": 667.2204986572266,
      "eval_perplexities": [
        517.3828125,
        849.9619140625,
        632.2133178710938,
        421.2346496582031,
        562.2227783203125,
        412.367919921875,
        801.9597778320312,
        634.914794921875,
        955.3936767578125,
        884.5533447265625
      ],
      "eval_runtime": 2.2586,
      "eval_samples_per_second": 4.428,
      "eval_steps_per_second": 0.886,
      "step": 8474
    },
    {
      "epoch": 39.0,
      "eval_loss": 4.507510662078857,
      "eval_mean_perplexity": 674.6702362060547,
      "eval_perplexities": [
        523.6810913085938,
        871.532958984375,
        702.7888793945312,
        412.113525390625,
        559.32421875,
        436.3467712402344,
        741.5978393554688,
        612.4104614257812,
        969.3262329101562,
        917.5803833007812
      ],
      "eval_runtime": 2.2926,
      "eval_samples_per_second": 4.362,
      "eval_steps_per_second": 0.872,
      "step": 8697
    },
    {
      "epoch": 40.0,
      "eval_loss": 4.5184006690979,
      "eval_mean_perplexity": 690.1411804199219,
      "eval_perplexities": [
        533.463623046875,
        880.5369262695312,
        677.1242065429688,
        468.8255920410156,
        524.9594116210938,
        481.3450012207031,
        724.6446533203125,
        644.5559692382812,
        1083.451416015625,
        882.5050048828125
      ],
      "eval_runtime": 2.2507,
      "eval_samples_per_second": 4.443,
      "eval_steps_per_second": 0.889,
      "step": 8920
    },
    {
      "epoch": 41.0,
      "eval_loss": 4.528339385986328,
      "eval_mean_perplexity": 686.3480163574219,
      "eval_perplexities": [
        540.8236083984375,
        941.376708984375,
        641.9276733398438,
        446.772216796875,
        562.5877685546875,
        455.73175048828125,
        764.4960327148438,
        647.960205078125,
        983.826416015625,
        877.977783203125
      ],
      "eval_runtime": 2.2901,
      "eval_samples_per_second": 4.367,
      "eval_steps_per_second": 0.873,
      "step": 9143
    },
    {
      "epoch": 42.0,
      "eval_loss": 4.525776386260986,
      "eval_mean_perplexity": 692.94345703125,
      "eval_perplexities": [
        512.2537841796875,
        996.9277954101562,
        626.796142578125,
        407.5407409667969,
        572.6943969726562,
        439.5016174316406,
        776.8684692382812,
        660.894775390625,
        1015.8809204101562,
        920.075927734375
      ],
      "eval_runtime": 2.2958,
      "eval_samples_per_second": 4.356,
      "eval_steps_per_second": 0.871,
      "step": 9366
    },
    {
      "epoch": 43.0,
      "eval_loss": 4.5432844161987305,
      "eval_mean_perplexity": 698.305697631836,
      "eval_perplexities": [
        520.623779296875,
        898.4200439453125,
        635.4287719726562,
        451.1359558105469,
        590.0744018554688,
        448.103759765625,
        752.5787353515625,
        682.3783569335938,
        1070.27294921875,
        934.0402221679688
      ],
      "eval_runtime": 2.8416,
      "eval_samples_per_second": 3.519,
      "eval_steps_per_second": 0.704,
      "step": 9589
    },
    {
      "epoch": 44.0,
      "eval_loss": 4.546627998352051,
      "eval_mean_perplexity": 698.2709533691407,
      "eval_perplexities": [
        496.5508728027344,
        900.9185180664062,
        651.6212768554688,
        429.4538879394531,
        572.0066528320312,
        429.689208984375,
        786.3671264648438,
        686.788818359375,
        1078.3935546875,
        950.9196166992188
      ],
      "eval_runtime": 2.2997,
      "eval_samples_per_second": 4.348,
      "eval_steps_per_second": 0.87,
      "step": 9812
    },
    {
      "epoch": 44.84,
      "grad_norm": 2.2081024646759033,
      "learning_rate": 5.15695067264574e-06,
      "loss": 2.4732,
      "step": 10000
    },
    {
      "epoch": 45.0,
      "eval_loss": 4.5482940673828125,
      "eval_mean_perplexity": 683.0953948974609,
      "eval_perplexities": [
        513.7635498046875,
        881.8916625976562,
        650.1325073242188,
        426.90740966796875,
        583.7874145507812,
        441.8877258300781,
        793.0048828125,
        660.0620727539062,
        992.5838012695312,
        886.9329223632812
      ],
      "eval_runtime": 2.6736,
      "eval_samples_per_second": 3.74,
      "eval_steps_per_second": 0.748,
      "step": 10035
    },
    {
      "epoch": 46.0,
      "eval_loss": 4.556254863739014,
      "eval_mean_perplexity": 694.8246520996094,
      "eval_perplexities": [
        484.3752136230469,
        864.9951171875,
        721.4644775390625,
        440.7280578613281,
        583.1522216796875,
        458.59246826171875,
        795.0460815429688,
        658.3065795898438,
        1043.74267578125,
        897.8436279296875
      ],
      "eval_runtime": 2.6592,
      "eval_samples_per_second": 3.761,
      "eval_steps_per_second": 0.752,
      "step": 10258
    },
    {
      "epoch": 47.0,
      "eval_loss": 4.565934181213379,
      "eval_mean_perplexity": 697.1294586181641,
      "eval_perplexities": [
        521.9780883789062,
        891.4306030273438,
        694.7803344726562,
        465.2270812988281,
        585.0350341796875,
        454.22308349609375,
        784.2135620117188,
        672.1557006835938,
        1016.0956420898438,
        886.1554565429688
      ],
      "eval_runtime": 2.3829,
      "eval_samples_per_second": 4.197,
      "eval_steps_per_second": 0.839,
      "step": 10481
    },
    {
      "epoch": 48.0,
      "eval_loss": 4.569299221038818,
      "eval_mean_perplexity": 698.8598907470703,
      "eval_perplexities": [
        523.372802734375,
        930.4688720703125,
        683.875732421875,
        451.5999755859375,
        597.7974243164062,
        447.0304870605469,
        766.9913940429688,
        671.3782348632812,
        1039.4984130859375,
        876.5855712890625
      ],
      "eval_runtime": 2.2797,
      "eval_samples_per_second": 4.387,
      "eval_steps_per_second": 0.877,
      "step": 10704
    },
    {
      "epoch": 49.0,
      "eval_loss": 4.569689750671387,
      "eval_mean_perplexity": 694.5519165039062,
      "eval_perplexities": [
        508.9573059082031,
        927.446533203125,
        672.3489379882812,
        473.5368347167969,
        568.0377807617188,
        454.00933837890625,
        733.6597900390625,
        695.4989624023438,
        1047.673828125,
        864.349853515625
      ],
      "eval_runtime": 2.3973,
      "eval_samples_per_second": 4.171,
      "eval_steps_per_second": 0.834,
      "step": 10927
    },
    {
      "epoch": 50.0,
      "eval_loss": 4.569555759429932,
      "eval_mean_perplexity": 698.6035186767579,
      "eval_perplexities": [
        517.4149169921875,
        924.535888671875,
        704.73291015625,
        465.9677429199219,
        577.629150390625,
        443.994140625,
        770.1861572265625,
        683.028076171875,
        1017.7510375976562,
        880.795166015625
      ],
      "eval_runtime": 2.2771,
      "eval_samples_per_second": 4.391,
      "eval_steps_per_second": 0.878,
      "step": 11150
    }
  ],
  "logging_steps": 5000,
  "max_steps": 11150,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 50,
  "save_steps": 500,
  "total_flos": 1.1355181056e+16,
  "train_batch_size": 10,
  "trial_name": null,
  "trial_params": null
}