| { |
| "best_global_step": 4000, |
| "best_metric": 0.19092191755771637, |
| "best_model_checkpoint": "/home/flytekit/n0w0f/data/mattext_ckpt/results/2026-02-05/18-01-14/pretrain/checkpoints/robocrys_rep_test-pretrain/checkpoint-4000", |
| "epoch": 8.602150537634408, |
| "eval_steps": 50, |
| "global_step": 4000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.10752688172043011, |
| "grad_norm": 1.1888866424560547, |
| "learning_rate": 0.00019957849462365592, |
| "loss": 5.97920654296875, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.10752688172043011, |
| "eval_loss": 4.124914646148682, |
| "eval_runtime": 60.5178, |
| "eval_samples_per_second": 314.023, |
| "eval_steps_per_second": 39.261, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.21505376344086022, |
| "grad_norm": 0.9824994802474976, |
| "learning_rate": 0.00019914838709677422, |
| "loss": 3.916483154296875, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.21505376344086022, |
| "eval_loss": 3.675534248352051, |
| "eval_runtime": 61.1234, |
| "eval_samples_per_second": 310.912, |
| "eval_steps_per_second": 38.872, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.3225806451612903, |
| "grad_norm": 0.867065966129303, |
| "learning_rate": 0.00019871827956989248, |
| "loss": 3.620672302246094, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.3225806451612903, |
| "eval_loss": 3.4746599197387695, |
| "eval_runtime": 61.4793, |
| "eval_samples_per_second": 309.112, |
| "eval_steps_per_second": 38.647, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.43010752688172044, |
| "grad_norm": 1.192267894744873, |
| "learning_rate": 0.00019828817204301075, |
| "loss": 3.471976013183594, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.43010752688172044, |
| "eval_loss": 3.353644371032715, |
| "eval_runtime": 60.5187, |
| "eval_samples_per_second": 314.019, |
| "eval_steps_per_second": 39.261, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.5376344086021505, |
| "grad_norm": 1.0798981189727783, |
| "learning_rate": 0.00019785806451612904, |
| "loss": 3.360224609375, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.5376344086021505, |
| "eval_loss": 3.247636079788208, |
| "eval_runtime": 61.527, |
| "eval_samples_per_second": 308.873, |
| "eval_steps_per_second": 38.617, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.6451612903225806, |
| "grad_norm": 1.3051457405090332, |
| "learning_rate": 0.00019742795698924733, |
| "loss": 3.262052307128906, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.6451612903225806, |
| "eval_loss": 3.1502654552459717, |
| "eval_runtime": 60.999, |
| "eval_samples_per_second": 311.546, |
| "eval_steps_per_second": 38.951, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.7526881720430108, |
| "grad_norm": 1.1396135091781616, |
| "learning_rate": 0.0001969978494623656, |
| "loss": 3.225200500488281, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.7526881720430108, |
| "eval_loss": 3.094292163848877, |
| "eval_runtime": 61.381, |
| "eval_samples_per_second": 309.607, |
| "eval_steps_per_second": 38.709, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.8602150537634409, |
| "grad_norm": 1.0816289186477661, |
| "learning_rate": 0.0001965677419354839, |
| "loss": 3.1344537353515625, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.8602150537634409, |
| "eval_loss": 3.0037944316864014, |
| "eval_runtime": 61.1417, |
| "eval_samples_per_second": 310.819, |
| "eval_steps_per_second": 38.861, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.967741935483871, |
| "grad_norm": 1.220457673072815, |
| "learning_rate": 0.00019613763440860216, |
| "loss": 3.024658203125, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.967741935483871, |
| "eval_loss": 2.9253640174865723, |
| "eval_runtime": 61.6823, |
| "eval_samples_per_second": 308.095, |
| "eval_steps_per_second": 38.52, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.075268817204301, |
| "grad_norm": 1.18031644821167, |
| "learning_rate": 0.00019570752688172045, |
| "loss": 2.9539215087890627, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.075268817204301, |
| "eval_loss": 2.827315092086792, |
| "eval_runtime": 64.027, |
| "eval_samples_per_second": 296.812, |
| "eval_steps_per_second": 37.109, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.1827956989247312, |
| "grad_norm": 1.4481481313705444, |
| "learning_rate": 0.00019527741935483872, |
| "loss": 2.8536431884765623, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.1827956989247312, |
| "eval_loss": 2.6743366718292236, |
| "eval_runtime": 60.9092, |
| "eval_samples_per_second": 312.005, |
| "eval_steps_per_second": 39.009, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.2903225806451613, |
| "grad_norm": 1.5985803604125977, |
| "learning_rate": 0.00019484731182795698, |
| "loss": 2.7353704833984374, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.2903225806451613, |
| "eval_loss": 2.4861812591552734, |
| "eval_runtime": 61.6826, |
| "eval_samples_per_second": 308.093, |
| "eval_steps_per_second": 38.52, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.3978494623655915, |
| "grad_norm": 2.046145439147949, |
| "learning_rate": 0.00019441720430107528, |
| "loss": 2.464430084228516, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.3978494623655915, |
| "eval_loss": 2.0265886783599854, |
| "eval_runtime": 61.2709, |
| "eval_samples_per_second": 310.164, |
| "eval_steps_per_second": 38.779, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.5053763440860215, |
| "grad_norm": 1.8674232959747314, |
| "learning_rate": 0.00019398709677419354, |
| "loss": 1.9112973022460937, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.5053763440860215, |
| "eval_loss": 1.3678908348083496, |
| "eval_runtime": 62.2031, |
| "eval_samples_per_second": 305.515, |
| "eval_steps_per_second": 38.197, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.6129032258064515, |
| "grad_norm": 1.708408236503601, |
| "learning_rate": 0.00019355698924731184, |
| "loss": 1.4241523742675781, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.6129032258064515, |
| "eval_loss": 1.0675994157791138, |
| "eval_runtime": 62.2, |
| "eval_samples_per_second": 305.53, |
| "eval_steps_per_second": 38.199, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.7204301075268817, |
| "grad_norm": 1.6592656373977661, |
| "learning_rate": 0.00019312688172043013, |
| "loss": 1.2252975463867188, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.7204301075268817, |
| "eval_loss": 0.9175282716751099, |
| "eval_runtime": 61.3094, |
| "eval_samples_per_second": 309.969, |
| "eval_steps_per_second": 38.754, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.827956989247312, |
| "grad_norm": 1.2984247207641602, |
| "learning_rate": 0.0001926967741935484, |
| "loss": 1.0399230194091797, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.827956989247312, |
| "eval_loss": 0.8346064686775208, |
| "eval_runtime": 61.1605, |
| "eval_samples_per_second": 310.724, |
| "eval_steps_per_second": 38.849, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.935483870967742, |
| "grad_norm": 1.1744712591171265, |
| "learning_rate": 0.0001922666666666667, |
| "loss": 0.9568134307861328, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.935483870967742, |
| "eval_loss": 0.7724924087524414, |
| "eval_runtime": 62.2824, |
| "eval_samples_per_second": 305.126, |
| "eval_steps_per_second": 38.149, |
| "step": 900 |
| }, |
| { |
| "epoch": 2.043010752688172, |
| "grad_norm": 1.2494049072265625, |
| "learning_rate": 0.00019183655913978495, |
| "loss": 0.8979853820800782, |
| "step": 950 |
| }, |
| { |
| "epoch": 2.043010752688172, |
| "eval_loss": 0.7325491905212402, |
| "eval_runtime": 62.8935, |
| "eval_samples_per_second": 302.161, |
| "eval_steps_per_second": 37.778, |
| "step": 950 |
| }, |
| { |
| "epoch": 2.150537634408602, |
| "grad_norm": 1.0687495470046997, |
| "learning_rate": 0.00019140645161290322, |
| "loss": 0.8724540710449219, |
| "step": 1000 |
| }, |
| { |
| "epoch": 2.150537634408602, |
| "eval_loss": 0.6943864822387695, |
| "eval_runtime": 64.2005, |
| "eval_samples_per_second": 296.01, |
| "eval_steps_per_second": 37.009, |
| "step": 1000 |
| }, |
| { |
| "epoch": 2.258064516129032, |
| "grad_norm": 0.9108296036720276, |
| "learning_rate": 0.0001909763440860215, |
| "loss": 0.8106794738769532, |
| "step": 1050 |
| }, |
| { |
| "epoch": 2.258064516129032, |
| "eval_loss": 0.666123628616333, |
| "eval_runtime": 60.9142, |
| "eval_samples_per_second": 311.98, |
| "eval_steps_per_second": 39.006, |
| "step": 1050 |
| }, |
| { |
| "epoch": 2.3655913978494625, |
| "grad_norm": 0.8529163002967834, |
| "learning_rate": 0.00019054623655913978, |
| "loss": 0.7816014862060547, |
| "step": 1100 |
| }, |
| { |
| "epoch": 2.3655913978494625, |
| "eval_loss": 0.6435992121696472, |
| "eval_runtime": 61.9346, |
| "eval_samples_per_second": 306.84, |
| "eval_steps_per_second": 38.363, |
| "step": 1100 |
| }, |
| { |
| "epoch": 2.4731182795698925, |
| "grad_norm": 0.9023746848106384, |
| "learning_rate": 0.00019011612903225807, |
| "loss": 0.7448858642578124, |
| "step": 1150 |
| }, |
| { |
| "epoch": 2.4731182795698925, |
| "eval_loss": 0.6147477626800537, |
| "eval_runtime": 60.7037, |
| "eval_samples_per_second": 313.062, |
| "eval_steps_per_second": 39.141, |
| "step": 1150 |
| }, |
| { |
| "epoch": 2.5806451612903225, |
| "grad_norm": 0.7893891930580139, |
| "learning_rate": 0.00018968602150537636, |
| "loss": 0.7744358062744141, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.5806451612903225, |
| "eval_loss": 0.6008749604225159, |
| "eval_runtime": 62.0421, |
| "eval_samples_per_second": 306.308, |
| "eval_steps_per_second": 38.297, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.688172043010753, |
| "grad_norm": 0.8543435335159302, |
| "learning_rate": 0.00018925591397849463, |
| "loss": 0.698813705444336, |
| "step": 1250 |
| }, |
| { |
| "epoch": 2.688172043010753, |
| "eval_loss": 0.5843669176101685, |
| "eval_runtime": 61.7236, |
| "eval_samples_per_second": 307.889, |
| "eval_steps_per_second": 38.494, |
| "step": 1250 |
| }, |
| { |
| "epoch": 2.795698924731183, |
| "grad_norm": 0.862782895565033, |
| "learning_rate": 0.00018882580645161292, |
| "loss": 0.7231275939941406, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.795698924731183, |
| "eval_loss": 0.560819149017334, |
| "eval_runtime": 61.272, |
| "eval_samples_per_second": 310.158, |
| "eval_steps_per_second": 38.778, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.903225806451613, |
| "grad_norm": 0.8126527667045593, |
| "learning_rate": 0.0001883956989247312, |
| "loss": 0.6607036590576172, |
| "step": 1350 |
| }, |
| { |
| "epoch": 2.903225806451613, |
| "eval_loss": 0.5523199439048767, |
| "eval_runtime": 61.41, |
| "eval_samples_per_second": 309.461, |
| "eval_steps_per_second": 38.691, |
| "step": 1350 |
| }, |
| { |
| "epoch": 3.010752688172043, |
| "grad_norm": 0.8788714408874512, |
| "learning_rate": 0.00018796559139784945, |
| "loss": 0.658017349243164, |
| "step": 1400 |
| }, |
| { |
| "epoch": 3.010752688172043, |
| "eval_loss": 0.5504087805747986, |
| "eval_runtime": 61.2893, |
| "eval_samples_per_second": 310.07, |
| "eval_steps_per_second": 38.767, |
| "step": 1400 |
| }, |
| { |
| "epoch": 3.118279569892473, |
| "grad_norm": 0.8354722857475281, |
| "learning_rate": 0.00018753548387096775, |
| "loss": 0.6500599670410157, |
| "step": 1450 |
| }, |
| { |
| "epoch": 3.118279569892473, |
| "eval_loss": 0.5395110845565796, |
| "eval_runtime": 60.5063, |
| "eval_samples_per_second": 314.083, |
| "eval_steps_per_second": 39.269, |
| "step": 1450 |
| }, |
| { |
| "epoch": 3.225806451612903, |
| "grad_norm": 0.8122305870056152, |
| "learning_rate": 0.000187105376344086, |
| "loss": 0.6230792999267578, |
| "step": 1500 |
| }, |
| { |
| "epoch": 3.225806451612903, |
| "eval_loss": 0.5187473297119141, |
| "eval_runtime": 60.7322, |
| "eval_samples_per_second": 312.915, |
| "eval_steps_per_second": 39.123, |
| "step": 1500 |
| }, |
| { |
| "epoch": 3.3333333333333335, |
| "grad_norm": 0.673494815826416, |
| "learning_rate": 0.0001866752688172043, |
| "loss": 0.6118016052246094, |
| "step": 1550 |
| }, |
| { |
| "epoch": 3.3333333333333335, |
| "eval_loss": 0.5081239938735962, |
| "eval_runtime": 60.5862, |
| "eval_samples_per_second": 313.669, |
| "eval_steps_per_second": 39.217, |
| "step": 1550 |
| }, |
| { |
| "epoch": 3.4408602150537635, |
| "grad_norm": 0.8055212497711182, |
| "learning_rate": 0.0001862451612903226, |
| "loss": 0.6122843170166016, |
| "step": 1600 |
| }, |
| { |
| "epoch": 3.4408602150537635, |
| "eval_loss": 0.49499744176864624, |
| "eval_runtime": 60.6568, |
| "eval_samples_per_second": 313.304, |
| "eval_steps_per_second": 39.171, |
| "step": 1600 |
| }, |
| { |
| "epoch": 3.5483870967741935, |
| "grad_norm": 0.7935542464256287, |
| "learning_rate": 0.00018581505376344087, |
| "loss": 0.5825344467163086, |
| "step": 1650 |
| }, |
| { |
| "epoch": 3.5483870967741935, |
| "eval_loss": 0.48452192544937134, |
| "eval_runtime": 60.5763, |
| "eval_samples_per_second": 313.72, |
| "eval_steps_per_second": 39.223, |
| "step": 1650 |
| }, |
| { |
| "epoch": 3.6559139784946235, |
| "grad_norm": 0.6395400166511536, |
| "learning_rate": 0.00018538494623655916, |
| "loss": 0.5727723693847656, |
| "step": 1700 |
| }, |
| { |
| "epoch": 3.6559139784946235, |
| "eval_loss": 0.4738766551017761, |
| "eval_runtime": 60.5051, |
| "eval_samples_per_second": 314.089, |
| "eval_steps_per_second": 39.269, |
| "step": 1700 |
| }, |
| { |
| "epoch": 3.763440860215054, |
| "grad_norm": 0.6544663906097412, |
| "learning_rate": 0.00018495483870967742, |
| "loss": 0.5858316421508789, |
| "step": 1750 |
| }, |
| { |
| "epoch": 3.763440860215054, |
| "eval_loss": 0.4562221169471741, |
| "eval_runtime": 60.4697, |
| "eval_samples_per_second": 314.273, |
| "eval_steps_per_second": 39.292, |
| "step": 1750 |
| }, |
| { |
| "epoch": 3.870967741935484, |
| "grad_norm": 0.773256778717041, |
| "learning_rate": 0.00018452473118279572, |
| "loss": 0.5555976867675781, |
| "step": 1800 |
| }, |
| { |
| "epoch": 3.870967741935484, |
| "eval_loss": 0.4462752342224121, |
| "eval_runtime": 61.139, |
| "eval_samples_per_second": 310.833, |
| "eval_steps_per_second": 38.862, |
| "step": 1800 |
| }, |
| { |
| "epoch": 3.978494623655914, |
| "grad_norm": 0.6679997444152832, |
| "learning_rate": 0.00018409462365591398, |
| "loss": 0.5079600143432618, |
| "step": 1850 |
| }, |
| { |
| "epoch": 3.978494623655914, |
| "eval_loss": 0.43978169560432434, |
| "eval_runtime": 60.5103, |
| "eval_samples_per_second": 314.062, |
| "eval_steps_per_second": 39.266, |
| "step": 1850 |
| }, |
| { |
| "epoch": 4.086021505376344, |
| "grad_norm": 0.7930998206138611, |
| "learning_rate": 0.00018366451612903225, |
| "loss": 0.5580390548706055, |
| "step": 1900 |
| }, |
| { |
| "epoch": 4.086021505376344, |
| "eval_loss": 0.4352206587791443, |
| "eval_runtime": 60.8357, |
| "eval_samples_per_second": 312.382, |
| "eval_steps_per_second": 39.056, |
| "step": 1900 |
| }, |
| { |
| "epoch": 4.193548387096774, |
| "grad_norm": 0.6607942581176758, |
| "learning_rate": 0.00018323440860215054, |
| "loss": 0.49173324584960937, |
| "step": 1950 |
| }, |
| { |
| "epoch": 4.193548387096774, |
| "eval_loss": 0.4238659143447876, |
| "eval_runtime": 60.9872, |
| "eval_samples_per_second": 311.606, |
| "eval_steps_per_second": 38.959, |
| "step": 1950 |
| }, |
| { |
| "epoch": 4.301075268817204, |
| "grad_norm": 0.6287643909454346, |
| "learning_rate": 0.00018280430107526884, |
| "loss": 0.4687882232666016, |
| "step": 2000 |
| }, |
| { |
| "epoch": 4.301075268817204, |
| "eval_loss": 0.4168907403945923, |
| "eval_runtime": 61.005, |
| "eval_samples_per_second": 311.515, |
| "eval_steps_per_second": 38.948, |
| "step": 2000 |
| }, |
| { |
| "epoch": 4.408602150537634, |
| "grad_norm": 0.6433095932006836, |
| "learning_rate": 0.0001823741935483871, |
| "loss": 0.4763982009887695, |
| "step": 2050 |
| }, |
| { |
| "epoch": 4.408602150537634, |
| "eval_loss": 0.4120262861251831, |
| "eval_runtime": 61.5507, |
| "eval_samples_per_second": 308.753, |
| "eval_steps_per_second": 38.602, |
| "step": 2050 |
| }, |
| { |
| "epoch": 4.516129032258064, |
| "grad_norm": 0.76325523853302, |
| "learning_rate": 0.0001819440860215054, |
| "loss": 0.5169943237304687, |
| "step": 2100 |
| }, |
| { |
| "epoch": 4.516129032258064, |
| "eval_loss": 0.40777090191841125, |
| "eval_runtime": 61.9659, |
| "eval_samples_per_second": 306.685, |
| "eval_steps_per_second": 38.344, |
| "step": 2100 |
| }, |
| { |
| "epoch": 4.623655913978495, |
| "grad_norm": 0.7534022331237793, |
| "learning_rate": 0.00018151397849462366, |
| "loss": 0.4840876770019531, |
| "step": 2150 |
| }, |
| { |
| "epoch": 4.623655913978495, |
| "eval_loss": 0.396854966878891, |
| "eval_runtime": 61.4429, |
| "eval_samples_per_second": 309.295, |
| "eval_steps_per_second": 38.67, |
| "step": 2150 |
| }, |
| { |
| "epoch": 4.731182795698925, |
| "grad_norm": 0.688862144947052, |
| "learning_rate": 0.00018108387096774195, |
| "loss": 0.46516273498535154, |
| "step": 2200 |
| }, |
| { |
| "epoch": 4.731182795698925, |
| "eval_loss": 0.38546594977378845, |
| "eval_runtime": 60.8637, |
| "eval_samples_per_second": 312.239, |
| "eval_steps_per_second": 39.038, |
| "step": 2200 |
| }, |
| { |
| "epoch": 4.838709677419355, |
| "grad_norm": 0.5328208208084106, |
| "learning_rate": 0.00018065376344086022, |
| "loss": 0.5028326034545898, |
| "step": 2250 |
| }, |
| { |
| "epoch": 4.838709677419355, |
| "eval_loss": 0.37445569038391113, |
| "eval_runtime": 61.5819, |
| "eval_samples_per_second": 308.597, |
| "eval_steps_per_second": 38.583, |
| "step": 2250 |
| }, |
| { |
| "epoch": 4.946236559139785, |
| "grad_norm": 0.5857045650482178, |
| "learning_rate": 0.00018022365591397848, |
| "loss": 0.43645286560058594, |
| "step": 2300 |
| }, |
| { |
| "epoch": 4.946236559139785, |
| "eval_loss": 0.3690737187862396, |
| "eval_runtime": 61.4895, |
| "eval_samples_per_second": 309.061, |
| "eval_steps_per_second": 38.641, |
| "step": 2300 |
| }, |
| { |
| "epoch": 5.053763440860215, |
| "grad_norm": 0.6344749331474304, |
| "learning_rate": 0.00017979354838709678, |
| "loss": 0.42147178649902345, |
| "step": 2350 |
| }, |
| { |
| "epoch": 5.053763440860215, |
| "eval_loss": 0.3570445775985718, |
| "eval_runtime": 62.1748, |
| "eval_samples_per_second": 305.654, |
| "eval_steps_per_second": 38.215, |
| "step": 2350 |
| }, |
| { |
| "epoch": 5.161290322580645, |
| "grad_norm": 0.6610215306282043, |
| "learning_rate": 0.00017936344086021507, |
| "loss": 0.4157654571533203, |
| "step": 2400 |
| }, |
| { |
| "epoch": 5.161290322580645, |
| "eval_loss": 0.3497065603733063, |
| "eval_runtime": 61.6389, |
| "eval_samples_per_second": 308.312, |
| "eval_steps_per_second": 38.547, |
| "step": 2400 |
| }, |
| { |
| "epoch": 5.268817204301075, |
| "grad_norm": 0.5334368348121643, |
| "learning_rate": 0.00017893333333333336, |
| "loss": 0.4012648391723633, |
| "step": 2450 |
| }, |
| { |
| "epoch": 5.268817204301075, |
| "eval_loss": 0.33196908235549927, |
| "eval_runtime": 64.4623, |
| "eval_samples_per_second": 294.808, |
| "eval_steps_per_second": 36.859, |
| "step": 2450 |
| }, |
| { |
| "epoch": 5.376344086021505, |
| "grad_norm": 0.7559072971343994, |
| "learning_rate": 0.00017850322580645163, |
| "loss": 0.4343834686279297, |
| "step": 2500 |
| }, |
| { |
| "epoch": 5.376344086021505, |
| "eval_loss": 0.31756916642189026, |
| "eval_runtime": 64.0899, |
| "eval_samples_per_second": 296.521, |
| "eval_steps_per_second": 37.073, |
| "step": 2500 |
| }, |
| { |
| "epoch": 5.483870967741936, |
| "grad_norm": 0.6970711946487427, |
| "learning_rate": 0.0001780731182795699, |
| "loss": 0.3609016799926758, |
| "step": 2550 |
| }, |
| { |
| "epoch": 5.483870967741936, |
| "eval_loss": 0.3129482567310333, |
| "eval_runtime": 64.2007, |
| "eval_samples_per_second": 296.009, |
| "eval_steps_per_second": 37.009, |
| "step": 2550 |
| }, |
| { |
| "epoch": 5.591397849462366, |
| "grad_norm": 0.7393150329589844, |
| "learning_rate": 0.0001776430107526882, |
| "loss": 0.36085220336914064, |
| "step": 2600 |
| }, |
| { |
| "epoch": 5.591397849462366, |
| "eval_loss": 0.29907363653182983, |
| "eval_runtime": 64.2974, |
| "eval_samples_per_second": 295.564, |
| "eval_steps_per_second": 36.953, |
| "step": 2600 |
| }, |
| { |
| "epoch": 5.698924731182796, |
| "grad_norm": 0.6760246157646179, |
| "learning_rate": 0.00017721290322580645, |
| "loss": 0.3354073715209961, |
| "step": 2650 |
| }, |
| { |
| "epoch": 5.698924731182796, |
| "eval_loss": 0.28903692960739136, |
| "eval_runtime": 64.2379, |
| "eval_samples_per_second": 295.838, |
| "eval_steps_per_second": 36.988, |
| "step": 2650 |
| }, |
| { |
| "epoch": 5.806451612903226, |
| "grad_norm": 0.6342934370040894, |
| "learning_rate": 0.00017678279569892472, |
| "loss": 0.33487789154052733, |
| "step": 2700 |
| }, |
| { |
| "epoch": 5.806451612903226, |
| "eval_loss": 0.2763662040233612, |
| "eval_runtime": 63.0262, |
| "eval_samples_per_second": 301.525, |
| "eval_steps_per_second": 37.699, |
| "step": 2700 |
| }, |
| { |
| "epoch": 5.913978494623656, |
| "grad_norm": 0.6288059949874878, |
| "learning_rate": 0.00017635268817204301, |
| "loss": 0.3166103744506836, |
| "step": 2750 |
| }, |
| { |
| "epoch": 5.913978494623656, |
| "eval_loss": 0.27043381333351135, |
| "eval_runtime": 63.0792, |
| "eval_samples_per_second": 301.272, |
| "eval_steps_per_second": 37.667, |
| "step": 2750 |
| }, |
| { |
| "epoch": 6.021505376344086, |
| "grad_norm": 0.8228830695152283, |
| "learning_rate": 0.0001759225806451613, |
| "loss": 0.3166475486755371, |
| "step": 2800 |
| }, |
| { |
| "epoch": 6.021505376344086, |
| "eval_loss": 0.26023828983306885, |
| "eval_runtime": 64.4666, |
| "eval_samples_per_second": 294.788, |
| "eval_steps_per_second": 36.856, |
| "step": 2800 |
| }, |
| { |
| "epoch": 6.129032258064516, |
| "grad_norm": 0.6261463165283203, |
| "learning_rate": 0.0001754924731182796, |
| "loss": 0.30168416976928714, |
| "step": 2850 |
| }, |
| { |
| "epoch": 6.129032258064516, |
| "eval_loss": 0.2530518174171448, |
| "eval_runtime": 63.8775, |
| "eval_samples_per_second": 297.507, |
| "eval_steps_per_second": 37.196, |
| "step": 2850 |
| }, |
| { |
| "epoch": 6.236559139784946, |
| "grad_norm": 0.7265720367431641, |
| "learning_rate": 0.00017506236559139787, |
| "loss": 0.29341196060180663, |
| "step": 2900 |
| }, |
| { |
| "epoch": 6.236559139784946, |
| "eval_loss": 0.24442243576049805, |
| "eval_runtime": 63.2991, |
| "eval_samples_per_second": 300.226, |
| "eval_steps_per_second": 37.536, |
| "step": 2900 |
| }, |
| { |
| "epoch": 6.344086021505376, |
| "grad_norm": 0.5499133467674255, |
| "learning_rate": 0.00017463225806451613, |
| "loss": 0.2850730323791504, |
| "step": 2950 |
| }, |
| { |
| "epoch": 6.344086021505376, |
| "eval_loss": 0.237361341714859, |
| "eval_runtime": 64.5725, |
| "eval_samples_per_second": 294.305, |
| "eval_steps_per_second": 36.796, |
| "step": 2950 |
| }, |
| { |
| "epoch": 6.451612903225806, |
| "grad_norm": 0.7466527223587036, |
| "learning_rate": 0.00017420215053763442, |
| "loss": 0.2737441635131836, |
| "step": 3000 |
| }, |
| { |
| "epoch": 6.451612903225806, |
| "eval_loss": 0.22867611050605774, |
| "eval_runtime": 64.8912, |
| "eval_samples_per_second": 292.86, |
| "eval_steps_per_second": 36.615, |
| "step": 3000 |
| }, |
| { |
| "epoch": 6.559139784946236, |
| "grad_norm": 0.605771005153656, |
| "learning_rate": 0.0001737720430107527, |
| "loss": 0.26982501983642576, |
| "step": 3050 |
| }, |
| { |
| "epoch": 6.559139784946236, |
| "eval_loss": 0.22686000168323517, |
| "eval_runtime": 64.8566, |
| "eval_samples_per_second": 293.016, |
| "eval_steps_per_second": 36.635, |
| "step": 3050 |
| }, |
| { |
| "epoch": 6.666666666666667, |
| "grad_norm": 0.6927595138549805, |
| "learning_rate": 0.00017334193548387096, |
| "loss": 0.2592777633666992, |
| "step": 3100 |
| }, |
| { |
| "epoch": 6.666666666666667, |
| "eval_loss": 0.22359216213226318, |
| "eval_runtime": 64.9559, |
| "eval_samples_per_second": 292.568, |
| "eval_steps_per_second": 36.579, |
| "step": 3100 |
| }, |
| { |
| "epoch": 6.774193548387097, |
| "grad_norm": 0.6070519685745239, |
| "learning_rate": 0.00017291182795698925, |
| "loss": 0.2539858436584473, |
| "step": 3150 |
| }, |
| { |
| "epoch": 6.774193548387097, |
| "eval_loss": 0.22382962703704834, |
| "eval_runtime": 64.9172, |
| "eval_samples_per_second": 292.742, |
| "eval_steps_per_second": 36.6, |
| "step": 3150 |
| }, |
| { |
| "epoch": 6.881720430107527, |
| "grad_norm": 0.7206361889839172, |
| "learning_rate": 0.00017248172043010754, |
| "loss": 0.2550803184509277, |
| "step": 3200 |
| }, |
| { |
| "epoch": 6.881720430107527, |
| "eval_loss": 0.22055239975452423, |
| "eval_runtime": 65.5818, |
| "eval_samples_per_second": 289.775, |
| "eval_steps_per_second": 36.23, |
| "step": 3200 |
| }, |
| { |
| "epoch": 6.989247311827957, |
| "grad_norm": 0.6855896711349487, |
| "learning_rate": 0.00017205161290322584, |
| "loss": 0.2432615852355957, |
| "step": 3250 |
| }, |
| { |
| "epoch": 6.989247311827957, |
| "eval_loss": 0.21467819809913635, |
| "eval_runtime": 66.2905, |
| "eval_samples_per_second": 286.677, |
| "eval_steps_per_second": 35.842, |
| "step": 3250 |
| }, |
| { |
| "epoch": 7.096774193548387, |
| "grad_norm": 0.5612008571624756, |
| "learning_rate": 0.0001716215053763441, |
| "loss": 0.24562849044799806, |
| "step": 3300 |
| }, |
| { |
| "epoch": 7.096774193548387, |
| "eval_loss": 0.21375121176242828, |
| "eval_runtime": 66.0151, |
| "eval_samples_per_second": 287.874, |
| "eval_steps_per_second": 35.992, |
| "step": 3300 |
| }, |
| { |
| "epoch": 7.204301075268817, |
| "grad_norm": 0.7433006763458252, |
| "learning_rate": 0.00017119139784946237, |
| "loss": 0.2393852424621582, |
| "step": 3350 |
| }, |
| { |
| "epoch": 7.204301075268817, |
| "eval_loss": 0.20871323347091675, |
| "eval_runtime": 61.9563, |
| "eval_samples_per_second": 306.732, |
| "eval_steps_per_second": 38.35, |
| "step": 3350 |
| }, |
| { |
| "epoch": 7.311827956989247, |
| "grad_norm": 0.6491153836250305, |
| "learning_rate": 0.00017076129032258066, |
| "loss": 0.24959787368774414, |
| "step": 3400 |
| }, |
| { |
| "epoch": 7.311827956989247, |
| "eval_loss": 0.21120016276836395, |
| "eval_runtime": 60.6864, |
| "eval_samples_per_second": 313.151, |
| "eval_steps_per_second": 39.152, |
| "step": 3400 |
| }, |
| { |
| "epoch": 7.419354838709677, |
| "grad_norm": 0.5620025992393494, |
| "learning_rate": 0.00017033118279569893, |
| "loss": 0.2320168685913086, |
| "step": 3450 |
| }, |
| { |
| "epoch": 7.419354838709677, |
| "eval_loss": 0.20816229283809662, |
| "eval_runtime": 61.036, |
| "eval_samples_per_second": 311.357, |
| "eval_steps_per_second": 38.928, |
| "step": 3450 |
| }, |
| { |
| "epoch": 7.526881720430108, |
| "grad_norm": 0.6183444261550903, |
| "learning_rate": 0.00016990107526881722, |
| "loss": 0.2322225570678711, |
| "step": 3500 |
| }, |
| { |
| "epoch": 7.526881720430108, |
| "eval_loss": 0.20497609674930573, |
| "eval_runtime": 60.5328, |
| "eval_samples_per_second": 313.946, |
| "eval_steps_per_second": 39.251, |
| "step": 3500 |
| }, |
| { |
| "epoch": 7.634408602150538, |
| "grad_norm": 0.5328448414802551, |
| "learning_rate": 0.00016947096774193548, |
| "loss": 0.23304037094116212, |
| "step": 3550 |
| }, |
| { |
| "epoch": 7.634408602150538, |
| "eval_loss": 0.20321960747241974, |
| "eval_runtime": 62.1711, |
| "eval_samples_per_second": 305.672, |
| "eval_steps_per_second": 38.217, |
| "step": 3550 |
| }, |
| { |
| "epoch": 7.741935483870968, |
| "grad_norm": 0.5241938829421997, |
| "learning_rate": 0.00016904086021505378, |
| "loss": 0.22476686477661134, |
| "step": 3600 |
| }, |
| { |
| "epoch": 7.741935483870968, |
| "eval_loss": 0.2034502625465393, |
| "eval_runtime": 64.8022, |
| "eval_samples_per_second": 293.262, |
| "eval_steps_per_second": 36.665, |
| "step": 3600 |
| }, |
| { |
| "epoch": 7.849462365591398, |
| "grad_norm": 0.5440294742584229, |
| "learning_rate": 0.00016861075268817207, |
| "loss": 0.227796630859375, |
| "step": 3650 |
| }, |
| { |
| "epoch": 7.849462365591398, |
| "eval_loss": 0.20562465488910675, |
| "eval_runtime": 65.1543, |
| "eval_samples_per_second": 291.677, |
| "eval_steps_per_second": 36.467, |
| "step": 3650 |
| }, |
| { |
| "epoch": 7.956989247311828, |
| "grad_norm": 0.5037738680839539, |
| "learning_rate": 0.00016818064516129034, |
| "loss": 0.23125221252441405, |
| "step": 3700 |
| }, |
| { |
| "epoch": 7.956989247311828, |
| "eval_loss": 0.20223356783390045, |
| "eval_runtime": 65.5561, |
| "eval_samples_per_second": 289.889, |
| "eval_steps_per_second": 36.244, |
| "step": 3700 |
| }, |
| { |
| "epoch": 8.064516129032258, |
| "grad_norm": 0.843550980091095, |
| "learning_rate": 0.0001677505376344086, |
| "loss": 0.2236369514465332, |
| "step": 3750 |
| }, |
| { |
| "epoch": 8.064516129032258, |
| "eval_loss": 0.19716867804527283, |
| "eval_runtime": 66.4534, |
| "eval_samples_per_second": 285.975, |
| "eval_steps_per_second": 35.754, |
| "step": 3750 |
| }, |
| { |
| "epoch": 8.172043010752688, |
| "grad_norm": 0.5562386512756348, |
| "learning_rate": 0.0001673204301075269, |
| "loss": 0.22720510482788087, |
| "step": 3800 |
| }, |
| { |
| "epoch": 8.172043010752688, |
| "eval_loss": 0.1974799931049347, |
| "eval_runtime": 66.0022, |
| "eval_samples_per_second": 287.93, |
| "eval_steps_per_second": 35.999, |
| "step": 3800 |
| }, |
| { |
| "epoch": 8.279569892473118, |
| "grad_norm": 0.5003981590270996, |
| "learning_rate": 0.00016689032258064516, |
| "loss": 0.22547555923461915, |
| "step": 3850 |
| }, |
| { |
| "epoch": 8.279569892473118, |
| "eval_loss": 0.19821035861968994, |
| "eval_runtime": 60.464, |
| "eval_samples_per_second": 314.303, |
| "eval_steps_per_second": 39.296, |
| "step": 3850 |
| }, |
| { |
| "epoch": 8.387096774193548, |
| "grad_norm": 0.4629065692424774, |
| "learning_rate": 0.00016646021505376345, |
| "loss": 0.22113780975341796, |
| "step": 3900 |
| }, |
| { |
| "epoch": 8.387096774193548, |
| "eval_loss": 0.1924905627965927, |
| "eval_runtime": 60.595, |
| "eval_samples_per_second": 313.623, |
| "eval_steps_per_second": 39.211, |
| "step": 3900 |
| }, |
| { |
| "epoch": 8.494623655913978, |
| "grad_norm": 0.5043092966079712, |
| "learning_rate": 0.00016603010752688172, |
| "loss": 0.21599315643310546, |
| "step": 3950 |
| }, |
| { |
| "epoch": 8.494623655913978, |
| "eval_loss": 0.19553141295909882, |
| "eval_runtime": 60.5, |
| "eval_samples_per_second": 314.116, |
| "eval_steps_per_second": 39.273, |
| "step": 3950 |
| }, |
| { |
| "epoch": 8.602150537634408, |
| "grad_norm": 0.6413733959197998, |
| "learning_rate": 0.0001656, |
| "loss": 0.2173159408569336, |
| "step": 4000 |
| }, |
| { |
| "epoch": 8.602150537634408, |
| "eval_loss": 0.19092191755771637, |
| "eval_runtime": 60.5854, |
| "eval_samples_per_second": 313.673, |
| "eval_steps_per_second": 39.217, |
| "step": 4000 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 23250, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 50, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.685471179194368e+16, |
| "train_batch_size": 64, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|