{
  "best_metric": 0.012130416929721832,
  "best_model_checkpoint": "./outputs/models/flan-t5-large-summ/checkpoint-97000",
  "epoch": 20.0,
  "eval_steps": 1000,
  "global_step": 104660,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.01910949742021785, "grad_norm": 2.4572737216949463, "learning_rate": 1e-05, "loss": 1.3075, "step": 100},
    {"epoch": 0.0382189948404357, "grad_norm": 1.3809181451797485, "learning_rate": 2e-05, "loss": 1.1747, "step": 200},
    {"epoch": 0.05732849226065354, "grad_norm": 1.076543927192688, "learning_rate": 3e-05, "loss": 1.1598, "step": 300},
    {"epoch": 0.0764379896808714, "grad_norm": 1.3211930990219116, "learning_rate": 4e-05, "loss": 1.1265, "step": 400},
    {"epoch": 0.09554748710108923, "grad_norm": 1.7730636596679688, "learning_rate": 5e-05, "loss": 1.0228, "step": 500},
    {"epoch": 0.11465698452130708, "grad_norm": 1.103384256362915, "learning_rate": 4.995199692780338e-05, "loss": 0.9792, "step": 600},
    {"epoch": 0.13376648194152493, "grad_norm": 0.9394871592521667, "learning_rate": 4.990399385560676e-05, "loss": 0.9821, "step": 700},
    {"epoch": 0.1528759793617428, "grad_norm": 0.9111909866333008, "learning_rate": 4.985599078341014e-05, "loss": 0.9793, "step": 800},
    {"epoch": 0.17198547678196063, "grad_norm": 1.2519441843032837, "learning_rate": 4.9807987711213524e-05, "loss": 0.9411, "step": 900},
    {"epoch": 0.19109497420217847, "grad_norm": 0.7635079622268677, "learning_rate": 4.9759984639016896e-05, "loss": 0.922, "step": 1000},
    {"epoch": 0.19109497420217847, "eval_events-synergy/entsum_processed_loss": 0.7808578610420227, "eval_events-synergy/entsum_processed_runtime": 86.3527, "eval_events-synergy/entsum_processed_samples_per_second": 26.936, "eval_events-synergy/entsum_processed_steps_per_second": 6.74, "step": 1000},
    {"epoch": 0.21020447162239633, "grad_norm": 1.012012004852295, "learning_rate": 4.971198156682028e-05, "loss": 0.908, "step": 1100},
    {"epoch": 0.22931396904261417, "grad_norm": 1.2864316701889038, "learning_rate": 4.966397849462366e-05, "loss": 0.8234, "step": 1200},
    {"epoch": 0.24842346646283203, "grad_norm": 1.4988912343978882, "learning_rate": 4.961597542242703e-05, "loss": 0.8118, "step": 1300},
    {"epoch": 0.26753296388304987, "grad_norm": 1.2037246227264404, "learning_rate": 4.956797235023042e-05, "loss": 0.8052, "step": 1400},
    {"epoch": 0.2866424613032677, "grad_norm": 1.3416869640350342, "learning_rate": 4.9519969278033797e-05, "loss": 0.6814, "step": 1500},
    {"epoch": 0.3057519587234856, "grad_norm": 0.7591760158538818, "learning_rate": 4.9471966205837175e-05, "loss": 0.835, "step": 1600},
    {"epoch": 0.32486145614370343, "grad_norm": 1.0816859006881714, "learning_rate": 4.9423963133640554e-05, "loss": 0.7875, "step": 1700},
    {"epoch": 0.34397095356392127, "grad_norm": 1.381515383720398, "learning_rate": 4.937596006144393e-05, "loss": 0.7727, "step": 1800},
    {"epoch": 0.3630804509841391, "grad_norm": 1.031540870666504, "learning_rate": 4.932795698924731e-05, "loss": 0.7416, "step": 1900},
    {"epoch": 0.38218994840435694, "grad_norm": 1.013592004776001, "learning_rate": 4.927995391705069e-05, "loss": 0.7544, "step": 2000},
    {"epoch": 0.38218994840435694, "eval_events-synergy/entsum_processed_loss": 0.5898063778877258, "eval_events-synergy/entsum_processed_runtime": 86.5907, "eval_events-synergy/entsum_processed_samples_per_second": 26.862, "eval_events-synergy/entsum_processed_steps_per_second": 6.721, "step": 2000},
    {"epoch": 0.40129944582457483, "grad_norm": 0.993009626865387, "learning_rate": 4.9231950844854076e-05, "loss": 0.6104, "step": 2100},
    {"epoch": 0.42040894324479267, "grad_norm": 1.2820205688476562, "learning_rate": 4.9183947772657455e-05, "loss": 0.6266, "step": 2200},
    {"epoch": 0.4395184406650105, "grad_norm": 1.0375840663909912, "learning_rate": 4.913594470046083e-05, "loss": 0.6959, "step": 2300},
    {"epoch": 0.45862793808522834, "grad_norm": 1.4805232286453247, "learning_rate": 4.908794162826421e-05, "loss": 0.7472, "step": 2400},
    {"epoch": 0.47773743550544623, "grad_norm": 1.1135224103927612, "learning_rate": 4.903993855606759e-05, "loss": 0.6271, "step": 2500},
    {"epoch": 0.49684693292566406, "grad_norm": 0.6857364773750305, "learning_rate": 4.899193548387097e-05, "loss": 0.5972, "step": 2600},
    {"epoch": 0.5159564303458819, "grad_norm": 1.0915530920028687, "learning_rate": 4.894393241167435e-05, "loss": 0.6082, "step": 2700},
    {"epoch": 0.5350659277660997, "grad_norm": 0.6362367272377014, "learning_rate": 4.8895929339477734e-05, "loss": 0.6479, "step": 2800},
    {"epoch": 0.5541754251863176, "grad_norm": 1.4901950359344482, "learning_rate": 4.8847926267281106e-05, "loss": 0.6431, "step": 2900},
    {"epoch": 0.5732849226065354, "grad_norm": 1.0960026979446411, "learning_rate": 4.879992319508449e-05, "loss": 0.5337, "step": 3000},
    {"epoch": 0.5732849226065354, "eval_events-synergy/entsum_processed_loss": 0.4623138904571533, "eval_events-synergy/entsum_processed_runtime": 86.5166, "eval_events-synergy/entsum_processed_samples_per_second": 26.885, "eval_events-synergy/entsum_processed_steps_per_second": 6.727, "step": 3000},
    {"epoch": 0.5923944200267532, "grad_norm": 1.622821569442749, "learning_rate": 4.875192012288787e-05, "loss": 0.5383, "step": 3100},
    {"epoch": 0.6115039174469712, "grad_norm": 0.9446718096733093, "learning_rate": 4.870391705069124e-05, "loss": 0.5512, "step": 3200},
    {"epoch": 0.630613414867189, "grad_norm": 1.555004358291626, "learning_rate": 4.865591397849463e-05, "loss": 0.5534, "step": 3300},
    {"epoch": 0.6497229122874069, "grad_norm": 1.4712913036346436, "learning_rate": 4.8607910906298006e-05, "loss": 0.571, "step": 3400},
    {"epoch": 0.6688324097076247, "grad_norm": 0.7977001070976257, "learning_rate": 4.8559907834101385e-05, "loss": 0.5197, "step": 3500},
    {"epoch": 0.6879419071278425, "grad_norm": 0.742138683795929, "learning_rate": 4.8511904761904764e-05, "loss": 0.5445, "step": 3600},
    {"epoch": 0.7070514045480604, "grad_norm": 2.7435715198516846, "learning_rate": 4.846390168970814e-05, "loss": 0.5571, "step": 3700},
    {"epoch": 0.7261609019682782, "grad_norm": 2.0406129360198975, "learning_rate": 4.841589861751152e-05, "loss": 0.5156, "step": 3800},
    {"epoch": 0.745270399388496, "grad_norm": 2.4138622283935547, "learning_rate": 4.83678955453149e-05, "loss": 0.4924, "step": 3900},
    {"epoch": 0.7643798968087139, "grad_norm": 1.1996113061904907, "learning_rate": 4.8319892473118285e-05, "loss": 0.4759, "step": 4000},
    {"epoch": 0.7643798968087139, "eval_events-synergy/entsum_processed_loss": 0.37834471464157104, "eval_events-synergy/entsum_processed_runtime": 86.4756, "eval_events-synergy/entsum_processed_samples_per_second": 26.898, "eval_events-synergy/entsum_processed_steps_per_second": 6.73, "step": 4000},
    {"epoch": 0.7834893942289318, "grad_norm": 1.4116686582565308, "learning_rate": 4.827188940092166e-05, "loss": 0.4346, "step": 4100},
    {"epoch": 0.8025988916491497, "grad_norm": 1.2636014223098755, "learning_rate": 4.822388632872504e-05, "loss": 0.4435, "step": 4200},
    {"epoch": 0.8217083890693675, "grad_norm": 1.1817740201950073, "learning_rate": 4.817588325652842e-05, "loss": 0.4403, "step": 4300},
    {"epoch": 0.8408178864895853, "grad_norm": 1.4977765083312988, "learning_rate": 4.81278801843318e-05, "loss": 0.4201, "step": 4400},
    {"epoch": 0.8599273839098032, "grad_norm": 1.4559648036956787, "learning_rate": 4.807987711213518e-05, "loss": 0.4928, "step": 4500},
    {"epoch": 0.879036881330021, "grad_norm": 1.160703182220459, "learning_rate": 4.803187403993856e-05, "loss": 0.4822, "step": 4600},
    {"epoch": 0.8981463787502388, "grad_norm": 1.7978248596191406, "learning_rate": 4.7983870967741937e-05, "loss": 0.4342, "step": 4700},
    {"epoch": 0.9172558761704567, "grad_norm": 0.9042201638221741, "learning_rate": 4.7935867895545315e-05, "loss": 0.5015, "step": 4800},
    {"epoch": 0.9363653735906745, "grad_norm": 0.9743239879608154, "learning_rate": 4.78878648233487e-05, "loss": 0.4664, "step": 4900},
    {"epoch": 0.9554748710108925, "grad_norm": 2.9479408264160156, "learning_rate": 4.783986175115208e-05, "loss": 0.4586, "step": 5000},
    {"epoch": 0.9554748710108925, "eval_events-synergy/entsum_processed_loss": 0.30807018280029297, "eval_events-synergy/entsum_processed_runtime": 86.3824, "eval_events-synergy/entsum_processed_samples_per_second": 26.927, "eval_events-synergy/entsum_processed_steps_per_second": 6.737, "step": 5000},
    {"epoch": 0.9745843684311103, "grad_norm": 0.7617493271827698, "learning_rate": 4.779185867895545e-05, "loss": 0.4103, "step": 5100},
    {"epoch": 0.9936938658513281, "grad_norm": 1.3358664512634277, "learning_rate": 4.774385560675884e-05, "loss": 0.4177, "step": 5200},
    {"epoch": 1.0128033632715459, "grad_norm": 1.880685567855835, "learning_rate": 4.7695852534562216e-05, "loss": 0.3909, "step": 5300},
    {"epoch": 1.0319128606917638, "grad_norm": 1.12627375125885, "learning_rate": 4.7647849462365594e-05, "loss": 0.3738, "step": 5400},
    {"epoch": 1.0510223581119817, "grad_norm": 1.3265856504440308, "learning_rate": 4.759984639016897e-05, "loss": 0.3564, "step": 5500},
    {"epoch": 1.0701318555321995, "grad_norm": 2.0652267932891846, "learning_rate": 4.755184331797235e-05, "loss": 0.3301, "step": 5600},
    {"epoch": 1.0892413529524174, "grad_norm": 1.5408031940460205, "learning_rate": 4.750384024577573e-05, "loss": 0.3282, "step": 5700},
    {"epoch": 1.1083508503726351, "grad_norm": 2.1643123626708984, "learning_rate": 4.745583717357911e-05, "loss": 0.4158, "step": 5800},
    {"epoch": 1.127460347792853, "grad_norm": 2.325970411300659, "learning_rate": 4.7407834101382495e-05, "loss": 0.3348, "step": 5900},
    {"epoch": 1.1465698452130708, "grad_norm": 1.4004697799682617, "learning_rate": 4.735983102918587e-05, "loss": 0.3563, "step": 6000},
    {"epoch": 1.1465698452130708, "eval_events-synergy/entsum_processed_loss": 0.25603175163269043, "eval_events-synergy/entsum_processed_runtime": 86.5218, "eval_events-synergy/entsum_processed_samples_per_second": 26.883, "eval_events-synergy/entsum_processed_steps_per_second": 6.727, "step": 6000},
    {"epoch": 1.1656793426332888, "grad_norm": 1.0601403713226318, "learning_rate": 4.731182795698925e-05, "loss": 0.3606, "step": 6100},
    {"epoch": 1.1847888400535065, "grad_norm": 1.7814487218856812, "learning_rate": 4.726382488479263e-05, "loss": 0.3574, "step": 6200},
    {"epoch": 1.2038983374737244, "grad_norm": 0.9902318716049194, "learning_rate": 4.7215821812596e-05, "loss": 0.2871, "step": 6300},
    {"epoch": 1.2230078348939424, "grad_norm": 1.6244627237319946, "learning_rate": 4.716781874039939e-05, "loss": 0.3045, "step": 6400},
    {"epoch": 1.24211733231416, "grad_norm": 0.4082527458667755, "learning_rate": 4.711981566820277e-05, "loss": 0.3104, "step": 6500},
    {"epoch": 1.261226829734378, "grad_norm": 2.545142889022827, "learning_rate": 4.7071812596006146e-05, "loss": 0.3443, "step": 6600},
    {"epoch": 1.2803363271545958, "grad_norm": 1.6806279420852661, "learning_rate": 4.7023809523809525e-05, "loss": 0.3004, "step": 6700},
    {"epoch": 1.2994458245748137, "grad_norm": 1.1669882535934448, "learning_rate": 4.697580645161291e-05, "loss": 0.3168, "step": 6800},
    {"epoch": 1.3185553219950314, "grad_norm": 1.3955861330032349, "learning_rate": 4.692780337941628e-05, "loss": 0.2662, "step": 6900},
    {"epoch": 1.3376648194152494, "grad_norm": 1.6163365840911865, "learning_rate": 4.687980030721966e-05, "loss": 0.3172, "step": 7000},
    {"epoch": 1.3376648194152494, "eval_events-synergy/entsum_processed_loss": 0.21874384582042694, "eval_events-synergy/entsum_processed_runtime": 86.5224, "eval_events-synergy/entsum_processed_samples_per_second": 26.883, "eval_events-synergy/entsum_processed_steps_per_second": 6.727, "step": 7000},
    {"epoch": 1.3567743168354673, "grad_norm": 0.675295352935791, "learning_rate": 4.6831797235023047e-05, "loss": 0.2594, "step": 7100},
    {"epoch": 1.375883814255685, "grad_norm": 1.194022297859192, "learning_rate": 4.6783794162826425e-05, "loss": 0.329, "step": 7200},
    {"epoch": 1.3949933116759028, "grad_norm": 2.0018792152404785, "learning_rate": 4.6735791090629804e-05, "loss": 0.2772, "step": 7300},
    {"epoch": 1.4141028090961207, "grad_norm": 1.3288326263427734, "learning_rate": 4.668778801843318e-05, "loss": 0.2779, "step": 7400},
    {"epoch": 1.4332123065163387, "grad_norm": 0.8486484885215759, "learning_rate": 4.663978494623656e-05, "loss": 0.3124, "step": 7500},
    {"epoch": 1.4523218039365564, "grad_norm": 1.3498897552490234, "learning_rate": 4.659178187403994e-05, "loss": 0.3085, "step": 7600},
    {"epoch": 1.4714313013567744, "grad_norm": 1.0366922616958618, "learning_rate": 4.654377880184332e-05, "loss": 0.2847, "step": 7700},
    {"epoch": 1.490540798776992, "grad_norm": 1.5916386842727661, "learning_rate": 4.6495775729646705e-05, "loss": 0.2589, "step": 7800},
    {"epoch": 1.50965029619721, "grad_norm": 2.4099597930908203, "learning_rate": 4.6447772657450076e-05, "loss": 0.3322, "step": 7900},
    {"epoch": 1.5287597936174278, "grad_norm": 2.6880438327789307, "learning_rate": 4.639976958525346e-05, "loss": 0.2397, "step": 8000},
    {"epoch": 1.5287597936174278, "eval_events-synergy/entsum_processed_loss": 0.18345867097377777, "eval_events-synergy/entsum_processed_runtime": 86.5562, "eval_events-synergy/entsum_processed_samples_per_second": 26.873, "eval_events-synergy/entsum_processed_steps_per_second": 6.724, "step": 8000},
    {"epoch": 1.5478692910376457, "grad_norm": 0.704814612865448, "learning_rate": 4.635176651305684e-05, "loss": 0.2491, "step": 8100},
    {"epoch": 1.5669787884578636, "grad_norm": 0.7323247790336609, "learning_rate": 4.630376344086021e-05, "loss": 0.227, "step": 8200},
    {"epoch": 1.5860882858780814, "grad_norm": 1.2306073904037476, "learning_rate": 4.62557603686636e-05, "loss": 0.2801, "step": 8300},
    {"epoch": 1.605197783298299, "grad_norm": 2.0440738201141357, "learning_rate": 4.620775729646698e-05, "loss": 0.2542, "step": 8400},
    {"epoch": 1.6243072807185173, "grad_norm": 2.2261791229248047, "learning_rate": 4.6159754224270356e-05, "loss": 0.2575, "step": 8500},
    {"epoch": 1.643416778138735, "grad_norm": 1.3056387901306152, "learning_rate": 4.6111751152073734e-05, "loss": 0.2149, "step": 8600},
    {"epoch": 1.6625262755589527, "grad_norm": 1.1693623065948486, "learning_rate": 4.606374807987712e-05, "loss": 0.2344, "step": 8700},
    {"epoch": 1.6816357729791707, "grad_norm": 1.912260890007019, "learning_rate": 4.601574500768049e-05, "loss": 0.2575, "step": 8800},
    {"epoch": 1.7007452703993886, "grad_norm": 0.8953251838684082, "learning_rate": 4.596774193548387e-05, "loss": 0.2718, "step": 8900},
    {"epoch": 1.7198547678196063, "grad_norm": 1.2062451839447021, "learning_rate": 4.5919738863287256e-05, "loss": 0.2208, "step": 9000},
    {"epoch": 1.7198547678196063, "eval_events-synergy/entsum_processed_loss": 0.15974336862564087, "eval_events-synergy/entsum_processed_runtime": 86.5109, "eval_events-synergy/entsum_processed_samples_per_second": 26.887, "eval_events-synergy/entsum_processed_steps_per_second": 6.727, "step": 9000},
    {"epoch": 1.738964265239824, "grad_norm": 1.1229045391082764, "learning_rate": 4.587173579109063e-05, "loss": 0.225, "step": 9100},
    {"epoch": 1.758073762660042, "grad_norm": 1.2543933391571045, "learning_rate": 4.5823732718894014e-05, "loss": 0.2131, "step": 9200},
    {"epoch": 1.77718326008026, "grad_norm": 0.8586951494216919, "learning_rate": 4.577572964669739e-05, "loss": 0.2323, "step": 9300},
    {"epoch": 1.7962927575004777, "grad_norm": 1.023917555809021, "learning_rate": 4.5727726574500764e-05, "loss": 0.2316, "step": 9400},
    {"epoch": 1.8154022549206956, "grad_norm": 1.0598609447479248, "learning_rate": 4.567972350230415e-05, "loss": 0.2307, "step": 9500},
    {"epoch": 1.8345117523409136, "grad_norm": 1.7368364334106445, "learning_rate": 4.563172043010753e-05, "loss": 0.2052, "step": 9600},
    {"epoch": 1.8536212497611313, "grad_norm": 1.440076470375061, "learning_rate": 4.558371735791091e-05, "loss": 0.2044, "step": 9700},
    {"epoch": 1.872730747181349, "grad_norm": 0.9440079927444458, "learning_rate": 4.5535714285714286e-05, "loss": 0.2446, "step": 9800},
    {"epoch": 1.891840244601567, "grad_norm": 0.9267650246620178, "learning_rate": 4.548771121351767e-05, "loss": 0.1726, "step": 9900},
    {"epoch": 1.910949742021785, "grad_norm": 1.5710424184799194, "learning_rate": 4.5439708141321044e-05, "loss": 0.2366, "step": 10000},
    {"epoch": 1.910949742021785, "eval_events-synergy/entsum_processed_loss": 0.13830891251564026, "eval_events-synergy/entsum_processed_runtime": 86.4674, "eval_events-synergy/entsum_processed_samples_per_second": 26.9, "eval_events-synergy/entsum_processed_steps_per_second": 6.731, "step": 10000},
    {"epoch": 1.9300592394420026, "grad_norm": 0.9097877740859985, "learning_rate": 4.539170506912442e-05, "loss": 0.1884, "step": 10100},
    {"epoch": 1.9491687368622204, "grad_norm": 1.7585983276367188, "learning_rate": 4.534370199692781e-05, "loss": 0.2067, "step": 10200},
    {"epoch": 1.9682782342824385, "grad_norm": 1.4785964488983154, "learning_rate": 4.5295698924731187e-05, "loss": 0.2044, "step": 10300},
    {"epoch": 1.9873877317026563, "grad_norm": 2.9566831588745117, "learning_rate": 4.5247695852534565e-05, "loss": 0.1765, "step": 10400},
    {"epoch": 2.006497229122874, "grad_norm": 0.8678722381591797, "learning_rate": 4.5199692780337944e-05, "loss": 0.1899, "step": 10500},
    {"epoch": 2.0256067265430917, "grad_norm": 1.2664000988006592, "learning_rate": 4.515168970814132e-05, "loss": 0.1667, "step": 10600},
    {"epoch": 2.04471622396331, "grad_norm": 1.3981448411941528, "learning_rate": 4.51036866359447e-05, "loss": 0.1902, "step": 10700},
    {"epoch": 2.0638257213835276, "grad_norm": 0.7927885055541992, "learning_rate": 4.505568356374808e-05, "loss": 0.1743, "step": 10800},
    {"epoch": 2.0829352188037453, "grad_norm": 1.3861682415008545, "learning_rate": 4.5007680491551466e-05, "loss": 0.1807, "step": 10900},
    {"epoch": 2.1020447162239635, "grad_norm": 1.4484479427337646, "learning_rate": 4.495967741935484e-05, "loss": 0.1485, "step": 11000},
    {"epoch": 2.1020447162239635, "eval_events-synergy/entsum_processed_loss": 0.1214805543422699, "eval_events-synergy/entsum_processed_runtime": 86.1466, "eval_events-synergy/entsum_processed_samples_per_second": 27.0, "eval_events-synergy/entsum_processed_steps_per_second": 6.756, "step": 11000},
    {"epoch": 2.121154213644181, "grad_norm": 0.9914194941520691, "learning_rate": 4.491167434715822e-05, "loss": 0.1889, "step": 11100},
    {"epoch": 2.140263711064399, "grad_norm": 0.7929261326789856, "learning_rate": 4.48636712749616e-05, "loss": 0.1562, "step": 11200},
    {"epoch": 2.1593732084846167, "grad_norm": 1.4530657529830933, "learning_rate": 4.4815668202764974e-05, "loss": 0.1641, "step": 11300},
    {"epoch": 2.178482705904835, "grad_norm": 1.6586015224456787, "learning_rate": 4.476766513056836e-05, "loss": 0.1759, "step": 11400},
    {"epoch": 2.1975922033250526, "grad_norm": 1.327807068824768, "learning_rate": 4.471966205837174e-05, "loss": 0.1919, "step": 11500},
    {"epoch": 2.2167017007452703, "grad_norm": 2.2015275955200195, "learning_rate": 4.467165898617512e-05, "loss": 0.1933, "step": 11600},
    {"epoch": 2.2358111981654885, "grad_norm": 1.014055609703064, "learning_rate": 4.4623655913978496e-05, "loss": 0.1655, "step": 11700},
    {"epoch": 2.254920695585706, "grad_norm": 1.8720685243606567, "learning_rate": 4.457565284178188e-05, "loss": 0.1782, "step": 11800},
    {"epoch": 2.274030193005924, "grad_norm": 1.1258978843688965, "learning_rate": 4.452764976958525e-05, "loss": 0.1533, "step": 11900},
    {"epoch": 2.2931396904261416, "grad_norm": 1.5024638175964355, "learning_rate": 4.447964669738863e-05, "loss": 0.1114, "step": 12000},
    {"epoch": 2.2931396904261416, "eval_events-synergy/entsum_processed_loss": 0.10810433328151703, "eval_events-synergy/entsum_processed_runtime": 86.4359, "eval_events-synergy/entsum_processed_samples_per_second": 26.91, "eval_events-synergy/entsum_processed_steps_per_second": 6.733, "step": 12000},
    {"epoch": 2.31224918784636, "grad_norm": 1.3650909662246704, "learning_rate": 4.443164362519202e-05, "loss": 0.1627, "step": 12100},
    {"epoch": 2.3313586852665775, "grad_norm": 1.9314254522323608, "learning_rate": 4.438364055299539e-05, "loss": 0.1495, "step": 12200},
    {"epoch": 2.3504681826867952, "grad_norm": 0.8482472896575928, "learning_rate": 4.4335637480798775e-05, "loss": 0.1428, "step": 12300},
    {"epoch": 2.369577680107013, "grad_norm": 0.9307497143745422, "learning_rate": 4.4287634408602154e-05, "loss": 0.1649, "step": 12400},
    {"epoch": 2.388687177527231, "grad_norm": 1.1554982662200928, "learning_rate": 4.423963133640553e-05, "loss": 0.1257, "step": 12500},
    {"epoch": 2.407796674947449, "grad_norm": 3.0138888359069824, "learning_rate": 4.419162826420891e-05, "loss": 0.1481, "step": 12600},
    {"epoch": 2.4269061723676666, "grad_norm": 0.6184584498405457, "learning_rate": 4.414362519201229e-05, "loss": 0.1619, "step": 12700},
    {"epoch": 2.4460156697878848, "grad_norm": 0.8029180765151978, "learning_rate": 4.409562211981567e-05, "loss": 0.1383, "step": 12800},
    {"epoch": 2.4651251672081025, "grad_norm": 0.27215951681137085, "learning_rate": 4.404761904761905e-05, "loss": 0.1604, "step": 12900},
    {"epoch": 2.48423466462832, "grad_norm": 0.5887466073036194, "learning_rate": 4.399961597542243e-05, "loss": 0.1126, "step": 13000},
    {"epoch": 2.48423466462832, "eval_events-synergy/entsum_processed_loss": 0.09368874132633209, "eval_events-synergy/entsum_processed_runtime": 86.3411, "eval_events-synergy/entsum_processed_samples_per_second": 26.94, "eval_events-synergy/entsum_processed_steps_per_second": 6.741, "step": 13000},
    {"epoch": 2.5033441620485384, "grad_norm": 2.6320173740386963, "learning_rate": 4.395161290322581e-05, "loss": 0.1223, "step": 13100},
    {"epoch": 2.522453659468756, "grad_norm": 1.1739882230758667, "learning_rate": 4.3903609831029184e-05, "loss": 0.1801, "step": 13200},
    {"epoch": 2.541563156888974, "grad_norm": 0.972703218460083, "learning_rate": 4.385560675883257e-05, "loss": 0.1248, "step": 13300},
    {"epoch": 2.5606726543091916, "grad_norm": 0.37191712856292725, "learning_rate": 4.380760368663595e-05, "loss": 0.1428, "step": 13400},
    {"epoch": 2.5797821517294093, "grad_norm": 1.420135498046875, "learning_rate": 4.3759600614439327e-05, "loss": 0.1234, "step": 13500},
    {"epoch": 2.5988916491496274, "grad_norm": 0.970924437046051, "learning_rate": 4.3711597542242705e-05, "loss": 0.1447, "step": 13600},
    {"epoch": 2.618001146569845, "grad_norm": 1.2393814325332642, "learning_rate": 4.366359447004609e-05, "loss": 0.1264, "step": 13700},
    {"epoch": 2.637110643990063, "grad_norm": 1.179800271987915, "learning_rate": 4.361559139784946e-05, "loss": 0.1418, "step": 13800},
    {"epoch": 2.656220141410281, "grad_norm": 0.8592195510864258, "learning_rate": 4.356758832565284e-05, "loss": 0.1326, "step": 13900},
    {"epoch": 2.675329638830499, "grad_norm": 2.3403942584991455, "learning_rate": 4.351958525345623e-05, "loss": 0.1579, "step": 14000},
    {"epoch": 2.675329638830499, "eval_events-synergy/entsum_processed_loss": 0.08366435021162033, "eval_events-synergy/entsum_processed_runtime": 85.9027, "eval_events-synergy/entsum_processed_samples_per_second": 27.077, "eval_events-synergy/entsum_processed_steps_per_second": 6.775, "step": 14000},
    {"epoch": 2.6944391362507165, "grad_norm": 1.2683295011520386, "learning_rate": 4.34715821812596e-05, "loss": 0.1452, "step": 14100},
    {"epoch": 2.7135486336709347, "grad_norm": 0.9251212477684021, "learning_rate": 4.3423579109062984e-05, "loss": 0.1261, "step": 14200},
    {"epoch": 2.7326581310911524, "grad_norm": 0.2922651171684265, "learning_rate": 4.337557603686636e-05, "loss": 0.1131, "step": 14300},
    {"epoch": 2.75176762851137, "grad_norm": 0.5204638838768005, "learning_rate": 4.332757296466974e-05, "loss": 0.109, "step": 14400},
    {"epoch": 2.770877125931588, "grad_norm": 1.6479909420013428, "learning_rate": 4.327956989247312e-05, "loss": 0.1217, "step": 14500},
    {"epoch": 2.7899866233518056, "grad_norm": 0.693134605884552, "learning_rate": 4.32315668202765e-05, "loss": 0.1259, "step": 14600},
    {"epoch": 2.8090961207720238, "grad_norm": 0.1304488480091095, "learning_rate": 4.318356374807988e-05, "loss": 0.1271, "step": 14700},
    {"epoch": 2.8282056181922415, "grad_norm": 1.7225552797317505, "learning_rate": 4.313556067588326e-05, "loss": 0.1205, "step": 14800},
    {"epoch": 2.847315115612459, "grad_norm": 1.0020724534988403, "learning_rate": 4.308755760368664e-05, "loss": 0.1177, "step": 14900},
    {"epoch": 2.8664246130326774, "grad_norm": 1.1193674802780151, "learning_rate": 4.3039554531490014e-05, "loss": 0.1099, "step": 15000},
    {"epoch": 2.8664246130326774, "eval_events-synergy/entsum_processed_loss": 0.07461204379796982, "eval_events-synergy/entsum_processed_runtime": 86.2815, "eval_events-synergy/entsum_processed_samples_per_second": 26.958, "eval_events-synergy/entsum_processed_steps_per_second": 6.745, "step": 15000},
    {"epoch": 2.885534110452895, "grad_norm": 1.0606135129928589, "learning_rate": 4.299155145929339e-05, "loss": 0.1193, "step": 15100},
    {"epoch": 2.904643607873113, "grad_norm": 0.7155654430389404, "learning_rate": 4.294354838709678e-05, "loss": 0.1064, "step": 15200},
    {"epoch": 2.923753105293331, "grad_norm": 0.7445634007453918, "learning_rate": 4.289554531490016e-05, "loss": 0.1343, "step": 15300},
    {"epoch": 2.9428626027135487, "grad_norm": 2.339707612991333, "learning_rate": 4.2847542242703536e-05, "loss": 0.1443, "step": 15400},
    {"epoch": 2.9619721001337664, "grad_norm": 1.8104724884033203, "learning_rate": 4.2799539170506915e-05, "loss": 0.101, "step": 15500},
    {"epoch": 2.981081597553984, "grad_norm": 1.3034611940383911, "learning_rate": 4.2751536098310294e-05, "loss": 0.1061, "step": 15600},
    {"epoch": 3.0001910949742023, "grad_norm": 0.9755160212516785, "learning_rate": 4.270353302611367e-05, "loss": 0.1145, "step": 15700},
    {"epoch": 3.01930059239442, "grad_norm": 1.1601845026016235, "learning_rate": 4.265552995391705e-05, "loss": 0.0902, "step": 15800},
    {"epoch": 3.038410089814638, "grad_norm": 0.6864597201347351, "learning_rate": 4.260752688172043e-05, "loss": 0.1238, "step": 15900},
    {"epoch": 3.0575195872348555, "grad_norm": 1.6638556718826294, "learning_rate": 4.255952380952381e-05, "loss": 0.0891, "step": 16000},
    {"epoch": 3.0575195872348555, "eval_events-synergy/entsum_processed_loss": 0.06794700771570206, "eval_events-synergy/entsum_processed_runtime": 86.5192, "eval_events-synergy/entsum_processed_samples_per_second": 26.884, "eval_events-synergy/entsum_processed_steps_per_second": 6.727, "step": 16000},
    {"epoch": 3.0766290846550737, "grad_norm": 0.6231021285057068, "learning_rate": 4.2511520737327194e-05, "loss": 0.1146, "step": 16100},
    {"epoch": 3.0957385820752914, "grad_norm": 0.8148559927940369, "learning_rate": 4.246351766513057e-05, "loss": 0.0907, "step": 16200},
    {"epoch": 3.114848079495509, "grad_norm": 0.6905009150505066, "learning_rate": 4.241551459293395e-05, "loss": 0.0984, "step": 16300},
    {"epoch": 3.1339575769157273, "grad_norm": 0.5867490768432617, "learning_rate": 4.236751152073733e-05, "loss": 0.105, "step": 16400},
    {"epoch": 3.153067074335945, "grad_norm": 1.3492326736450195, "learning_rate": 4.231950844854071e-05, "loss": 0.0838, "step": 16500},
    {"epoch": 3.1721765717561627, "grad_norm": 1.4845542907714844, "learning_rate": 4.227150537634409e-05, "loss": 0.0988, "step": 16600},
    {"epoch": 3.1912860691763805, "grad_norm": 0.6312690377235413, "learning_rate": 4.2223502304147466e-05, "loss": 0.0831, "step": 16700},
    {"epoch": 3.2103955665965986, "grad_norm": 1.1302180290222168, "learning_rate": 4.217549923195085e-05, "loss": 0.0818, "step": 16800},
    {"epoch": 3.2295050640168164, "grad_norm": 0.8319332003593445, "learning_rate": 4.2127496159754224e-05, "loss": 0.0874, "step": 16900},
    {"epoch": 3.248614561437034, "grad_norm": 1.3741902112960815, "learning_rate": 4.20794930875576e-05, "loss": 0.081, "step": 17000},
    {"epoch": 3.248614561437034, "eval_events-synergy/entsum_processed_loss": 0.06280920654535294, "eval_events-synergy/entsum_processed_runtime": 86.681, "eval_events-synergy/entsum_processed_samples_per_second": 26.834, "eval_events-synergy/entsum_processed_steps_per_second": 6.714, "step": 17000},
    {"epoch": 3.267724058857252, "grad_norm": 1.1995327472686768, "learning_rate": 4.203149001536099e-05, "loss": 0.097, "step": 17100},
    {"epoch": 3.28683355627747, "grad_norm": 0.8017995357513428, "learning_rate": 4.198348694316436e-05, "loss": 0.0873, "step": 17200},
    {"epoch": 3.3059430536976877, "grad_norm": 0.2565690875053406, "learning_rate": 4.1935483870967746e-05, "loss": 0.1096, "step": 17300},
    {"epoch": 3.3250525511179054, "grad_norm": 0.5259280204772949, "learning_rate": 4.1887480798771124e-05, "loss": 0.0853, "step": 17400},
    {"epoch": 3.3441620485381236, "grad_norm": 1.4056413173675537, "learning_rate": 4.18394777265745e-05, "loss": 0.0977, "step": 17500},
    {"epoch": 3.3632715459583413, "grad_norm": 0.9580443501472473, "learning_rate": 4.179147465437788e-05, "loss": 0.0895, "step": 17600},
    {"epoch": 3.382381043378559, "grad_norm": 0.5705116391181946, "learning_rate": 4.174347158218126e-05, "loss": 0.0937, "step": 17700},
    {"epoch": 3.401490540798777, "grad_norm": 0.7241528630256653, "learning_rate": 4.169546850998464e-05, "loss": 0.0952, "step": 17800},
    {"epoch": 3.420600038218995, "grad_norm": 0.5498321652412415, "learning_rate": 4.164746543778802e-05, "loss": 0.0827, "step": 17900},
    {"epoch": 3.4397095356392127, "grad_norm": 0.7606751918792725, "learning_rate": 4.1599462365591404e-05, "loss": 0.0755, "step": 18000},
    {"epoch": 3.4397095356392127, "eval_events-synergy/entsum_processed_loss": 0.05818604305386543, "eval_events-synergy/entsum_processed_runtime": 86.615, "eval_events-synergy/entsum_processed_samples_per_second": 26.854, "eval_events-synergy/entsum_processed_steps_per_second": 6.719, "step": 18000},
    {"epoch": 3.4588190330594304, "grad_norm": 1.3093830347061157, "learning_rate": 4.1551459293394776e-05, "loss": 0.0863, "step": 18100},
    {"epoch": 3.4779285304796486, "grad_norm": 0.3217012286186218, "learning_rate": 4.1503456221198154e-05, "loss": 0.093, "step": 18200},
    {"epoch": 3.4970380278998663, "grad_norm": 2.5801539421081543, "learning_rate": 4.145545314900154e-05, "loss": 0.0735, "step": 18300},
    {"epoch": 3.516147525320084, "grad_norm": 1.4098963737487793, "learning_rate": 4.140745007680492e-05, "loss": 0.0632, "step": 18400},
    {"epoch": 3.5352570227403017, "grad_norm": 2.017245054244995, "learning_rate": 4.13594470046083e-05, "loss": 0.0771, "step": 18500},
    {"epoch": 3.55436652016052, "grad_norm": 0.4717109203338623, "learning_rate": 4.1311443932411676e-05, "loss": 0.0806, "step": 18600},
    {"epoch": 3.5734760175807376, "grad_norm": 1.616327166557312, "learning_rate": 4.1263440860215055e-05, "loss": 0.0702, "step": 18700},
    {"epoch": 3.5925855150009554, "grad_norm": 0.4756339192390442, "learning_rate": 4.1215437788018434e-05, "loss": 0.0893, "step": 18800},
    {"epoch": 3.6116950124211735, "grad_norm": 0.8106458783149719, "learning_rate": 4.116743471582181e-05, "loss": 0.0608, "step": 18900},
    {"epoch": 3.6308045098413912, "grad_norm": 0.7149679660797119, "learning_rate": 4.11194316436252e-05, "loss": 0.083, "step": 19000},
    {"epoch": 3.6308045098413912, "eval_events-synergy/entsum_processed_loss": 0.053829826414585114, "eval_events-synergy/entsum_processed_runtime": 86.5846, "eval_events-synergy/entsum_processed_samples_per_second": 26.864, "eval_events-synergy/entsum_processed_steps_per_second": 6.722, "step": 19000},
    {"epoch": 3.649914007261609, "grad_norm": 0.7926931381225586, "learning_rate": 4.107142857142857e-05, "loss": 0.0849, "step": 19100},
    {"epoch": 3.669023504681827, "grad_norm": 0.5450174808502197, "learning_rate": 4.1023425499231955e-05, "loss": 0.0651, "step": 19200},
    {"epoch": 3.688133002102045, "grad_norm": 1.2390661239624023, "learning_rate": 4.0975422427035334e-05, "loss": 0.0768, "step": 19300},
    {"epoch": 3.7072424995222626, "grad_norm": 1.3439677953720093, "learning_rate": 4.092741935483871e-05, "loss": 0.1014, "step": 19400},
    {"epoch": 3.7263519969424803, "grad_norm": 0.7183663845062256, "learning_rate": 4.087941628264209e-05, "loss": 0.082, "step": 19500},
    {"epoch": 3.745461494362698, "grad_norm": 1.6611928939819336, "learning_rate": 4.083141321044547e-05, "loss": 0.0732, "step": 19600},
    {"epoch": 3.764570991782916, "grad_norm": 1.2751322984695435, "learning_rate": 4.078341013824885e-05, "loss": 0.0768, "step": 19700},
    {"epoch": 3.783680489203134, "grad_norm": 1.3333556652069092, "learning_rate": 4.073540706605223e-05, "loss": 0.0794, "step": 19800},
    {"epoch": 3.8027899866233517, "grad_norm": 0.6069827675819397, "learning_rate": 4.068740399385561e-05, "loss": 0.0846, "step": 19900},
    {"epoch": 3.82189948404357, "grad_norm": 0.38667362928390503, "learning_rate": 4.0639400921658985e-05, "loss": 0.0894, "step": 20000},
    {"epoch": 3.82189948404357, "eval_events-synergy/entsum_processed_loss": 0.04852017015218735, "eval_events-synergy/entsum_processed_runtime": 86.5692, "eval_events-synergy/entsum_processed_samples_per_second": 26.869, "eval_events-synergy/entsum_processed_steps_per_second": 6.723, "step": 20000},
    {"epoch": 3.8410089814637876, "grad_norm": 1.1866488456726074, "learning_rate": 4.0591397849462364e-05, "loss": 0.055, "step": 20100},
    {"epoch": 3.8601184788840053, "grad_norm": 1.6856223344802856, "learning_rate": 4.054339477726575e-05, "loss": 0.0713, "step": 20200},
    {"epoch": 3.8792279763042234, "grad_norm": 0.5651134252548218, "learning_rate": 4.049539170506912e-05, "loss": 0.075, "step": 20300},
    {"epoch": 3.898337473724441, "grad_norm": 0.9011269807815552, "learning_rate": 4.044738863287251e-05, "loss": 0.061, "step": 20400},
    {"epoch": 3.917446971144659, "grad_norm": 1.215333104133606, "learning_rate": 4.0399385560675886e-05, "loss": 0.0704, "step": 20500},
    {"epoch": 3.9365564685648766, "grad_norm": 0.8905503749847412, "learning_rate": 4.0351382488479264e-05, "loss": 0.0804, "step": 20600},
    {"epoch": 3.9556659659850943, "grad_norm": 0.3945884108543396, "learning_rate": 4.030337941628264e-05, "loss": 0.0731, "step": 20700},
    {"epoch": 3.9747754634053125, "grad_norm": 1.3491140604019165, "learning_rate": 4.025537634408602e-05, "loss": 0.0811, "step": 20800},
    {"epoch": 3.9938849608255302, "grad_norm": 0.6310815811157227, "learning_rate": 4.02073732718894e-05, "loss": 0.0609, "step": 20900},
    {"epoch": 4.012994458245748, "grad_norm": 0.8954095840454102, "learning_rate": 4.015937019969278e-05, "loss": 0.0605, "step": 21000},
    {"epoch": 4.012994458245748, "eval_events-synergy/entsum_processed_loss": 0.04423968493938446, "eval_events-synergy/entsum_processed_runtime": 85.8578, "eval_events-synergy/entsum_processed_samples_per_second": 27.091, "eval_events-synergy/entsum_processed_steps_per_second": 6.779, "step": 21000},
    {"epoch": 4.032103955665966, "grad_norm": 0.7362313866615295, "learning_rate": 4.0111367127496165e-05, "loss": 0.0473, "step": 21100},
    {"epoch": 4.051213453086183, "grad_norm": 0.45443108677864075, "learning_rate": 4.0063364055299544e-05, "loss": 0.0526, "step": 21200},
    {"epoch": 4.070322950506402, "grad_norm": 0.5123448371887207, "learning_rate": 4.001536098310292e-05, "loss": 0.0604, "step": 21300},
    {"epoch": 4.08943244792662, "grad_norm": 1.018021583557129, "learning_rate": 3.99673579109063e-05, "loss": 0.0629, "step": 21400},
    {"epoch": 4.108541945346837, "grad_norm": 1.2121423482894897, "learning_rate": 3.991935483870968e-05, "loss": 0.0554, "step": 21500},
    {"epoch": 4.127651442767055, "grad_norm": 1.4117833375930786, "learning_rate": 3.987135176651306e-05, "loss": 0.0562, "step": 21600},
    {"epoch": 4.146760940187273, "grad_norm": 0.7586542367935181, "learning_rate": 3.982334869431644e-05, "loss": 0.0604, "step": 21700},
    {"epoch": 4.165870437607491, "grad_norm": 1.4312876462936401, "learning_rate": 3.977534562211982e-05, "loss": 0.0683, "step": 21800},
    {"epoch": 4.184979935027709, "grad_norm": 1.278427243232727, "learning_rate": 3.9727342549923195e-05, "loss": 0.0607, "step": 21900},
    {"epoch": 4.204089432447927, "grad_norm": 2.7680790424346924, "learning_rate": 3.9679339477726574e-05, "loss": 0.0535, "step": 22000},
    {"epoch": 4.204089432447927, "eval_events-synergy/entsum_processed_loss": 0.04234563931822777, "eval_events-synergy/entsum_processed_runtime": 86.4674, "eval_events-synergy/entsum_processed_samples_per_second": 26.9, "eval_events-synergy/entsum_processed_steps_per_second": 6.731, "step": 22000},
    {"epoch": 4.223198929868144, "grad_norm": 1.4007564783096313, "learning_rate": 3.963133640552996e-05, "loss": 0.0715, "step": 22100},
    {"epoch": 4.242308427288362, "grad_norm": 0.6837427020072937, "learning_rate": 3.958333333333333e-05, "loss": 0.0611, "step": 22200},
    {"epoch": 4.26141792470858, "grad_norm": 1.1406750679016113, "learning_rate": 3.9535330261136717e-05, "loss": 0.055, "step": 22300},
    {"epoch": 4.280527422128798, "grad_norm": 0.38464096188545227, "learning_rate": 3.9487327188940095e-05, "loss": 0.0562, "step": 22400},
    {"epoch": 4.299636919549016, "grad_norm": 1.4123332500457764, "learning_rate": 3.9439324116743474e-05, "loss": 0.0505, "step": 22500},
    {"epoch": 4.318746416969233, "grad_norm": 0.24340485036373138, "learning_rate": 3.939132104454685e-05, "loss": 0.0642, "step": 22600},
    {"epoch": 4.3378559143894515, "grad_norm": 0.541613757610321, "learning_rate": 3.934331797235023e-05, "loss": 0.059, "step": 22700},
    {"epoch": 4.35696541180967, "grad_norm": 1.4583444595336914, "learning_rate": 3.929531490015361e-05, "loss": 0.0515, "step": 22800},
    {"epoch": 4.376074909229887, "grad_norm": 1.1488178968429565, "learning_rate": 3.924731182795699e-05, "loss": 0.0532, "step": 22900},
    {"epoch": 4.395184406650105, "grad_norm": 1.302866816520691, "learning_rate": 3.9199308755760374e-05, "loss": 0.0515, "step": 23000},
    {"epoch": 4.395184406650105, "eval_events-synergy/entsum_processed_loss": 0.04016019403934479, "eval_events-synergy/entsum_processed_runtime": 86.5369, "eval_events-synergy/entsum_processed_samples_per_second": 26.879, "eval_events-synergy/entsum_processed_steps_per_second": 6.725, "step": 23000},
    {"epoch": 4.414293904070323, "grad_norm": 0.5283303260803223, "learning_rate": 3.9151305683563746e-05, "loss": 0.0534, "step": 23100},
    {"epoch": 4.433403401490541, "grad_norm": 0.9314780831336975, "learning_rate": 3.910330261136713e-05, "loss": 0.05, "step": 23200},
    {"epoch": 4.452512898910759, "grad_norm": 0.44866007566452026, "learning_rate": 3.905529953917051e-05, "loss": 0.0584, "step": 23300},
    {"epoch": 4.471622396330977, "grad_norm": 0.8532125353813171, "learning_rate": 3.900729646697389e-05, "loss": 0.0651, "step": 23400},
    {"epoch": 4.490731893751194, "grad_norm": 2.1083550453186035, "learning_rate": 3.895929339477727e-05, "loss": 0.0498, "step": 23500},
    {"epoch": 4.509841391171412, "grad_norm": 0.4969152808189392, "learning_rate": 3.891129032258065e-05, "loss": 0.0538, "step": 23600},
    {"epoch": 4.5289508885916305, "grad_norm": 0.577306866645813, "learning_rate": 3.8863287250384026e-05, "loss": 0.0388, "step": 23700},
    {"epoch": 4.548060386011848, "grad_norm": 0.08934194594621658, "learning_rate": 3.8815284178187404e-05, "loss": 0.0487, "step": 23800},
    {"epoch": 4.567169883432066, "grad_norm": 1.1038001775741577, "learning_rate": 3.876728110599078e-05, "loss": 0.0558, "step": 23900},
    {"epoch": 4.586279380852283, "grad_norm": 1.019621729850769, "learning_rate": 3.871927803379416e-05, "loss": 0.0659, "step": 24000},
    {"epoch": 4.586279380852283, "eval_events-synergy/entsum_processed_loss": 0.03685642033815384, "eval_events-synergy/entsum_processed_runtime": 84.6028, "eval_events-synergy/entsum_processed_samples_per_second": 27.493, "eval_events-synergy/entsum_processed_steps_per_second": 6.879, "step": 24000},
    {"epoch": 4.605388878272501, "grad_norm": 0.8093920350074768, "learning_rate": 3.867127496159754e-05, "loss": 0.0452, "step": 24100},
    {"epoch": 4.62449837569272, "grad_norm": 1.2581048011779785, "learning_rate": 3.8623271889400926e-05, "loss": 0.0445, "step": 24200},
    {"epoch": 4.643607873112937, "grad_norm": 0.908857524394989, "learning_rate": 3.8575268817204305e-05, "loss": 0.0469, "step": 24300},
    {"epoch": 4.662717370533155, "grad_norm": 1.6877511739730835, "learning_rate": 3.8527265745007684e-05, "loss": 0.062, "step": 24400},
    {"epoch": 4.681826867953372, "grad_norm": 0.47951963543891907, "learning_rate": 3.847926267281106e-05, "loss": 0.0761, "step": 24500},
    {"epoch": 4.7009363653735905, "grad_norm": 0.7642994523048401, "learning_rate": 3.843125960061444e-05, "loss": 0.0534, "step": 24600},
    {"epoch": 4.720045862793809, "grad_norm": 0.6277188062667847, "learning_rate": 3.838325652841782e-05, "loss": 0.0521, "step": 24700},
    {"epoch": 4.739155360214026, "grad_norm": 0.9474277496337891, "learning_rate": 3.83352534562212e-05, "loss": 0.0625, "step": 24800},
    {"epoch": 4.758264857634244, "grad_norm": 0.6333776116371155, "learning_rate": 3.8287250384024584e-05, "loss": 0.0384, "step": 24900},
    {"epoch": 4.777374355054462, "grad_norm": 0.5701904892921448, "learning_rate": 3.8239247311827956e-05, "loss": 0.0574, "step": 25000},
    {"epoch": 4.777374355054462, "eval_events-synergy/entsum_processed_loss": 0.03454362228512764, "eval_events-synergy/entsum_processed_runtime": 86.3766, "eval_events-synergy/entsum_processed_samples_per_second": 26.929, "eval_events-synergy/entsum_processed_steps_per_second": 6.738, "step": 25000},
    {"epoch": 4.79648385247468, "grad_norm": 1.0204541683197021, "learning_rate": 3.819124423963134e-05, "loss": 0.0551, "step": 25100},
    {"epoch": 4.815593349894898, "grad_norm": 1.5141241550445557, "learning_rate": 3.814324116743472e-05, "loss": 0.0529, "step": 25200},
    {"epoch": 4.834702847315116, "grad_norm": 1.2822602987289429, "learning_rate": 3.809523809523809e-05, "loss": 0.0627, "step": 25300},
    {"epoch": 4.853812344735333, "grad_norm": 0.21899272501468658, "learning_rate": 3.804723502304148e-05, "loss": 0.0383, "step": 25400},
    {"epoch": 4.872921842155551, "grad_norm": 0.30676454305648804, "learning_rate": 3.7999231950844856e-05, "loss": 0.0453, "step": 25500},
    {"epoch": 4.8920313395757695, "grad_norm": 0.33828213810920715, "learning_rate": 3.7951228878648235e-05, "loss": 0.0402, "step": 25600},
    {"epoch": 4.911140836995987, "grad_norm": 0.7716867327690125, "learning_rate": 3.7903225806451614e-05, "loss": 0.0523, "step": 25700},
    {"epoch": 4.930250334416205, "grad_norm": 0.31507009267807007, "learning_rate": 3.785522273425499e-05, "loss": 0.0415, "step": 25800},
    {"epoch": 4.949359831836423, "grad_norm": 0.7771691083908081, "learning_rate": 3.780721966205837e-05, "loss": 0.0581, "step": 25900},
    {"epoch": 4.96846932925664, "grad_norm": 1.4588698148727417, "learning_rate": 3.775921658986175e-05, "loss": 0.0501, "step": 26000},
    {"epoch": 4.96846932925664, "eval_events-synergy/entsum_processed_loss": 0.03175923973321915, "eval_events-synergy/entsum_processed_runtime": 85.1461, "eval_events-synergy/entsum_processed_samples_per_second": 27.318, "eval_events-synergy/entsum_processed_steps_per_second": 6.835, "step": 26000},
    {"epoch": 4.987578826676859, "grad_norm": 0.24702726304531097, "learning_rate": 3.7711213517665136e-05, "loss": 0.0488, "step": 26100},
    {"epoch": 5.006688324097076, "grad_norm": 0.3701643645763397, "learning_rate": 3.766321044546851e-05, "loss": 0.0422, "step": 26200},
    {"epoch": 5.025797821517294, "grad_norm": 0.2334865927696228, "learning_rate": 3.761520737327189e-05, "loss": 0.0369, "step": 26300},
    {"epoch": 5.044907318937512, "grad_norm": 0.4860515892505646, "learning_rate": 3.756720430107527e-05, "loss": 0.0467, "step": 26400},
    {"epoch": 5.0640168163577295, "grad_norm": 1.0469186305999756, "learning_rate": 3.751920122887865e-05, "loss": 0.0388, "step": 26500},
    {"epoch": 5.083126313777948, "grad_norm": 0.10929598659276962, "learning_rate": 3.747119815668203e-05, "loss": 0.039, "step": 26600},
    {"epoch": 5.102235811198166, "grad_norm": 0.9259900450706482, "learning_rate": 3.742319508448541e-05, "loss": 0.0537, "step": 26700},
    {"epoch": 5.121345308618383, "grad_norm": 0.6803724765777588, "learning_rate": 3.737519201228879e-05, "loss": 0.0429, "step": 26800},
    {"epoch": 5.140454806038601, "grad_norm": 0.35482415556907654, "learning_rate": 3.7327188940092166e-05, "loss": 0.0454, "step": 26900},
    {"epoch": 5.1595643034588194, "grad_norm": 0.4932660162448883, "learning_rate": 3.727918586789555e-05, "loss": 0.0389, "step": 27000},
    {"epoch": 5.1595643034588194, "eval_events-synergy/entsum_processed_loss": 0.0308841560035944, "eval_events-synergy/entsum_processed_runtime": 86.4654, "eval_events-synergy/entsum_processed_samples_per_second": 26.901, "eval_events-synergy/entsum_processed_steps_per_second": 6.731, "step": 27000},
    {"epoch": 5.178673800879037, "grad_norm": 0.7944111824035645, "learning_rate": 3.723118279569893e-05, "loss": 0.0367, "step": 27100},
    {"epoch": 5.197783298299255, "grad_norm": 0.3408133387565613, "learning_rate": 3.71831797235023e-05, "loss": 0.0412, "step": 27200},
    {"epoch": 5.216892795719472, "grad_norm": 1.3057624101638794, "learning_rate": 3.713517665130569e-05, "loss": 0.0313, "step": 27300},
    {"epoch": 5.23600229313969, "grad_norm": 0.28078633546829224, "learning_rate": 3.7087173579109066e-05, "loss": 0.0432, "step": 27400},
    {"epoch": 5.2551117905599085, "grad_norm": 0.3789531886577606, "learning_rate": 3.7039170506912445e-05, "loss": 0.0415, "step": 27500},
    {"epoch": 5.274221287980126, "grad_norm": 0.5109480619430542, "learning_rate": 3.6991167434715824e-05, "loss": 0.0328, "step": 27600},
    {"epoch": 5.293330785400344, "grad_norm": 0.010233950801193714, "learning_rate": 3.69431643625192e-05, "loss": 0.0399, "step": 27700},
    {"epoch": 5.312440282820562, "grad_norm": 0.4560386538505554, "learning_rate": 3.689516129032258e-05, "loss": 0.0387, "step": 27800},
    {"epoch": 5.331549780240779, "grad_norm": 0.1968875378370285, "learning_rate": 3.684715821812596e-05, "loss": 0.0371, "step": 27900},
    {"epoch": 5.350659277660998, "grad_norm": 0.8004895448684692, "learning_rate": 3.6799155145929345e-05, "loss": 0.0437, "step": 28000},
    {"epoch": 5.350659277660998, "eval_events-synergy/entsum_processed_loss": 0.029284106567502022, "eval_events-synergy/entsum_processed_runtime": 85.8087, "eval_events-synergy/entsum_processed_samples_per_second": 27.107, "eval_events-synergy/entsum_processed_steps_per_second": 6.783, "step": 28000},
    {"epoch": 5.369768775081216, "grad_norm": 0.9914371371269226, "learning_rate": 3.675115207373272e-05, "loss": 0.0352, "step": 28100},
    {"epoch": 5.388878272501433, "grad_norm": 1.3968762159347534, "learning_rate": 3.67031490015361e-05, "loss": 0.0305, "step": 28200},
    {"epoch": 5.407987769921651, "grad_norm": 0.677922785282135, "learning_rate": 3.665514592933948e-05, "loss": 0.0321, "step": 28300},
    {"epoch": 5.427097267341869, "grad_norm": 0.5966954231262207, "learning_rate": 3.6607142857142853e-05, "loss": 0.0356, "step": 28400},
    {"epoch": 5.446206764762087, "grad_norm": 0.47409358620643616, "learning_rate": 3.655913978494624e-05, "loss": 0.0281, "step": 28500},
    {"epoch": 5.465316262182305, "grad_norm": 0.5637366771697998, "learning_rate": 3.651113671274962e-05, "loss": 0.0374, "step": 28600},
    {"epoch": 5.484425759602522, "grad_norm": 0.5510061383247375, "learning_rate": 3.6463133640552996e-05, "loss": 0.0321, "step": 28700},
    {"epoch": 5.50353525702274, "grad_norm": 0.7010207176208496, "learning_rate": 3.6415130568356375e-05, "loss": 0.0368, "step": 28800},
    {"epoch": 5.522644754442958, "grad_norm": 1.2387152910232544, "learning_rate": 3.636712749615976e-05, "loss": 0.0397, "step": 28900},
    {"epoch": 5.541754251863176, "grad_norm": 1.7459216117858887, "learning_rate": 3.631912442396313e-05, "loss": 0.0386, "step": 29000},
    {"epoch": 5.541754251863176, "eval_events-synergy/entsum_processed_loss": 0.02899482287466526, "eval_events-synergy/entsum_processed_runtime": 84.3392, "eval_events-synergy/entsum_processed_samples_per_second": 27.579, "eval_events-synergy/entsum_processed_steps_per_second": 6.901, "step": 29000},
    {"epoch": 5.560863749283394, "grad_norm": 0.6622141003608704, "learning_rate": 3.627112135176651e-05, "loss": 0.0368, "step": 29100},
    {"epoch": 5.579973246703612, "grad_norm": 1.0212815999984741, "learning_rate": 3.62231182795699e-05, "loss": 0.0373, "step": 29200},
    {"epoch": 5.599082744123829, "grad_norm": 0.4521411657333374, "learning_rate": 3.6175115207373276e-05, "loss": 0.0365, "step": 29300},
    {"epoch": 5.6181922415440475, "grad_norm": 0.8316215872764587, "learning_rate": 3.6127112135176654e-05, "loss": 0.0401, "step": 29400},
    {"epoch": 5.637301738964265, "grad_norm": 0.3077566623687744, "learning_rate": 3.607910906298003e-05, "loss": 0.044, "step": 29500},
    {"epoch": 5.656411236384483, "grad_norm": 1.1517149209976196, "learning_rate": 3.603110599078341e-05, "loss": 0.0355, "step": 29600},
    {"epoch": 5.675520733804701, "grad_norm": 1.3990229368209839, "learning_rate": 3.598310291858679e-05, "loss": 0.0349, "step": 29700},
    {"epoch": 5.694630231224918, "grad_norm": 1.5779931545257568, "learning_rate": 3.593509984639017e-05, "loss": 0.0393, "step": 29800},
    {"epoch": 5.713739728645137, "grad_norm": 1.200478434562683, "learning_rate": 3.5887096774193555e-05, "loss": 0.0333, "step": 29900},
    {"epoch": 5.732849226065355, "grad_norm": 1.5373812913894653, "learning_rate": 3.583909370199693e-05, "loss": 0.0403, "step": 30000},
    {"epoch": 5.732849226065355, "eval_events-synergy/entsum_processed_loss": 0.02680058218538761, "eval_events-synergy/entsum_processed_runtime": 86.269, "eval_events-synergy/entsum_processed_samples_per_second": 26.962, "eval_events-synergy/entsum_processed_steps_per_second": 6.746, "step": 30000},
    {"epoch": 5.751958723485572, "grad_norm": 0.24801170825958252, "learning_rate": 3.579109062980031e-05, "loss": 0.0383, "step": 30100},
    {"epoch": 5.77106822090579, "grad_norm": 1.638857126235962, "learning_rate": 3.574308755760369e-05, "loss": 0.0466, "step": 30200},
    {"epoch": 5.790177718326008, "grad_norm": 0.6621894836425781, "learning_rate": 3.569508448540706e-05, "loss": 0.0332, "step": 30300},
    {"epoch": 5.809287215746226, "grad_norm": 0.7041098475456238, "learning_rate": 3.564708141321045e-05, "loss": 0.0252, "step": 30400},
    {"epoch": 5.828396713166444, "grad_norm": 0.1067354753613472, "learning_rate": 3.559907834101383e-05, "loss": 0.029, "step": 30500},
    {"epoch": 5.847506210586662, "grad_norm": 0.8987241387367249, "learning_rate": 3.5551075268817206e-05, "loss": 0.039,
"step": 30600 |
|
}, |
|
{ |
|
"epoch": 5.866615708006879, |
|
"grad_norm": 2.081352710723877, |
|
"learning_rate": 3.5503072196620585e-05, |
|
"loss": 0.033, |
|
"step": 30700 |
|
}, |
|
{ |
|
"epoch": 5.885725205427097, |
|
"grad_norm": 1.5779916048049927, |
|
"learning_rate": 3.545506912442397e-05, |
|
"loss": 0.0337, |
|
"step": 30800 |
|
}, |
|
{ |
|
"epoch": 5.904834702847316, |
|
"grad_norm": 0.8174372911453247, |
|
"learning_rate": 3.540706605222734e-05, |
|
"loss": 0.0355, |
|
"step": 30900 |
|
}, |
|
{ |
|
"epoch": 5.923944200267533, |
|
"grad_norm": 0.35382506251335144, |
|
"learning_rate": 3.535906298003072e-05, |
|
"loss": 0.0477, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 5.923944200267533, |
|
"eval_events-synergy/entsum_processed_loss": 0.025150079280138016, |
|
"eval_events-synergy/entsum_processed_runtime": 86.0706, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 27.024, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.762, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 5.943053697687751, |
|
"grad_norm": 2.253109931945801, |
|
"learning_rate": 3.5311059907834107e-05, |
|
"loss": 0.0349, |
|
"step": 31100 |
|
}, |
|
{ |
|
"epoch": 5.962163195107968, |
|
"grad_norm": 0.9015364646911621, |
|
"learning_rate": 3.526305683563748e-05, |
|
"loss": 0.0284, |
|
"step": 31200 |
|
}, |
|
{ |
|
"epoch": 5.9812726925281865, |
|
"grad_norm": 0.24150536954402924, |
|
"learning_rate": 3.5215053763440864e-05, |
|
"loss": 0.0379, |
|
"step": 31300 |
|
}, |
|
{ |
|
"epoch": 6.000382189948405, |
|
"grad_norm": 0.7606573700904846, |
|
"learning_rate": 3.516705069124424e-05, |
|
"loss": 0.0285, |
|
"step": 31400 |
|
}, |
|
{ |
|
"epoch": 6.019491687368622, |
|
"grad_norm": 0.6848638653755188, |
|
"learning_rate": 3.511904761904762e-05, |
|
"loss": 0.0301, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 6.03860118478884, |
|
"grad_norm": 1.088733196258545, |
|
"learning_rate": 3.5071044546851e-05, |
|
"loss": 0.0265, |
|
"step": 31600 |
|
}, |
|
{ |
|
"epoch": 6.057710682209058, |
|
"grad_norm": 1.046855092048645, |
|
"learning_rate": 3.502304147465438e-05, |
|
"loss": 0.0269, |
|
"step": 31700 |
|
}, |
|
{ |
|
"epoch": 6.076820179629276, |
|
"grad_norm": 0.9664869904518127, |
|
"learning_rate": 3.497503840245776e-05, |
|
"loss": 0.0234, |
|
"step": 31800 |
|
}, |
|
{ |
|
"epoch": 6.095929677049494, |
|
"grad_norm": 0.5576639175415039, |
|
"learning_rate": 3.4927035330261136e-05, |
|
"loss": 0.0332, |
|
"step": 31900 |
|
}, |
|
{ |
|
"epoch": 6.115039174469711, |
|
"grad_norm": 0.839173436164856, |
|
"learning_rate": 3.487903225806452e-05, |
|
"loss": 0.0301, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 6.115039174469711, |
|
"eval_events-synergy/entsum_processed_loss": 0.02411143109202385, |
|
"eval_events-synergy/entsum_processed_runtime": 86.4163, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.916, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.735, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 6.134148671889929, |
|
"grad_norm": 0.783420741558075, |
|
"learning_rate": 3.4831029185867894e-05, |
|
"loss": 0.0326, |
|
"step": 32100 |
|
}, |
|
{ |
|
"epoch": 6.153258169310147, |
|
"grad_norm": 0.7571488618850708, |
|
"learning_rate": 3.478302611367127e-05, |
|
"loss": 0.0266, |
|
"step": 32200 |
|
}, |
|
{ |
|
"epoch": 6.172367666730365, |
|
"grad_norm": 0.08053430169820786, |
|
"learning_rate": 3.473502304147466e-05, |
|
"loss": 0.0278, |
|
"step": 32300 |
|
}, |
|
{ |
|
"epoch": 6.191477164150583, |
|
"grad_norm": 1.4743053913116455, |
|
"learning_rate": 3.468701996927804e-05, |
|
"loss": 0.0256, |
|
"step": 32400 |
|
}, |
|
{ |
|
"epoch": 6.210586661570801, |
|
"grad_norm": 0.20271699130535126, |
|
"learning_rate": 3.4639016897081416e-05, |
|
"loss": 0.0298, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 6.229696158991018, |
|
"grad_norm": 1.0754730701446533, |
|
"learning_rate": 3.4591013824884794e-05, |
|
"loss": 0.0253, |
|
"step": 32600 |
|
}, |
|
{ |
|
"epoch": 6.248805656411236, |
|
"grad_norm": 1.5914433002471924, |
|
"learning_rate": 3.454301075268817e-05, |
|
"loss": 0.0354, |
|
"step": 32700 |
|
}, |
|
{ |
|
"epoch": 6.267915153831455, |
|
"grad_norm": 0.37541651725769043, |
|
"learning_rate": 3.449500768049155e-05, |
|
"loss": 0.0274, |
|
"step": 32800 |
|
}, |
|
{ |
|
"epoch": 6.287024651251672, |
|
"grad_norm": 0.6610062718391418, |
|
"learning_rate": 3.444700460829493e-05, |
|
"loss": 0.0308, |
|
"step": 32900 |
|
}, |
|
{ |
|
"epoch": 6.30613414867189, |
|
"grad_norm": 0.7302411198616028, |
|
"learning_rate": 3.4399001536098316e-05, |
|
"loss": 0.0281, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 6.30613414867189, |
|
"eval_events-synergy/entsum_processed_loss": 0.023598061874508858, |
|
"eval_events-synergy/entsum_processed_runtime": 86.4223, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.914, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.734, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 6.325243646092108, |
|
"grad_norm": 1.2536879777908325, |
|
"learning_rate": 3.435099846390169e-05, |
|
"loss": 0.0292, |
|
"step": 33100 |
|
}, |
|
{ |
|
"epoch": 6.3443531435123255, |
|
"grad_norm": 0.050880610942840576, |
|
"learning_rate": 3.4302995391705074e-05, |
|
"loss": 0.023, |
|
"step": 33200 |
|
}, |
|
{ |
|
"epoch": 6.363462640932544, |
|
"grad_norm": 0.07612816989421844, |
|
"learning_rate": 3.425499231950845e-05, |
|
"loss": 0.027, |
|
"step": 33300 |
|
}, |
|
{ |
|
"epoch": 6.382572138352761, |
|
"grad_norm": 0.2280048131942749, |
|
"learning_rate": 3.4206989247311824e-05, |
|
"loss": 0.031, |
|
"step": 33400 |
|
}, |
|
{ |
|
"epoch": 6.401681635772979, |
|
"grad_norm": 0.9767467975616455, |
|
"learning_rate": 3.415898617511521e-05, |
|
"loss": 0.029, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 6.420791133193197, |
|
"grad_norm": 1.0423424243927002, |
|
"learning_rate": 3.411098310291859e-05, |
|
"loss": 0.0244, |
|
"step": 33600 |
|
}, |
|
{ |
|
"epoch": 6.439900630613415, |
|
"grad_norm": 1.2598810195922852, |
|
"learning_rate": 3.406298003072197e-05, |
|
"loss": 0.0293, |
|
"step": 33700 |
|
}, |
|
{ |
|
"epoch": 6.459010128033633, |
|
"grad_norm": 0.1793285310268402, |
|
"learning_rate": 3.4014976958525346e-05, |
|
"loss": 0.0294, |
|
"step": 33800 |
|
}, |
|
{ |
|
"epoch": 6.478119625453851, |
|
"grad_norm": 0.011976735666394234, |
|
"learning_rate": 3.396697388632873e-05, |
|
"loss": 0.0276, |
|
"step": 33900 |
|
}, |
|
{ |
|
"epoch": 6.497229122874068, |
|
"grad_norm": 1.0099623203277588, |
|
"learning_rate": 3.3918970814132103e-05, |
|
"loss": 0.0351, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 6.497229122874068, |
|
"eval_events-synergy/entsum_processed_loss": 0.02258702553808689, |
|
"eval_events-synergy/entsum_processed_runtime": 86.2714, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.961, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.746, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 6.516338620294286, |
|
"grad_norm": 0.6294183135032654, |
|
"learning_rate": 3.387096774193548e-05, |
|
"loss": 0.0236, |
|
"step": 34100 |
|
}, |
|
{ |
|
"epoch": 6.535448117714504, |
|
"grad_norm": 0.5773456692695618, |
|
"learning_rate": 3.382296466973887e-05, |
|
"loss": 0.024, |
|
"step": 34200 |
|
}, |
|
{ |
|
"epoch": 6.554557615134722, |
|
"grad_norm": 0.7001476883888245, |
|
"learning_rate": 3.377496159754224e-05, |
|
"loss": 0.03, |
|
"step": 34300 |
|
}, |
|
{ |
|
"epoch": 6.57366711255494, |
|
"grad_norm": 1.5043959617614746, |
|
"learning_rate": 3.3726958525345625e-05, |
|
"loss": 0.023, |
|
"step": 34400 |
|
}, |
|
{ |
|
"epoch": 6.592776609975157, |
|
"grad_norm": 0.37344950437545776, |
|
"learning_rate": 3.3678955453149004e-05, |
|
"loss": 0.0383, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 6.611886107395375, |
|
"grad_norm": 0.7408236265182495, |
|
"learning_rate": 3.363095238095238e-05, |
|
"loss": 0.0253, |
|
"step": 34600 |
|
}, |
|
{ |
|
"epoch": 6.630995604815594, |
|
"grad_norm": 0.08738729357719421, |
|
"learning_rate": 3.358294930875576e-05, |
|
"loss": 0.023, |
|
"step": 34700 |
|
}, |
|
{ |
|
"epoch": 6.650105102235811, |
|
"grad_norm": 0.45454615354537964, |
|
"learning_rate": 3.353494623655914e-05, |
|
"loss": 0.025, |
|
"step": 34800 |
|
}, |
|
{ |
|
"epoch": 6.669214599656029, |
|
"grad_norm": 0.4266088008880615, |
|
"learning_rate": 3.348694316436252e-05, |
|
"loss": 0.0251, |
|
"step": 34900 |
|
}, |
|
{ |
|
"epoch": 6.688324097076247, |
|
"grad_norm": 0.44192788004875183, |
|
"learning_rate": 3.34389400921659e-05, |
|
"loss": 0.0349, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 6.688324097076247, |
|
"eval_events-synergy/entsum_processed_loss": 0.022197816520929337, |
|
"eval_events-synergy/entsum_processed_runtime": 86.47, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.899, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.731, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 6.7074335944964645, |
|
"grad_norm": 1.1022039651870728, |
|
"learning_rate": 3.339093701996928e-05, |
|
"loss": 0.0291, |
|
"step": 35100 |
|
}, |
|
{ |
|
"epoch": 6.726543091916683, |
|
"grad_norm": 0.7178447842597961, |
|
"learning_rate": 3.334293394777266e-05, |
|
"loss": 0.0252, |
|
"step": 35200 |
|
}, |
|
{ |
|
"epoch": 6.745652589336901, |
|
"grad_norm": 0.20827040076255798, |
|
"learning_rate": 3.3294930875576034e-05, |
|
"loss": 0.026, |
|
"step": 35300 |
|
}, |
|
{ |
|
"epoch": 6.764762086757118, |
|
"grad_norm": 0.044019654393196106, |
|
"learning_rate": 3.324692780337942e-05, |
|
"loss": 0.018, |
|
"step": 35400 |
|
}, |
|
{ |
|
"epoch": 6.783871584177336, |
|
"grad_norm": 0.4710674285888672, |
|
"learning_rate": 3.31989247311828e-05, |
|
"loss": 0.0275, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 6.802981081597554, |
|
"grad_norm": 1.3574426174163818, |
|
"learning_rate": 3.315092165898618e-05, |
|
"loss": 0.0245, |
|
"step": 35600 |
|
}, |
|
{ |
|
"epoch": 6.822090579017772, |
|
"grad_norm": 0.2804085612297058, |
|
"learning_rate": 3.3102918586789556e-05, |
|
"loss": 0.0207, |
|
"step": 35700 |
|
}, |
|
{ |
|
"epoch": 6.84120007643799, |
|
"grad_norm": 0.16950590908527374, |
|
"learning_rate": 3.305491551459294e-05, |
|
"loss": 0.028, |
|
"step": 35800 |
|
}, |
|
{ |
|
"epoch": 6.860309573858207, |
|
"grad_norm": 0.38544100522994995, |
|
"learning_rate": 3.300691244239631e-05, |
|
"loss": 0.0369, |
|
"step": 35900 |
|
}, |
|
{ |
|
"epoch": 6.879419071278425, |
|
"grad_norm": 0.36500322818756104, |
|
"learning_rate": 3.295890937019969e-05, |
|
"loss": 0.0255, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 6.879419071278425, |
|
"eval_events-synergy/entsum_processed_loss": 0.021442364901304245, |
|
"eval_events-synergy/entsum_processed_runtime": 85.1296, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 27.323, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.837, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 6.8985285686986435, |
|
"grad_norm": 0.8930972218513489, |
|
"learning_rate": 3.291090629800308e-05, |
|
"loss": 0.023, |
|
"step": 36100 |
|
}, |
|
{ |
|
"epoch": 6.917638066118861, |
|
"grad_norm": 0.02050691284239292, |
|
"learning_rate": 3.286290322580645e-05, |
|
"loss": 0.0213, |
|
"step": 36200 |
|
}, |
|
{ |
|
"epoch": 6.936747563539079, |
|
"grad_norm": 0.268373042345047, |
|
"learning_rate": 3.2814900153609835e-05, |
|
"loss": 0.0211, |
|
"step": 36300 |
|
}, |
|
{ |
|
"epoch": 6.955857060959297, |
|
"grad_norm": 0.4497075080871582, |
|
"learning_rate": 3.2766897081413214e-05, |
|
"loss": 0.0275, |
|
"step": 36400 |
|
}, |
|
{ |
|
"epoch": 6.974966558379514, |
|
"grad_norm": 0.024803461506962776, |
|
"learning_rate": 3.271889400921659e-05, |
|
"loss": 0.0259, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 6.994076055799733, |
|
"grad_norm": 0.030653996393084526, |
|
"learning_rate": 3.267089093701997e-05, |
|
"loss": 0.0225, |
|
"step": 36600 |
|
}, |
|
{ |
|
"epoch": 7.013185553219951, |
|
"grad_norm": 0.3798658549785614, |
|
"learning_rate": 3.262288786482335e-05, |
|
"loss": 0.0246, |
|
"step": 36700 |
|
}, |
|
{ |
|
"epoch": 7.032295050640168, |
|
"grad_norm": 0.9082667231559753, |
|
"learning_rate": 3.257488479262673e-05, |
|
"loss": 0.0217, |
|
"step": 36800 |
|
}, |
|
{ |
|
"epoch": 7.051404548060386, |
|
"grad_norm": 0.7902773022651672, |
|
"learning_rate": 3.252688172043011e-05, |
|
"loss": 0.02, |
|
"step": 36900 |
|
}, |
|
{ |
|
"epoch": 7.0705140454806035, |
|
"grad_norm": 0.6506838202476501, |
|
"learning_rate": 3.247887864823349e-05, |
|
"loss": 0.0185, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 7.0705140454806035, |
|
"eval_events-synergy/entsum_processed_loss": 0.021531905978918076, |
|
"eval_events-synergy/entsum_processed_runtime": 85.174, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 27.309, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.833, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 7.089623542900822, |
|
"grad_norm": 0.624746561050415, |
|
"learning_rate": 3.2430875576036865e-05, |
|
"loss": 0.0192, |
|
"step": 37100 |
|
}, |
|
{ |
|
"epoch": 7.10873304032104, |
|
"grad_norm": 0.5149529576301575, |
|
"learning_rate": 3.2382872503840243e-05, |
|
"loss": 0.0276, |
|
"step": 37200 |
|
}, |
|
{ |
|
"epoch": 7.127842537741257, |
|
"grad_norm": 3.0162253379821777, |
|
"learning_rate": 3.233486943164363e-05, |
|
"loss": 0.0209, |
|
"step": 37300 |
|
}, |
|
{ |
|
"epoch": 7.146952035161475, |
|
"grad_norm": 0.3581557273864746, |
|
"learning_rate": 3.228686635944701e-05, |
|
"loss": 0.017, |
|
"step": 37400 |
|
}, |
|
{ |
|
"epoch": 7.166061532581693, |
|
"grad_norm": 0.534199595451355, |
|
"learning_rate": 3.2238863287250386e-05, |
|
"loss": 0.0225, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 7.185171030001911, |
|
"grad_norm": 0.584403395652771, |
|
"learning_rate": 3.2190860215053765e-05, |
|
"loss": 0.0246, |
|
"step": 37600 |
|
}, |
|
{ |
|
"epoch": 7.204280527422129, |
|
"grad_norm": 0.33575019240379333, |
|
"learning_rate": 3.2142857142857144e-05, |
|
"loss": 0.0175, |
|
"step": 37700 |
|
}, |
|
{ |
|
"epoch": 7.223390024842347, |
|
"grad_norm": 0.10965080559253693, |
|
"learning_rate": 3.209485407066052e-05, |
|
"loss": 0.0226, |
|
"step": 37800 |
|
}, |
|
{ |
|
"epoch": 7.242499522262564, |
|
"grad_norm": 0.35581934452056885, |
|
"learning_rate": 3.20468509984639e-05, |
|
"loss": 0.0238, |
|
"step": 37900 |
|
}, |
|
{ |
|
"epoch": 7.2616090196827825, |
|
"grad_norm": 0.34397947788238525, |
|
"learning_rate": 3.199884792626729e-05, |
|
"loss": 0.0143, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 7.2616090196827825, |
|
"eval_events-synergy/entsum_processed_loss": 0.020761175081133842, |
|
"eval_events-synergy/entsum_processed_runtime": 85.5012, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 27.204, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.807, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 7.280718517103, |
|
"grad_norm": 0.42107951641082764, |
|
"learning_rate": 3.195084485407066e-05, |
|
"loss": 0.0247, |
|
"step": 38100 |
|
}, |
|
{ |
|
"epoch": 7.299828014523218, |
|
"grad_norm": 0.02115056663751602, |
|
"learning_rate": 3.1902841781874044e-05, |
|
"loss": 0.0195, |
|
"step": 38200 |
|
}, |
|
{ |
|
"epoch": 7.318937511943436, |
|
"grad_norm": 0.5838721990585327, |
|
"learning_rate": 3.185483870967742e-05, |
|
"loss": 0.0219, |
|
"step": 38300 |
|
}, |
|
{ |
|
"epoch": 7.338047009363653, |
|
"grad_norm": 0.1621849685907364, |
|
"learning_rate": 3.18068356374808e-05, |
|
"loss": 0.0205, |
|
"step": 38400 |
|
}, |
|
{ |
|
"epoch": 7.357156506783872, |
|
"grad_norm": 0.9421815872192383, |
|
"learning_rate": 3.175883256528418e-05, |
|
"loss": 0.0255, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 7.37626600420409, |
|
"grad_norm": 0.534527063369751, |
|
"learning_rate": 3.171082949308756e-05, |
|
"loss": 0.0193, |
|
"step": 38600 |
|
}, |
|
{ |
|
"epoch": 7.395375501624307, |
|
"grad_norm": 0.25326958298683167, |
|
"learning_rate": 3.166282642089094e-05, |
|
"loss": 0.0173, |
|
"step": 38700 |
|
}, |
|
{ |
|
"epoch": 7.414484999044525, |
|
"grad_norm": 0.45906540751457214, |
|
"learning_rate": 3.161482334869432e-05, |
|
"loss": 0.0217, |
|
"step": 38800 |
|
}, |
|
{ |
|
"epoch": 7.433594496464743, |
|
"grad_norm": 0.20368924736976624, |
|
"learning_rate": 3.15668202764977e-05, |
|
"loss": 0.0204, |
|
"step": 38900 |
|
}, |
|
{ |
|
"epoch": 7.452703993884961, |
|
"grad_norm": 0.16196204721927643, |
|
"learning_rate": 3.1518817204301074e-05, |
|
"loss": 0.0269, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 7.452703993884961, |
|
"eval_events-synergy/entsum_processed_loss": 0.01936463639140129, |
|
"eval_events-synergy/entsum_processed_runtime": 85.1337, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 27.322, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.836, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 7.471813491305179, |
|
"grad_norm": 0.9692901372909546, |
|
"learning_rate": 3.147081413210445e-05, |
|
"loss": 0.0233, |
|
"step": 39100 |
|
}, |
|
{ |
|
"epoch": 7.490922988725396, |
|
"grad_norm": 0.772438645362854, |
|
"learning_rate": 3.142281105990784e-05, |
|
"loss": 0.0207, |
|
"step": 39200 |
|
}, |
|
{ |
|
"epoch": 7.510032486145614, |
|
"grad_norm": 0.39141640067100525, |
|
"learning_rate": 3.137480798771121e-05, |
|
"loss": 0.0191, |
|
"step": 39300 |
|
}, |
|
{ |
|
"epoch": 7.529141983565832, |
|
"grad_norm": 0.19886603951454163, |
|
"learning_rate": 3.1326804915514596e-05, |
|
"loss": 0.0189, |
|
"step": 39400 |
|
}, |
|
{ |
|
"epoch": 7.54825148098605, |
|
"grad_norm": 0.8104422092437744, |
|
"learning_rate": 3.1278801843317975e-05, |
|
"loss": 0.0247, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 7.567360978406268, |
|
"grad_norm": 1.5498480796813965, |
|
"learning_rate": 3.1230798771121354e-05, |
|
"loss": 0.0227, |
|
"step": 39600 |
|
}, |
|
{ |
|
"epoch": 7.586470475826486, |
|
"grad_norm": 0.7283300757408142, |
|
"learning_rate": 3.118279569892473e-05, |
|
"loss": 0.0226, |
|
"step": 39700 |
|
}, |
|
{ |
|
"epoch": 7.605579973246703, |
|
"grad_norm": 1.3604527711868286, |
|
"learning_rate": 3.113479262672811e-05, |
|
"loss": 0.0167, |
|
"step": 39800 |
|
}, |
|
{ |
|
"epoch": 7.6246894706669215, |
|
"grad_norm": 1.0663504600524902, |
|
"learning_rate": 3.108678955453149e-05, |
|
"loss": 0.0251, |
|
"step": 39900 |
|
}, |
|
{ |
|
"epoch": 7.64379896808714, |
|
"grad_norm": 0.7986360192298889, |
|
"learning_rate": 3.103878648233487e-05, |
|
"loss": 0.0178, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 7.64379896808714, |
|
"eval_events-synergy/entsum_processed_loss": 0.01874077506363392, |
|
"eval_events-synergy/entsum_processed_runtime": 86.5004, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.89, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.728, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 7.662908465507357, |
|
"grad_norm": 0.20992030203342438, |
|
"learning_rate": 3.0990783410138254e-05, |
|
"loss": 0.0169, |
|
"step": 40100 |
|
}, |
|
{ |
|
"epoch": 7.682017962927575, |
|
"grad_norm": 0.19716721773147583, |
|
"learning_rate": 3.0942780337941626e-05, |
|
"loss": 0.0176, |
|
"step": 40200 |
|
}, |
|
{ |
|
"epoch": 7.701127460347793, |
|
"grad_norm": 0.015498577617108822, |
|
"learning_rate": 3.089477726574501e-05, |
|
"loss": 0.0211, |
|
"step": 40300 |
|
}, |
|
{ |
|
"epoch": 7.7202369577680106, |
|
"grad_norm": 0.6486418843269348, |
|
"learning_rate": 3.084677419354839e-05, |
|
"loss": 0.0155, |
|
"step": 40400 |
|
}, |
|
{ |
|
"epoch": 7.739346455188229, |
|
"grad_norm": 0.6476672887802124, |
|
"learning_rate": 3.079877112135177e-05, |
|
"loss": 0.0201, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 7.758455952608447, |
|
"grad_norm": 1.1004291772842407, |
|
"learning_rate": 3.075076804915515e-05, |
|
"loss": 0.0242, |
|
"step": 40600 |
|
}, |
|
{ |
|
"epoch": 7.777565450028664, |
|
"grad_norm": 0.46351030468940735, |
|
"learning_rate": 3.0702764976958526e-05, |
|
"loss": 0.0185, |
|
"step": 40700 |
|
}, |
|
{ |
|
"epoch": 7.796674947448882, |
|
"grad_norm": 1.0854315757751465, |
|
"learning_rate": 3.0654761904761905e-05, |
|
"loss": 0.0201, |
|
"step": 40800 |
|
}, |
|
{ |
|
"epoch": 7.8157844448691, |
|
"grad_norm": 0.8841759562492371, |
|
"learning_rate": 3.0606758832565284e-05, |
|
"loss": 0.0184, |
|
"step": 40900 |
|
}, |
|
{ |
|
"epoch": 7.834893942289318, |
|
"grad_norm": 0.9825415015220642, |
|
"learning_rate": 3.055875576036866e-05, |
|
"loss": 0.0187, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 7.834893942289318, |
|
"eval_events-synergy/entsum_processed_loss": 0.018535524606704712, |
|
"eval_events-synergy/entsum_processed_runtime": 85.5215, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 27.198, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.805, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 7.854003439709536, |
|
"grad_norm": 2.4730401039123535, |
|
"learning_rate": 3.0510752688172045e-05, |
|
"loss": 0.0191, |
|
"step": 41100 |
|
}, |
|
{ |
|
"epoch": 7.873112937129753, |
|
"grad_norm": 2.2196543216705322, |
|
"learning_rate": 3.0462749615975423e-05, |
|
"loss": 0.0283, |
|
"step": 41200 |
|
}, |
|
{ |
|
"epoch": 7.892222434549971, |
|
"grad_norm": 0.16002187132835388, |
|
"learning_rate": 3.0414746543778806e-05, |
|
"loss": 0.0199, |
|
"step": 41300 |
|
}, |
|
{ |
|
"epoch": 7.911331931970189, |
|
"grad_norm": 1.5348676443099976, |
|
"learning_rate": 3.036674347158218e-05, |
|
"loss": 0.0161, |
|
"step": 41400 |
|
}, |
|
{ |
|
"epoch": 7.930441429390407, |
|
"grad_norm": 0.6840288043022156, |
|
"learning_rate": 3.0318740399385563e-05, |
|
"loss": 0.014, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 7.949550926810625, |
|
"grad_norm": 0.7691094875335693, |
|
"learning_rate": 3.0270737327188942e-05, |
|
"loss": 0.0171, |
|
"step": 41600 |
|
}, |
|
{ |
|
"epoch": 7.968660424230842, |
|
"grad_norm": 0.9019795060157776, |
|
"learning_rate": 3.0222734254992317e-05, |
|
"loss": 0.0208, |
|
"step": 41700 |
|
}, |
|
{ |
|
"epoch": 7.9877699216510605, |
|
"grad_norm": 0.40712770819664, |
|
"learning_rate": 3.0174731182795703e-05, |
|
"loss": 0.0216, |
|
"step": 41800 |
|
}, |
|
{ |
|
"epoch": 8.006879419071279, |
|
"grad_norm": 0.9297962784767151, |
|
"learning_rate": 3.0126728110599078e-05, |
|
"loss": 0.0206, |
|
"step": 41900 |
|
}, |
|
{ |
|
"epoch": 8.025988916491496, |
|
"grad_norm": 0.9476808309555054, |
|
"learning_rate": 3.007872503840246e-05, |
|
"loss": 0.0179, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 8.025988916491496, |
|
"eval_events-synergy/entsum_processed_loss": 0.017886348068714142, |
|
"eval_events-synergy/entsum_processed_runtime": 86.4493, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.906, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.732, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 8.045098413911715, |
|
"grad_norm": 0.2303045094013214, |
|
"learning_rate": 3.003072196620584e-05, |
|
"loss": 0.0166, |
|
"step": 42100 |
|
}, |
|
{ |
|
"epoch": 8.064207911331932, |
|
"grad_norm": 0.2901061773300171, |
|
"learning_rate": 2.998271889400922e-05, |
|
"loss": 0.0192, |
|
"step": 42200 |
|
}, |
|
{ |
|
"epoch": 8.08331740875215, |
|
"grad_norm": 0.051759280264377594, |
|
"learning_rate": 2.9934715821812596e-05, |
|
"loss": 0.0161, |
|
"step": 42300 |
|
}, |
|
{ |
|
"epoch": 8.102426906172367, |
|
"grad_norm": 0.4911729097366333, |
|
"learning_rate": 2.9886712749615975e-05, |
|
"loss": 0.018, |
|
"step": 42400 |
|
}, |
|
{ |
|
"epoch": 8.121536403592586, |
|
"grad_norm": 0.4533239006996155, |
|
"learning_rate": 2.9838709677419357e-05, |
|
"loss": 0.0177, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 8.140645901012803, |
|
"grad_norm": 0.03935829922556877, |
|
"learning_rate": 2.9790706605222736e-05, |
|
"loss": 0.016, |
|
"step": 42600 |
|
}, |
|
{ |
|
"epoch": 8.15975539843302, |
|
"grad_norm": 0.5652256011962891, |
|
"learning_rate": 2.9742703533026118e-05, |
|
"loss": 0.0125, |
|
"step": 42700 |
|
}, |
|
{ |
|
"epoch": 8.17886489585324, |
|
"grad_norm": 0.5128132104873657, |
|
"learning_rate": 2.9694700460829493e-05, |
|
"loss": 0.0123, |
|
"step": 42800 |
|
}, |
|
{ |
|
"epoch": 8.197974393273457, |
|
"grad_norm": 0.27111709117889404, |
|
"learning_rate": 2.9646697388632872e-05, |
|
"loss": 0.0173, |
|
"step": 42900 |
|
}, |
|
{ |
|
"epoch": 8.217083890693674, |
|
"grad_norm": 0.1685011088848114, |
|
"learning_rate": 2.9598694316436254e-05, |
|
"loss": 0.0161, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 8.217083890693674, |
|
"eval_events-synergy/entsum_processed_loss": 0.017874937504529953, |
|
"eval_events-synergy/entsum_processed_runtime": 86.3239, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.945, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.742, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 8.236193388113893, |
|
"grad_norm": 0.3956407606601715, |
|
"learning_rate": 2.955069124423963e-05, |
|
"loss": 0.0143, |
|
"step": 43100 |
|
}, |
|
{ |
|
"epoch": 8.25530288553411, |
|
"grad_norm": 0.46672216057777405, |
|
"learning_rate": 2.9502688172043015e-05, |
|
"loss": 0.0136, |
|
"step": 43200 |
|
}, |
|
{ |
|
"epoch": 8.274412382954328, |
|
"grad_norm": 0.27463072538375854, |
|
"learning_rate": 2.945468509984639e-05, |
|
"loss": 0.0152, |
|
"step": 43300 |
|
}, |
|
{ |
|
"epoch": 8.293521880374547, |
|
"grad_norm": 0.11778465658426285, |
|
"learning_rate": 2.9406682027649773e-05, |
|
"loss": 0.0227, |
|
"step": 43400 |
|
}, |
|
{ |
|
"epoch": 8.312631377794764, |
|
"grad_norm": 0.966484785079956, |
|
"learning_rate": 2.935867895545315e-05, |
|
"loss": 0.0146, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 8.331740875214981, |
|
"grad_norm": 0.10549327731132507, |
|
"learning_rate": 2.9310675883256527e-05, |
|
"loss": 0.0164, |
|
"step": 43600 |
|
}, |
|
{ |
|
"epoch": 8.3508503726352, |
|
"grad_norm": 0.19580169022083282, |
|
"learning_rate": 2.926267281105991e-05, |
|
"loss": 0.0192, |
|
"step": 43700 |
|
}, |
|
{ |
|
"epoch": 8.369959870055418, |
|
"grad_norm": 0.2825284004211426, |
|
"learning_rate": 2.9214669738863288e-05, |
|
"loss": 0.0183, |
|
"step": 43800 |
|
}, |
|
{ |
|
"epoch": 8.389069367475635, |
|
"grad_norm": 0.4127114713191986, |
|
"learning_rate": 2.916666666666667e-05, |
|
"loss": 0.0167, |
|
"step": 43900 |
|
}, |
|
{ |
|
"epoch": 8.408178864895854, |
|
"grad_norm": 0.2957821488380432, |
|
"learning_rate": 2.911866359447005e-05, |
|
"loss": 0.0152, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 8.408178864895854, |
|
"eval_events-synergy/entsum_processed_loss": 0.018097488209605217, |
|
"eval_events-synergy/entsum_processed_runtime": 86.2713, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.961, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.746, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 8.427288362316071, |
|
"grad_norm": 1.0461006164550781, |
|
"learning_rate": 2.907066052227343e-05, |
|
"loss": 0.0168, |
|
"step": 44100 |
|
}, |
|
{ |
|
"epoch": 8.446397859736289, |
|
"grad_norm": 0.4242873191833496, |
|
"learning_rate": 2.9022657450076806e-05, |
|
"loss": 0.0184, |
|
"step": 44200 |
|
}, |
|
{ |
|
"epoch": 8.465507357156508, |
|
"grad_norm": 0.4693470001220703, |
|
"learning_rate": 2.8974654377880185e-05, |
|
"loss": 0.0196, |
|
"step": 44300 |
|
}, |
|
{ |
|
"epoch": 8.484616854576725, |
|
"grad_norm": 0.018570706248283386, |
|
"learning_rate": 2.8926651305683567e-05, |
|
"loss": 0.0157, |
|
"step": 44400 |
|
}, |
|
{ |
|
"epoch": 8.503726351996942, |
|
"grad_norm": 0.5416761040687561, |
|
"learning_rate": 2.8878648233486942e-05, |
|
"loss": 0.0178, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 8.52283584941716, |
|
"grad_norm": 0.5635196566581726, |
|
"learning_rate": 2.8830645161290328e-05, |
|
"loss": 0.0122, |
|
"step": 44600 |
|
}, |
|
{ |
|
"epoch": 8.541945346837378, |
|
"grad_norm": 0.42741984128952026, |
|
"learning_rate": 2.8782642089093703e-05, |
|
"loss": 0.0145, |
|
"step": 44700 |
|
}, |
|
{ |
|
"epoch": 8.561054844257596, |
|
"grad_norm": 1.090361475944519, |
|
"learning_rate": 2.8734639016897082e-05, |
|
"loss": 0.0107, |
|
"step": 44800 |
|
}, |
|
{ |
|
"epoch": 8.580164341677813, |
|
"grad_norm": 0.792757511138916, |
|
"learning_rate": 2.8686635944700464e-05, |
|
"loss": 0.017, |
|
"step": 44900 |
|
}, |
|
{ |
|
"epoch": 8.599273839098032, |
|
"grad_norm": 0.07932941615581512, |
|
"learning_rate": 2.863863287250384e-05, |
|
"loss": 0.0141, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 8.599273839098032, |
|
"eval_events-synergy/entsum_processed_loss": 0.017112286761403084, |
|
"eval_events-synergy/entsum_processed_runtime": 86.6905, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.831, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.714, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 8.61838333651825, |
|
"grad_norm": 0.42730385065078735, |
|
"learning_rate": 2.859062980030722e-05, |
|
"loss": 0.0168, |
|
"step": 45100 |
|
}, |
|
{ |
|
"epoch": 8.637492833938467, |
|
"grad_norm": 0.8624597787857056, |
|
"learning_rate": 2.85426267281106e-05, |
|
"loss": 0.021, |
|
"step": 45200 |
|
}, |
|
{ |
|
"epoch": 8.656602331358686, |
|
"grad_norm": 0.34436488151550293, |
|
"learning_rate": 2.8494623655913982e-05, |
|
"loss": 0.0152, |
|
"step": 45300 |
|
}, |
|
{ |
|
"epoch": 8.675711828778903, |
|
"grad_norm": 1.8274009227752686, |
|
"learning_rate": 2.844662058371736e-05, |
|
"loss": 0.0114, |
|
"step": 45400 |
|
}, |
|
{ |
|
"epoch": 8.69482132619912, |
|
"grad_norm": 0.1283120959997177, |
|
"learning_rate": 2.8398617511520736e-05, |
|
"loss": 0.0203, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 8.71393082361934, |
|
"grad_norm": 1.9227547645568848, |
|
"learning_rate": 2.835061443932412e-05, |
|
"loss": 0.0161, |
|
"step": 45600 |
|
}, |
|
{ |
|
"epoch": 8.733040321039557, |
|
"grad_norm": 1.129739761352539, |
|
"learning_rate": 2.8302611367127497e-05, |
|
"loss": 0.0149, |
|
"step": 45700 |
|
}, |
|
{ |
|
"epoch": 8.752149818459774, |
|
"grad_norm": 0.807822048664093, |
|
"learning_rate": 2.825460829493088e-05, |
|
"loss": 0.0123, |
|
"step": 45800 |
|
}, |
|
{ |
|
"epoch": 8.771259315879993, |
|
"grad_norm": 0.24655993282794952, |
|
"learning_rate": 2.8206605222734255e-05, |
|
"loss": 0.0165, |
|
"step": 45900 |
|
}, |
|
{ |
|
"epoch": 8.79036881330021, |
|
"grad_norm": 0.19655448198318481, |
|
"learning_rate": 2.8158602150537637e-05, |
|
"loss": 0.0174, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 8.79036881330021, |
|
"eval_events-synergy/entsum_processed_loss": 0.01653146930038929, |
|
"eval_events-synergy/entsum_processed_runtime": 86.3181, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.947, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.743, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 8.809478310720428, |
|
"grad_norm": 0.37631919980049133, |
|
"learning_rate": 2.8110599078341016e-05, |
|
"loss": 0.0186, |
|
"step": 46100 |
|
}, |
|
{ |
|
"epoch": 8.828587808140647, |
|
"grad_norm": 0.25859326124191284, |
|
"learning_rate": 2.8062596006144394e-05, |
|
"loss": 0.0194, |
|
"step": 46200 |
|
}, |
|
{ |
|
"epoch": 8.847697305560864, |
|
"grad_norm": 0.9864445924758911, |
|
"learning_rate": 2.8014592933947776e-05, |
|
"loss": 0.0163, |
|
"step": 46300 |
|
}, |
|
{ |
|
"epoch": 8.866806802981081, |
|
"grad_norm": 1.691253900527954, |
|
"learning_rate": 2.7966589861751152e-05, |
|
"loss": 0.017, |
|
"step": 46400 |
|
}, |
|
{ |
|
"epoch": 8.8859163004013, |
|
"grad_norm": 0.9325816035270691, |
|
"learning_rate": 2.7918586789554534e-05, |
|
"loss": 0.0126, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 8.905025797821517, |
|
"grad_norm": 0.8106985688209534, |
|
"learning_rate": 2.7870583717357913e-05, |
|
"loss": 0.0155, |
|
"step": 46600 |
|
}, |
|
{ |
|
"epoch": 8.924135295241735, |
|
"grad_norm": 0.5398417711257935, |
|
"learning_rate": 2.7822580645161288e-05, |
|
"loss": 0.0141, |
|
"step": 46700 |
|
}, |
|
{ |
|
"epoch": 8.943244792661954, |
|
"grad_norm": 0.5932244658470154, |
|
"learning_rate": 2.777457757296467e-05, |
|
"loss": 0.016, |
|
"step": 46800 |
|
}, |
|
{ |
|
"epoch": 8.962354290082171, |
|
"grad_norm": 0.2489900141954422, |
|
"learning_rate": 2.772657450076805e-05, |
|
"loss": 0.0157, |
|
"step": 46900 |
|
}, |
|
{ |
|
"epoch": 8.981463787502388, |
|
"grad_norm": 0.5513284802436829, |
|
"learning_rate": 2.767857142857143e-05, |
|
"loss": 0.0133, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 8.981463787502388, |
|
"eval_events-synergy/entsum_processed_loss": 0.016639988869428635, |
|
"eval_events-synergy/entsum_processed_runtime": 86.6238, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.852, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.719, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 9.000573284922606, |
|
"grad_norm": 0.7625744342803955, |
|
"learning_rate": 2.763056835637481e-05, |
|
"loss": 0.013, |
|
"step": 47100 |
|
}, |
|
{ |
|
"epoch": 9.019682782342825, |
|
"grad_norm": 0.5866988897323608, |
|
"learning_rate": 2.7582565284178192e-05, |
|
"loss": 0.0132, |
|
"step": 47200 |
|
}, |
|
{ |
|
"epoch": 9.038792279763042, |
|
"grad_norm": 0.21316476166248322, |
|
"learning_rate": 2.7534562211981567e-05, |
|
"loss": 0.0126, |
|
"step": 47300 |
|
}, |
|
{ |
|
"epoch": 9.05790177718326, |
|
"grad_norm": 0.2244674414396286, |
|
"learning_rate": 2.7486559139784946e-05, |
|
"loss": 0.0125, |
|
"step": 47400 |
|
}, |
|
{ |
|
"epoch": 9.077011274603478, |
|
"grad_norm": 0.48919981718063354, |
|
"learning_rate": 2.7438556067588328e-05, |
|
"loss": 0.0145, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 9.096120772023696, |
|
"grad_norm": 1.7857321500778198, |
|
"learning_rate": 2.7390552995391703e-05, |
|
"loss": 0.0136, |
|
"step": 47600 |
|
}, |
|
{ |
|
"epoch": 9.115230269443913, |
|
"grad_norm": 1.4337810277938843, |
|
"learning_rate": 2.734254992319509e-05, |
|
"loss": 0.0124, |
|
"step": 47700 |
|
}, |
|
{ |
|
"epoch": 9.134339766864132, |
|
"grad_norm": 0.1657884567975998, |
|
"learning_rate": 2.7294546850998464e-05, |
|
"loss": 0.0131, |
|
"step": 47800 |
|
}, |
|
{ |
|
"epoch": 9.15344926428435, |
|
"grad_norm": 0.32333338260650635, |
|
"learning_rate": 2.7246543778801846e-05, |
|
"loss": 0.0113, |
|
"step": 47900 |
|
}, |
|
{ |
|
"epoch": 9.172558761704567, |
|
"grad_norm": 0.05588236078619957, |
|
"learning_rate": 2.7198540706605225e-05, |
|
"loss": 0.0114, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 9.172558761704567, |
|
"eval_events-synergy/entsum_processed_loss": 0.01658570021390915, |
|
"eval_events-synergy/entsum_processed_runtime": 86.4495, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.906, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.732, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 9.191668259124786, |
|
"grad_norm": 0.12975239753723145, |
|
"learning_rate": 2.71505376344086e-05, |
|
"loss": 0.0157, |
|
"step": 48100 |
|
}, |
|
{ |
|
"epoch": 9.210777756545003, |
|
"grad_norm": 0.21814003586769104, |
|
"learning_rate": 2.7102534562211983e-05, |
|
"loss": 0.011, |
|
"step": 48200 |
|
}, |
|
{ |
|
"epoch": 9.22988725396522, |
|
"grad_norm": 0.04857701435685158, |
|
"learning_rate": 2.705453149001536e-05, |
|
"loss": 0.0113, |
|
"step": 48300 |
|
}, |
|
{ |
|
"epoch": 9.24899675138544, |
|
"grad_norm": 0.17843018472194672, |
|
"learning_rate": 2.7006528417818743e-05, |
|
"loss": 0.0114, |
|
"step": 48400 |
|
}, |
|
{ |
|
"epoch": 9.268106248805656, |
|
"grad_norm": 0.4526139497756958, |
|
"learning_rate": 2.6958525345622122e-05, |
|
"loss": 0.0164, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 9.287215746225874, |
|
"grad_norm": 0.6078909635543823, |
|
"learning_rate": 2.6910522273425498e-05, |
|
"loss": 0.0105, |
|
"step": 48600 |
|
}, |
|
{ |
|
"epoch": 9.306325243646093, |
|
"grad_norm": 0.8294253349304199, |
|
"learning_rate": 2.686251920122888e-05, |
|
"loss": 0.0129, |
|
"step": 48700 |
|
}, |
|
{ |
|
"epoch": 9.32543474106631, |
|
"grad_norm": 0.6839629411697388, |
|
"learning_rate": 2.681451612903226e-05, |
|
"loss": 0.0156, |
|
"step": 48800 |
|
}, |
|
{ |
|
"epoch": 9.344544238486527, |
|
"grad_norm": 0.05435250699520111, |
|
"learning_rate": 2.676651305683564e-05, |
|
"loss": 0.0131, |
|
"step": 48900 |
|
}, |
|
{ |
|
"epoch": 9.363653735906746, |
|
"grad_norm": 1.4614830017089844, |
|
"learning_rate": 2.6718509984639016e-05, |
|
"loss": 0.0114, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 9.363653735906746, |
|
"eval_events-synergy/entsum_processed_loss": 0.015850717201828957, |
|
"eval_events-synergy/entsum_processed_runtime": 86.4368, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.91, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.733, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 9.382763233326964, |
|
"grad_norm": 0.2169920802116394, |
|
"learning_rate": 2.66705069124424e-05, |
|
"loss": 0.0128, |
|
"step": 49100 |
|
}, |
|
{ |
|
"epoch": 9.401872730747181, |
|
"grad_norm": 0.2624705135822296, |
|
"learning_rate": 2.6622503840245777e-05, |
|
"loss": 0.0109, |
|
"step": 49200 |
|
}, |
|
{ |
|
"epoch": 9.4209822281674, |
|
"grad_norm": 0.3865724802017212, |
|
"learning_rate": 2.6574500768049156e-05, |
|
"loss": 0.0151, |
|
"step": 49300 |
|
}, |
|
{ |
|
"epoch": 9.440091725587617, |
|
"grad_norm": 0.6372859477996826, |
|
"learning_rate": 2.6526497695852538e-05, |
|
"loss": 0.0127, |
|
"step": 49400 |
|
}, |
|
{ |
|
"epoch": 9.459201223007835, |
|
"grad_norm": 0.4839339256286621, |
|
"learning_rate": 2.6478494623655913e-05, |
|
"loss": 0.0132, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 9.478310720428052, |
|
"grad_norm": 0.5721094608306885, |
|
"learning_rate": 2.6430491551459295e-05, |
|
"loss": 0.0142, |
|
"step": 49600 |
|
}, |
|
{ |
|
"epoch": 9.497420217848271, |
|
"grad_norm": 0.03586877882480621, |
|
"learning_rate": 2.6382488479262674e-05, |
|
"loss": 0.0123, |
|
"step": 49700 |
|
}, |
|
{ |
|
"epoch": 9.516529715268488, |
|
"grad_norm": 0.38037604093551636, |
|
"learning_rate": 2.633448540706605e-05, |
|
"loss": 0.0235, |
|
"step": 49800 |
|
}, |
|
{ |
|
"epoch": 9.535639212688706, |
|
"grad_norm": 0.2859026789665222, |
|
"learning_rate": 2.6286482334869435e-05, |
|
"loss": 0.0124, |
|
"step": 49900 |
|
}, |
|
{ |
|
"epoch": 9.554748710108925, |
|
"grad_norm": 0.27045369148254395, |
|
"learning_rate": 2.623847926267281e-05, |
|
"loss": 0.017, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 9.554748710108925, |
|
"eval_events-synergy/entsum_processed_loss": 0.01477838959544897, |
|
"eval_events-synergy/entsum_processed_runtime": 85.8776, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 27.085, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.777, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 9.573858207529142, |
|
"grad_norm": 0.1873704493045807, |
|
"learning_rate": 2.6190476190476192e-05, |
|
"loss": 0.0115, |
|
"step": 50100 |
|
}, |
|
{ |
|
"epoch": 9.59296770494936, |
|
"grad_norm": 0.1264422982931137, |
|
"learning_rate": 2.614247311827957e-05, |
|
"loss": 0.0099, |
|
"step": 50200 |
|
}, |
|
{ |
|
"epoch": 9.612077202369578, |
|
"grad_norm": 0.8306263089179993, |
|
"learning_rate": 2.6094470046082953e-05, |
|
"loss": 0.0136, |
|
"step": 50300 |
|
}, |
|
{ |
|
"epoch": 9.631186699789795, |
|
"grad_norm": 0.5657772421836853, |
|
"learning_rate": 2.604646697388633e-05, |
|
"loss": 0.0113, |
|
"step": 50400 |
|
}, |
|
{ |
|
"epoch": 9.650296197210013, |
|
"grad_norm": 0.7657700181007385, |
|
"learning_rate": 2.5998463901689707e-05, |
|
"loss": 0.0127, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 9.669405694630232, |
|
"grad_norm": 0.4362422227859497, |
|
"learning_rate": 2.595046082949309e-05, |
|
"loss": 0.01, |
|
"step": 50600 |
|
}, |
|
{ |
|
"epoch": 9.688515192050449, |
|
"grad_norm": 1.3221834897994995, |
|
"learning_rate": 2.5902457757296468e-05, |
|
"loss": 0.013, |
|
"step": 50700 |
|
}, |
|
{ |
|
"epoch": 9.707624689470666, |
|
"grad_norm": 0.10563758760690689, |
|
"learning_rate": 2.585445468509985e-05, |
|
"loss": 0.0139, |
|
"step": 50800 |
|
}, |
|
{ |
|
"epoch": 9.726734186890885, |
|
"grad_norm": 0.40996453166007996, |
|
"learning_rate": 2.5806451612903226e-05, |
|
"loss": 0.0097, |
|
"step": 50900 |
|
}, |
|
{ |
|
"epoch": 9.745843684311103, |
|
"grad_norm": 0.14993074536323547, |
|
"learning_rate": 2.5758448540706608e-05, |
|
"loss": 0.0117, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 9.745843684311103, |
|
"eval_events-synergy/entsum_processed_loss": 0.015352104790508747, |
|
"eval_events-synergy/entsum_processed_runtime": 86.4339, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.911, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.733, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 9.76495318173132, |
|
"grad_norm": 0.034810420125722885, |
|
"learning_rate": 2.5710445468509986e-05, |
|
"loss": 0.0117, |
|
"step": 51100 |
|
}, |
|
{ |
|
"epoch": 9.784062679151539, |
|
"grad_norm": 0.5618245005607605, |
|
"learning_rate": 2.5662442396313362e-05, |
|
"loss": 0.0113, |
|
"step": 51200 |
|
}, |
|
{ |
|
"epoch": 9.803172176571756, |
|
"grad_norm": 0.26905587315559387, |
|
"learning_rate": 2.5614439324116747e-05, |
|
"loss": 0.0135, |
|
"step": 51300 |
|
}, |
|
{ |
|
"epoch": 9.822281673991974, |
|
"grad_norm": 0.4496970772743225, |
|
"learning_rate": 2.5566436251920123e-05, |
|
"loss": 0.0136, |
|
"step": 51400 |
|
}, |
|
{ |
|
"epoch": 9.841391171412193, |
|
"grad_norm": 0.7193441987037659, |
|
"learning_rate": 2.5518433179723505e-05, |
|
"loss": 0.0109, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 9.86050066883241, |
|
"grad_norm": 1.0633965730667114, |
|
"learning_rate": 2.5470430107526883e-05, |
|
"loss": 0.0169, |
|
"step": 51600 |
|
}, |
|
{ |
|
"epoch": 9.879610166252627, |
|
"grad_norm": 0.03440142795443535, |
|
"learning_rate": 2.542242703533026e-05, |
|
"loss": 0.012, |
|
"step": 51700 |
|
}, |
|
{ |
|
"epoch": 9.898719663672846, |
|
"grad_norm": 0.7566103339195251, |
|
"learning_rate": 2.537442396313364e-05, |
|
"loss": 0.0145, |
|
"step": 51800 |
|
}, |
|
{ |
|
"epoch": 9.917829161093064, |
|
"grad_norm": 0.39200007915496826, |
|
"learning_rate": 2.532642089093702e-05, |
|
"loss": 0.0129, |
|
"step": 51900 |
|
}, |
|
{ |
|
"epoch": 9.93693865851328, |
|
"grad_norm": 1.0183656215667725, |
|
"learning_rate": 2.5278417818740402e-05, |
|
"loss": 0.0129, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 9.93693865851328, |
|
"eval_events-synergy/entsum_processed_loss": 0.014359832741320133, |
|
"eval_events-synergy/entsum_processed_runtime": 86.5634, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.87, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.723, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 9.956048155933498, |
|
"grad_norm": 0.4568009376525879, |
|
"learning_rate": 2.523041474654378e-05, |
|
"loss": 0.0092, |
|
"step": 52100 |
|
}, |
|
{ |
|
"epoch": 9.975157653353717, |
|
"grad_norm": 0.4945560097694397, |
|
"learning_rate": 2.5182411674347163e-05, |
|
"loss": 0.0107, |
|
"step": 52200 |
|
}, |
|
{ |
|
"epoch": 9.994267150773934, |
|
"grad_norm": 0.4079298675060272, |
|
"learning_rate": 2.5134408602150538e-05, |
|
"loss": 0.0145, |
|
"step": 52300 |
|
}, |
|
{ |
|
"epoch": 10.013376648194152, |
|
"grad_norm": 0.7107488512992859, |
|
"learning_rate": 2.5086405529953917e-05, |
|
"loss": 0.0091, |
|
"step": 52400 |
|
}, |
|
{ |
|
"epoch": 10.03248614561437, |
|
"grad_norm": 0.5608364939689636, |
|
"learning_rate": 2.50384024577573e-05, |
|
"loss": 0.0105, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 10.051595643034588, |
|
"grad_norm": 0.15459409356117249, |
|
"learning_rate": 2.4990399385560678e-05, |
|
"loss": 0.0079, |
|
"step": 52600 |
|
}, |
|
{ |
|
"epoch": 10.070705140454805, |
|
"grad_norm": 0.5406343340873718, |
|
"learning_rate": 2.4942396313364056e-05, |
|
"loss": 0.011, |
|
"step": 52700 |
|
}, |
|
{ |
|
"epoch": 10.089814637875024, |
|
"grad_norm": 0.45881524682044983, |
|
"learning_rate": 2.4894393241167435e-05, |
|
"loss": 0.0097, |
|
"step": 52800 |
|
}, |
|
{ |
|
"epoch": 10.108924135295242, |
|
"grad_norm": 1.0400211811065674, |
|
"learning_rate": 2.4846390168970814e-05, |
|
"loss": 0.0091, |
|
"step": 52900 |
|
}, |
|
{ |
|
"epoch": 10.128033632715459, |
|
"grad_norm": 0.4115142524242401, |
|
"learning_rate": 2.4798387096774196e-05, |
|
"loss": 0.0121, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 10.128033632715459, |
|
"eval_events-synergy/entsum_processed_loss": 0.01460443064570427, |
|
"eval_events-synergy/entsum_processed_runtime": 86.3665, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.932, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.739, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 10.147143130135678, |
|
"grad_norm": 0.7973828911781311, |
|
"learning_rate": 2.4750384024577575e-05, |
|
"loss": 0.0109, |
|
"step": 53100 |
|
}, |
|
{ |
|
"epoch": 10.166252627555895, |
|
"grad_norm": 0.21007944643497467, |
|
"learning_rate": 2.4702380952380953e-05, |
|
"loss": 0.01, |
|
"step": 53200 |
|
}, |
|
{ |
|
"epoch": 10.185362124976113, |
|
"grad_norm": 0.14576242864131927, |
|
"learning_rate": 2.4654377880184332e-05, |
|
"loss": 0.01, |
|
"step": 53300 |
|
}, |
|
{ |
|
"epoch": 10.204471622396332, |
|
"grad_norm": 0.08291052281856537, |
|
"learning_rate": 2.460637480798771e-05, |
|
"loss": 0.0097, |
|
"step": 53400 |
|
}, |
|
{ |
|
"epoch": 10.223581119816549, |
|
"grad_norm": 0.06554676592350006, |
|
"learning_rate": 2.4558371735791093e-05, |
|
"loss": 0.0095, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 10.242690617236766, |
|
"grad_norm": 0.7229769229888916, |
|
"learning_rate": 2.4510368663594472e-05, |
|
"loss": 0.0098, |
|
"step": 53600 |
|
}, |
|
{ |
|
"epoch": 10.261800114656985, |
|
"grad_norm": 1.9222382307052612, |
|
"learning_rate": 2.446236559139785e-05, |
|
"loss": 0.0161, |
|
"step": 53700 |
|
}, |
|
{ |
|
"epoch": 10.280909612077203, |
|
"grad_norm": 0.05762649327516556, |
|
"learning_rate": 2.4414362519201233e-05, |
|
"loss": 0.0092, |
|
"step": 53800 |
|
}, |
|
{ |
|
"epoch": 10.30001910949742, |
|
"grad_norm": 1.128400206565857, |
|
"learning_rate": 2.436635944700461e-05, |
|
"loss": 0.0114, |
|
"step": 53900 |
|
}, |
|
{ |
|
"epoch": 10.319128606917639, |
|
"grad_norm": 0.5140525102615356, |
|
"learning_rate": 2.4318356374807987e-05, |
|
"loss": 0.017, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 10.319128606917639, |
|
"eval_events-synergy/entsum_processed_loss": 0.014034281484782696, |
|
"eval_events-synergy/entsum_processed_runtime": 86.3783, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.928, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.738, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 10.338238104337856, |
|
"grad_norm": 0.011483533307909966, |
|
"learning_rate": 2.427035330261137e-05, |
|
"loss": 0.0077, |
|
"step": 54100 |
|
}, |
|
{ |
|
"epoch": 10.357347601758073, |
|
"grad_norm": 0.1507517546415329, |
|
"learning_rate": 2.4222350230414748e-05, |
|
"loss": 0.0082, |
|
"step": 54200 |
|
}, |
|
{ |
|
"epoch": 10.376457099178293, |
|
"grad_norm": 0.3378009498119354, |
|
"learning_rate": 2.4174347158218126e-05, |
|
"loss": 0.0123, |
|
"step": 54300 |
|
}, |
|
{ |
|
"epoch": 10.39556659659851, |
|
"grad_norm": 0.491153359413147, |
|
"learning_rate": 2.412634408602151e-05, |
|
"loss": 0.0105, |
|
"step": 54400 |
|
}, |
|
{ |
|
"epoch": 10.414676094018727, |
|
"grad_norm": 0.7116023898124695, |
|
"learning_rate": 2.4078341013824887e-05, |
|
"loss": 0.0098, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 10.433785591438944, |
|
"grad_norm": 0.05201850086450577, |
|
"learning_rate": 2.4030337941628263e-05, |
|
"loss": 0.0108, |
|
"step": 54600 |
|
}, |
|
{ |
|
"epoch": 10.452895088859163, |
|
"grad_norm": 1.5810301303863525, |
|
"learning_rate": 2.3982334869431645e-05, |
|
"loss": 0.009, |
|
"step": 54700 |
|
}, |
|
{ |
|
"epoch": 10.47200458627938, |
|
"grad_norm": 0.7053477764129639, |
|
"learning_rate": 2.3934331797235023e-05, |
|
"loss": 0.0096, |
|
"step": 54800 |
|
}, |
|
{ |
|
"epoch": 10.491114083699598, |
|
"grad_norm": 0.5703982710838318, |
|
"learning_rate": 2.3886328725038402e-05, |
|
"loss": 0.0084, |
|
"step": 54900 |
|
}, |
|
{ |
|
"epoch": 10.510223581119817, |
|
"grad_norm": 0.3266231119632721, |
|
"learning_rate": 2.3838325652841784e-05, |
|
"loss": 0.0119, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 10.510223581119817, |
|
"eval_events-synergy/entsum_processed_loss": 0.014698924496769905, |
|
"eval_events-synergy/entsum_processed_runtime": 86.2536, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.967, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.748, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 10.529333078540034, |
|
"grad_norm": 0.17007987201213837, |
|
"learning_rate": 2.3790322580645163e-05, |
|
"loss": 0.0099, |
|
"step": 55100 |
|
}, |
|
{ |
|
"epoch": 10.548442575960252, |
|
"grad_norm": 0.9838082194328308, |
|
"learning_rate": 2.3742319508448542e-05, |
|
"loss": 0.0091, |
|
"step": 55200 |
|
}, |
|
{ |
|
"epoch": 10.56755207338047, |
|
"grad_norm": 0.32569730281829834, |
|
"learning_rate": 2.369431643625192e-05, |
|
"loss": 0.0105, |
|
"step": 55300 |
|
}, |
|
{ |
|
"epoch": 10.586661570800688, |
|
"grad_norm": 0.3832113742828369, |
|
"learning_rate": 2.36463133640553e-05, |
|
"loss": 0.01, |
|
"step": 55400 |
|
}, |
|
{ |
|
"epoch": 10.605771068220905, |
|
"grad_norm": 0.43005886673927307, |
|
"learning_rate": 2.359831029185868e-05, |
|
"loss": 0.0101, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 10.624880565641124, |
|
"grad_norm": 1.174752950668335, |
|
"learning_rate": 2.355030721966206e-05, |
|
"loss": 0.0103, |
|
"step": 55600 |
|
}, |
|
{ |
|
"epoch": 10.643990063061342, |
|
"grad_norm": 0.16343650221824646, |
|
"learning_rate": 2.350230414746544e-05, |
|
"loss": 0.0092, |
|
"step": 55700 |
|
}, |
|
{ |
|
"epoch": 10.663099560481559, |
|
"grad_norm": 0.3868729770183563, |
|
"learning_rate": 2.345430107526882e-05, |
|
"loss": 0.0155, |
|
"step": 55800 |
|
}, |
|
{ |
|
"epoch": 10.682209057901778, |
|
"grad_norm": 0.18367870151996613, |
|
"learning_rate": 2.3406298003072196e-05, |
|
"loss": 0.0093, |
|
"step": 55900 |
|
}, |
|
{ |
|
"epoch": 10.701318555321995, |
|
"grad_norm": 0.7105177640914917, |
|
"learning_rate": 2.3358294930875575e-05, |
|
"loss": 0.0086, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 10.701318555321995, |
|
"eval_events-synergy/entsum_processed_loss": 0.014125595800578594, |
|
"eval_events-synergy/entsum_processed_runtime": 86.2929, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.955, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.744, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 10.720428052742212, |
|
"grad_norm": 0.1564301997423172, |
|
"learning_rate": 2.3310291858678957e-05, |
|
"loss": 0.0068, |
|
"step": 56100 |
|
}, |
|
{ |
|
"epoch": 10.739537550162432, |
|
"grad_norm": 0.13135480880737305, |
|
"learning_rate": 2.3262288786482336e-05, |
|
"loss": 0.0105, |
|
"step": 56200 |
|
}, |
|
{ |
|
"epoch": 10.758647047582649, |
|
"grad_norm": 0.15838299691677094, |
|
"learning_rate": 2.3214285714285715e-05, |
|
"loss": 0.008, |
|
"step": 56300 |
|
}, |
|
{ |
|
"epoch": 10.777756545002866, |
|
"grad_norm": 0.42811164259910583, |
|
"learning_rate": 2.3166282642089097e-05, |
|
"loss": 0.0106, |
|
"step": 56400 |
|
}, |
|
{ |
|
"epoch": 10.796866042423083, |
|
"grad_norm": 0.1568647176027298, |
|
"learning_rate": 2.3118279569892472e-05, |
|
"loss": 0.0113, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 10.815975539843302, |
|
"grad_norm": 0.41632843017578125, |
|
"learning_rate": 2.3070276497695854e-05, |
|
"loss": 0.0087, |
|
"step": 56600 |
|
}, |
|
{ |
|
"epoch": 10.83508503726352, |
|
"grad_norm": 0.3496435582637787, |
|
"learning_rate": 2.3022273425499233e-05, |
|
"loss": 0.0085, |
|
"step": 56700 |
|
}, |
|
{ |
|
"epoch": 10.854194534683739, |
|
"grad_norm": 0.04413243755698204, |
|
"learning_rate": 2.2974270353302612e-05, |
|
"loss": 0.0095, |
|
"step": 56800 |
|
}, |
|
{ |
|
"epoch": 10.873304032103956, |
|
"grad_norm": 0.2369256615638733, |
|
"learning_rate": 2.2926267281105994e-05, |
|
"loss": 0.0088, |
|
"step": 56900 |
|
}, |
|
{ |
|
"epoch": 10.892413529524173, |
|
"grad_norm": 0.07838968932628632, |
|
"learning_rate": 2.2878264208909373e-05, |
|
"loss": 0.0095, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 10.892413529524173, |
|
"eval_events-synergy/entsum_processed_loss": 0.01357998140156269, |
|
"eval_events-synergy/entsum_processed_runtime": 86.3828, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.927, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.737, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 10.91152302694439, |
|
"grad_norm": 0.3263333737850189, |
|
"learning_rate": 2.2830261136712748e-05, |
|
"loss": 0.0103, |
|
"step": 57100 |
|
}, |
|
{ |
|
"epoch": 10.93063252436461, |
|
"grad_norm": 0.7420934438705444, |
|
"learning_rate": 2.278225806451613e-05, |
|
"loss": 0.0142, |
|
"step": 57200 |
|
}, |
|
{ |
|
"epoch": 10.949742021784827, |
|
"grad_norm": 0.4374001622200012, |
|
"learning_rate": 2.273425499231951e-05, |
|
"loss": 0.0101, |
|
"step": 57300 |
|
}, |
|
{ |
|
"epoch": 10.968851519205044, |
|
"grad_norm": 1.4210389852523804, |
|
"learning_rate": 2.2686251920122888e-05, |
|
"loss": 0.0102, |
|
"step": 57400 |
|
}, |
|
{ |
|
"epoch": 10.987961016625263, |
|
"grad_norm": 0.6043274402618408, |
|
"learning_rate": 2.263824884792627e-05, |
|
"loss": 0.014, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 11.00707051404548, |
|
"grad_norm": 0.18480466306209564, |
|
"learning_rate": 2.259024577572965e-05, |
|
"loss": 0.0097, |
|
"step": 57600 |
|
}, |
|
{ |
|
"epoch": 11.026180011465698, |
|
"grad_norm": 0.013819389045238495, |
|
"learning_rate": 2.2542242703533027e-05, |
|
"loss": 0.0073, |
|
"step": 57700 |
|
}, |
|
{ |
|
"epoch": 11.045289508885917, |
|
"grad_norm": 1.205317497253418, |
|
"learning_rate": 2.2494239631336406e-05, |
|
"loss": 0.0092, |
|
"step": 57800 |
|
}, |
|
{ |
|
"epoch": 11.064399006306134, |
|
"grad_norm": 0.15929065644741058, |
|
"learning_rate": 2.2446236559139785e-05, |
|
"loss": 0.0086, |
|
"step": 57900 |
|
}, |
|
{ |
|
"epoch": 11.083508503726351, |
|
"grad_norm": 1.1358189582824707, |
|
"learning_rate": 2.2398233486943167e-05, |
|
"loss": 0.0074, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 11.083508503726351, |
|
"eval_events-synergy/entsum_processed_loss": 0.013762025162577629, |
|
"eval_events-synergy/entsum_processed_runtime": 86.4105, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.918, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.735, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 11.10261800114657, |
|
"grad_norm": 0.684076189994812, |
|
"learning_rate": 2.2350230414746546e-05, |
|
"loss": 0.0085, |
|
"step": 58100 |
|
}, |
|
{ |
|
"epoch": 11.121727498566788, |
|
"grad_norm": 1.0887881517410278, |
|
"learning_rate": 2.2302227342549924e-05, |
|
"loss": 0.0083, |
|
"step": 58200 |
|
}, |
|
{ |
|
"epoch": 11.140836995987005, |
|
"grad_norm": 0.6392953395843506, |
|
"learning_rate": 2.2254224270353306e-05, |
|
"loss": 0.0079, |
|
"step": 58300 |
|
}, |
|
{ |
|
"epoch": 11.159946493407224, |
|
"grad_norm": 0.858275830745697, |
|
"learning_rate": 2.2206221198156682e-05, |
|
"loss": 0.0084, |
|
"step": 58400 |
|
}, |
|
{ |
|
"epoch": 11.179055990827441, |
|
"grad_norm": 0.0732496976852417, |
|
"learning_rate": 2.215821812596006e-05, |
|
"loss": 0.0074, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 11.198165488247659, |
|
"grad_norm": 0.38673388957977295, |
|
"learning_rate": 2.2110215053763443e-05, |
|
"loss": 0.0081, |
|
"step": 58600 |
|
}, |
|
{ |
|
"epoch": 11.217274985667878, |
|
"grad_norm": 0.09929162263870239, |
|
"learning_rate": 2.206221198156682e-05, |
|
"loss": 0.0094, |
|
"step": 58700 |
|
}, |
|
{ |
|
"epoch": 11.236384483088095, |
|
"grad_norm": 0.504121720790863, |
|
"learning_rate": 2.20142089093702e-05, |
|
"loss": 0.0069, |
|
"step": 58800 |
|
}, |
|
{ |
|
"epoch": 11.255493980508312, |
|
"grad_norm": 0.11551874130964279, |
|
"learning_rate": 2.1966205837173582e-05, |
|
"loss": 0.0087, |
|
"step": 58900 |
|
}, |
|
{ |
|
"epoch": 11.27460347792853, |
|
"grad_norm": 0.8204399943351746, |
|
"learning_rate": 2.1918202764976958e-05, |
|
"loss": 0.0072, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 11.27460347792853, |
|
"eval_events-synergy/entsum_processed_loss": 0.013163074851036072, |
|
"eval_events-synergy/entsum_processed_runtime": 86.5869, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.863, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.722, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 11.293712975348749, |
|
"grad_norm": 0.2919471859931946, |
|
"learning_rate": 2.187019969278034e-05, |
|
"loss": 0.0085, |
|
"step": 59100 |
|
}, |
|
{ |
|
"epoch": 11.312822472768966, |
|
"grad_norm": 0.399214506149292, |
|
"learning_rate": 2.182219662058372e-05, |
|
"loss": 0.0154, |
|
"step": 59200 |
|
}, |
|
{ |
|
"epoch": 11.331931970189183, |
|
"grad_norm": 0.9353770613670349, |
|
"learning_rate": 2.1774193548387097e-05, |
|
"loss": 0.0063, |
|
"step": 59300 |
|
}, |
|
{ |
|
"epoch": 11.351041467609402, |
|
"grad_norm": 0.24659943580627441, |
|
"learning_rate": 2.172619047619048e-05, |
|
"loss": 0.0073, |
|
"step": 59400 |
|
}, |
|
{ |
|
"epoch": 11.37015096502962, |
|
"grad_norm": 0.8196508288383484, |
|
"learning_rate": 2.1678187403993858e-05, |
|
"loss": 0.0066, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 11.389260462449837, |
|
"grad_norm": 0.05071408301591873, |
|
"learning_rate": 2.1630184331797233e-05, |
|
"loss": 0.0067, |
|
"step": 59600 |
|
}, |
|
{ |
|
"epoch": 11.408369959870056, |
|
"grad_norm": 0.6439305543899536, |
|
"learning_rate": 2.1582181259600616e-05, |
|
"loss": 0.0092, |
|
"step": 59700 |
|
}, |
|
{ |
|
"epoch": 11.427479457290273, |
|
"grad_norm": 0.009084120392799377, |
|
"learning_rate": 2.1534178187403994e-05, |
|
"loss": 0.0095, |
|
"step": 59800 |
|
}, |
|
{ |
|
"epoch": 11.44658895471049, |
|
"grad_norm": 0.04028179123997688, |
|
"learning_rate": 2.1486175115207373e-05, |
|
"loss": 0.0081, |
|
"step": 59900 |
|
}, |
|
{ |
|
"epoch": 11.46569845213071, |
|
"grad_norm": 0.4081525206565857, |
|
"learning_rate": 2.1438172043010755e-05, |
|
"loss": 0.0087, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 11.46569845213071, |
|
"eval_events-synergy/entsum_processed_loss": 0.013131758198142052, |
|
"eval_events-synergy/entsum_processed_runtime": 86.1545, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.998, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.755, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 11.484807949550927, |
|
"grad_norm": 0.0647350549697876, |
|
"learning_rate": 2.1390168970814134e-05, |
|
"loss": 0.008, |
|
"step": 60100 |
|
}, |
|
{ |
|
"epoch": 11.503917446971144, |
|
"grad_norm": 0.2612893581390381, |
|
"learning_rate": 2.1342165898617513e-05, |
|
"loss": 0.0084, |
|
"step": 60200 |
|
}, |
|
{ |
|
"epoch": 11.523026944391363, |
|
"grad_norm": 0.21366460621356964, |
|
"learning_rate": 2.129416282642089e-05, |
|
"loss": 0.0106, |
|
"step": 60300 |
|
}, |
|
{ |
|
"epoch": 11.54213644181158, |
|
"grad_norm": 1.9202722311019897, |
|
"learning_rate": 2.124615975422427e-05, |
|
"loss": 0.0085, |
|
"step": 60400 |
|
}, |
|
{ |
|
"epoch": 11.561245939231798, |
|
"grad_norm": 0.7038975358009338, |
|
"learning_rate": 2.1198156682027652e-05, |
|
"loss": 0.0082, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 11.580355436652017, |
|
"grad_norm": 0.5983741879463196, |
|
"learning_rate": 2.115015360983103e-05, |
|
"loss": 0.0074, |
|
"step": 60600 |
|
}, |
|
{ |
|
"epoch": 11.599464934072234, |
|
"grad_norm": 0.09934480488300323, |
|
"learning_rate": 2.110215053763441e-05, |
|
"loss": 0.0076, |
|
"step": 60700 |
|
}, |
|
{ |
|
"epoch": 11.618574431492451, |
|
"grad_norm": 0.021143363788723946, |
|
"learning_rate": 2.1054147465437792e-05, |
|
"loss": 0.0075, |
|
"step": 60800 |
|
}, |
|
{ |
|
"epoch": 11.63768392891267, |
|
"grad_norm": 0.14107128977775574, |
|
"learning_rate": 2.1006144393241167e-05, |
|
"loss": 0.0092, |
|
"step": 60900 |
|
}, |
|
{ |
|
"epoch": 11.656793426332888, |
|
"grad_norm": 0.695513129234314, |
|
"learning_rate": 2.0958141321044546e-05, |
|
"loss": 0.0084, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 11.656793426332888, |
|
"eval_events-synergy/entsum_processed_loss": 0.013082274235785007, |
|
"eval_events-synergy/entsum_processed_runtime": 85.9161, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 27.073, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.774, |
|
"step": 61000 |
|
}, |
|
{
"epoch": 11.675902923753105,
"grad_norm": 0.14702828228473663,
"learning_rate": 2.0910138248847928e-05,
"loss": 0.008,
"step": 61100
},
{
"epoch": 11.695012421173324,
"grad_norm": 0.8319016695022583,
"learning_rate": 2.0862135176651307e-05,
"loss": 0.0077,
"step": 61200
},
{
"epoch": 11.714121918593541,
"grad_norm": 1.6200295686721802,
"learning_rate": 2.0814132104454685e-05,
"loss": 0.0096,
"step": 61300
},
{
"epoch": 11.733231416013759,
"grad_norm": 0.11744115501642227,
"learning_rate": 2.0766129032258068e-05,
"loss": 0.0069,
"step": 61400
},
{
"epoch": 11.752340913433976,
"grad_norm": 0.008127202279865742,
"learning_rate": 2.0718125960061443e-05,
"loss": 0.0082,
"step": 61500
},
{
"epoch": 11.771450410854195,
"grad_norm": 0.28937405347824097,
"learning_rate": 2.0670122887864825e-05,
"loss": 0.019,
"step": 61600
},
{
"epoch": 11.790559908274412,
"grad_norm": 0.5325780510902405,
"learning_rate": 2.0622119815668204e-05,
"loss": 0.0085,
"step": 61700
},
{
"epoch": 11.80966940569463,
"grad_norm": 0.3603396415710449,
"learning_rate": 2.0574116743471583e-05,
"loss": 0.0087,
"step": 61800
},
{
"epoch": 11.828778903114848,
"grad_norm": 0.3811150789260864,
"learning_rate": 2.052611367127496e-05,
"loss": 0.007,
"step": 61900
},
{
"epoch": 11.847888400535066,
"grad_norm": 0.017322426661849022,
"learning_rate": 2.0478110599078343e-05,
"loss": 0.0077,
"step": 62000
},
{
"epoch": 11.847888400535066,
"eval_events-synergy/entsum_processed_loss": 0.01322674285620451,
"eval_events-synergy/entsum_processed_runtime": 85.465,
"eval_events-synergy/entsum_processed_samples_per_second": 27.216,
"eval_events-synergy/entsum_processed_steps_per_second": 6.81,
"step": 62000
},
{
"epoch": 11.866997897955283,
"grad_norm": 0.5657706260681152,
"learning_rate": 2.0430107526881722e-05,
"loss": 0.0087,
"step": 62100
},
{
"epoch": 11.886107395375502,
"grad_norm": 0.7124789953231812,
"learning_rate": 2.03821044546851e-05,
"loss": 0.008,
"step": 62200
},
{
"epoch": 11.90521689279572,
"grad_norm": 0.5974131226539612,
"learning_rate": 2.033410138248848e-05,
"loss": 0.0079,
"step": 62300
},
{
"epoch": 11.924326390215937,
"grad_norm": 0.722249448299408,
"learning_rate": 2.028609831029186e-05,
"loss": 0.0086,
"step": 62400
},
{
"epoch": 11.943435887636156,
"grad_norm": 0.6730612516403198,
"learning_rate": 2.023809523809524e-05,
"loss": 0.0086,
"step": 62500
},
{
"epoch": 11.962545385056373,
"grad_norm": 0.4014939069747925,
"learning_rate": 2.019009216589862e-05,
"loss": 0.0078,
"step": 62600
},
{
"epoch": 11.98165488247659,
"grad_norm": 0.7014623880386353,
"learning_rate": 2.0142089093701998e-05,
"loss": 0.0111,
"step": 62700
},
{
"epoch": 12.00076437989681,
"grad_norm": 0.8713180422782898,
"learning_rate": 2.0094086021505377e-05,
"loss": 0.0098,
"step": 62800
},
{
"epoch": 12.019873877317027,
"grad_norm": 0.3292623460292816,
"learning_rate": 2.0046082949308755e-05,
"loss": 0.0068,
"step": 62900
},
{
"epoch": 12.038983374737244,
"grad_norm": 0.6158481240272522,
"learning_rate": 1.9998079877112134e-05,
"loss": 0.007,
"step": 63000
},
{
"epoch": 12.038983374737244,
"eval_events-synergy/entsum_processed_loss": 0.013169880956411362,
"eval_events-synergy/entsum_processed_runtime": 86.3306,
"eval_events-synergy/entsum_processed_samples_per_second": 26.943,
"eval_events-synergy/entsum_processed_steps_per_second": 6.742,
"step": 63000
},
{
"epoch": 12.058092872157463,
"grad_norm": 0.025973064824938774,
"learning_rate": 1.9950076804915516e-05,
"loss": 0.0071,
"step": 63100
},
{
"epoch": 12.07720236957768,
"grad_norm": 0.4744926989078522,
"learning_rate": 1.9902073732718895e-05,
"loss": 0.01,
"step": 63200
},
{
"epoch": 12.096311866997898,
"grad_norm": 0.7474972009658813,
"learning_rate": 1.9854070660522274e-05,
"loss": 0.0091,
"step": 63300
},
{
"epoch": 12.115421364418117,
"grad_norm": 0.13019229471683502,
"learning_rate": 1.9806067588325653e-05,
"loss": 0.0049,
"step": 63400
},
{
"epoch": 12.134530861838334,
"grad_norm": 0.8603349328041077,
"learning_rate": 1.975806451612903e-05,
"loss": 0.0074,
"step": 63500
},
{
"epoch": 12.153640359258551,
"grad_norm": 0.3122252821922302,
"learning_rate": 1.9710061443932413e-05,
"loss": 0.0073,
"step": 63600
},
{
"epoch": 12.17274985667877,
"grad_norm": 0.048645805567502975,
"learning_rate": 1.9662058371735792e-05,
"loss": 0.0057,
"step": 63700
},
{
"epoch": 12.191859354098987,
"grad_norm": 0.5081863403320312,
"learning_rate": 1.961405529953917e-05,
"loss": 0.0088,
"step": 63800
},
{
"epoch": 12.210968851519205,
"grad_norm": 0.28996503353118896,
"learning_rate": 1.9566052227342553e-05,
"loss": 0.0065,
"step": 63900
},
{
"epoch": 12.230078348939422,
"grad_norm": 0.4138163626194,
"learning_rate": 1.9518049155145932e-05,
"loss": 0.0049,
"step": 64000
},
{
"epoch": 12.230078348939422,
"eval_events-synergy/entsum_processed_loss": 0.013458210043609142,
"eval_events-synergy/entsum_processed_runtime": 86.2248,
"eval_events-synergy/entsum_processed_samples_per_second": 26.976,
"eval_events-synergy/entsum_processed_steps_per_second": 6.75,
"step": 64000
},
{
"epoch": 12.249187846359641,
"grad_norm": 0.057438481599092484,
"learning_rate": 1.9470046082949307e-05,
"loss": 0.0065,
"step": 64100
},
{
"epoch": 12.268297343779858,
"grad_norm": 0.40838849544525146,
"learning_rate": 1.942204301075269e-05,
"loss": 0.0067,
"step": 64200
},
{
"epoch": 12.287406841200076,
"grad_norm": 0.2802123427391052,
"learning_rate": 1.9374039938556068e-05,
"loss": 0.0075,
"step": 64300
},
{
"epoch": 12.306516338620295,
"grad_norm": 0.009569051675498486,
"learning_rate": 1.9326036866359447e-05,
"loss": 0.0071,
"step": 64400
},
{
"epoch": 12.325625836040512,
"grad_norm": 0.03632039576768875,
"learning_rate": 1.927803379416283e-05,
"loss": 0.0068,
"step": 64500
},
{
"epoch": 12.34473533346073,
"grad_norm": 0.46252092719078064,
"learning_rate": 1.9230030721966208e-05,
"loss": 0.0062,
"step": 64600
},
{
"epoch": 12.363844830880948,
"grad_norm": 0.04237058386206627,
"learning_rate": 1.9182027649769586e-05,
"loss": 0.0078,
"step": 64700
},
{
"epoch": 12.382954328301166,
"grad_norm": 0.050396714359521866,
"learning_rate": 1.9134024577572965e-05,
"loss": 0.0089,
"step": 64800
},
{
"epoch": 12.402063825721383,
"grad_norm": 0.8509294986724854,
"learning_rate": 1.9086021505376344e-05,
"loss": 0.006,
"step": 64900
},
{
"epoch": 12.421173323141602,
"grad_norm": 0.07883996516466141,
"learning_rate": 1.9038018433179726e-05,
"loss": 0.0068,
"step": 65000
},
{
"epoch": 12.421173323141602,
"eval_events-synergy/entsum_processed_loss": 0.013203460723161697,
"eval_events-synergy/entsum_processed_runtime": 86.1892,
"eval_events-synergy/entsum_processed_samples_per_second": 26.987,
"eval_events-synergy/entsum_processed_steps_per_second": 6.753,
"step": 65000
},
{
"epoch": 12.44028282056182,
"grad_norm": 0.0476163849234581,
"learning_rate": 1.8990015360983105e-05,
"loss": 0.0079,
"step": 65100
},
{
"epoch": 12.459392317982037,
"grad_norm": 0.02199789695441723,
"learning_rate": 1.8942012288786483e-05,
"loss": 0.0067,
"step": 65200
},
{
"epoch": 12.478501815402256,
"grad_norm": 0.05000906065106392,
"learning_rate": 1.8894009216589862e-05,
"loss": 0.0082,
"step": 65300
},
{
"epoch": 12.497611312822473,
"grad_norm": 0.8533111810684204,
"learning_rate": 1.884600614439324e-05,
"loss": 0.0074,
"step": 65400
},
{
"epoch": 12.51672081024269,
"grad_norm": 1.0276228189468384,
"learning_rate": 1.879800307219662e-05,
"loss": 0.0055,
"step": 65500
},
{
"epoch": 12.53583030766291,
"grad_norm": 0.05433342233300209,
"learning_rate": 1.8750000000000002e-05,
"loss": 0.0089,
"step": 65600
},
{
"epoch": 12.554939805083126,
"grad_norm": 0.15978392958641052,
"learning_rate": 1.870199692780338e-05,
"loss": 0.0064,
"step": 65700
},
{
"epoch": 12.574049302503344,
"grad_norm": 0.3056204319000244,
"learning_rate": 1.865399385560676e-05,
"loss": 0.0064,
"step": 65800
},
{
"epoch": 12.593158799923563,
"grad_norm": 0.009515652433037758,
"learning_rate": 1.860599078341014e-05,
"loss": 0.0069,
"step": 65900
},
{
"epoch": 12.61226829734378,
"grad_norm": 0.026574306190013885,
"learning_rate": 1.8557987711213517e-05,
"loss": 0.0061,
"step": 66000
},
{
"epoch": 12.61226829734378,
"eval_events-synergy/entsum_processed_loss": 0.013461146503686905,
"eval_events-synergy/entsum_processed_runtime": 86.4904,
"eval_events-synergy/entsum_processed_samples_per_second": 26.893,
"eval_events-synergy/entsum_processed_steps_per_second": 6.729,
"step": 66000
},
{
"epoch": 12.631377794763997,
"grad_norm": 0.21460069715976715,
"learning_rate": 1.85099846390169e-05,
"loss": 0.0078,
"step": 66100
},
{
"epoch": 12.650487292184216,
"grad_norm": 0.29758745431900024,
"learning_rate": 1.8461981566820278e-05,
"loss": 0.0073,
"step": 66200
},
{
"epoch": 12.669596789604434,
"grad_norm": 0.4597108066082001,
"learning_rate": 1.8413978494623656e-05,
"loss": 0.0046,
"step": 66300
},
{
"epoch": 12.688706287024651,
"grad_norm": 0.07014264166355133,
"learning_rate": 1.836597542242704e-05,
"loss": 0.0073,
"step": 66400
},
{
"epoch": 12.707815784444868,
"grad_norm": 0.6235468983650208,
"learning_rate": 1.8317972350230417e-05,
"loss": 0.0085,
"step": 66500
},
{
"epoch": 12.726925281865087,
"grad_norm": 0.49677661061286926,
"learning_rate": 1.8269969278033793e-05,
"loss": 0.0066,
"step": 66600
},
{
"epoch": 12.746034779285305,
"grad_norm": 0.023319154977798462,
"learning_rate": 1.8221966205837175e-05,
"loss": 0.0061,
"step": 66700
},
{
"epoch": 12.765144276705522,
"grad_norm": 0.5906485319137573,
"learning_rate": 1.8173963133640553e-05,
"loss": 0.0078,
"step": 66800
},
{
"epoch": 12.784253774125741,
"grad_norm": 0.4909743666648865,
"learning_rate": 1.8125960061443932e-05,
"loss": 0.008,
"step": 66900
},
{
"epoch": 12.803363271545958,
"grad_norm": 1.3858819007873535,
"learning_rate": 1.8077956989247314e-05,
"loss": 0.0066,
"step": 67000
},
{
"epoch": 12.803363271545958,
"eval_events-synergy/entsum_processed_loss": 0.01290189754217863,
"eval_events-synergy/entsum_processed_runtime": 86.4251,
"eval_events-synergy/entsum_processed_samples_per_second": 26.913,
"eval_events-synergy/entsum_processed_steps_per_second": 6.734,
"step": 67000
},
{
"epoch": 12.822472768966175,
"grad_norm": 0.8135608434677124,
"learning_rate": 1.8029953917050693e-05,
"loss": 0.0087,
"step": 67100
},
{
"epoch": 12.841582266386395,
"grad_norm": 0.3143382966518402,
"learning_rate": 1.7981950844854072e-05,
"loss": 0.0074,
"step": 67200
},
{
"epoch": 12.860691763806612,
"grad_norm": 0.1856617033481598,
"learning_rate": 1.793394777265745e-05,
"loss": 0.0073,
"step": 67300
},
{
"epoch": 12.87980126122683,
"grad_norm": 0.0038214472588151693,
"learning_rate": 1.788594470046083e-05,
"loss": 0.007,
"step": 67400
},
{
"epoch": 12.898910758647048,
"grad_norm": 0.12916350364685059,
"learning_rate": 1.783794162826421e-05,
"loss": 0.006,
"step": 67500
},
{
"epoch": 12.918020256067265,
"grad_norm": 0.3312636613845825,
"learning_rate": 1.778993855606759e-05,
"loss": 0.0054,
"step": 67600
},
{
"epoch": 12.937129753487483,
"grad_norm": 0.008426926098763943,
"learning_rate": 1.774193548387097e-05,
"loss": 0.0055,
"step": 67700
},
{
"epoch": 12.956239250907702,
"grad_norm": 0.02026149071753025,
"learning_rate": 1.769393241167435e-05,
"loss": 0.0055,
"step": 67800
},
{
"epoch": 12.975348748327919,
"grad_norm": 0.13877490162849426,
"learning_rate": 1.7645929339477726e-05,
"loss": 0.0053,
"step": 67900
},
{
"epoch": 12.994458245748136,
"grad_norm": 0.4303775429725647,
"learning_rate": 1.7597926267281105e-05,
"loss": 0.0073,
"step": 68000
},
{
"epoch": 12.994458245748136,
"eval_events-synergy/entsum_processed_loss": 0.013035867363214493,
"eval_events-synergy/entsum_processed_runtime": 86.4767,
"eval_events-synergy/entsum_processed_samples_per_second": 26.897,
"eval_events-synergy/entsum_processed_steps_per_second": 6.73,
"step": 68000
},
{
"epoch": 13.013567743168355,
"grad_norm": 0.43861302733421326,
"learning_rate": 1.7549923195084487e-05,
"loss": 0.006,
"step": 68100
},
{
"epoch": 13.032677240588573,
"grad_norm": 0.5167204141616821,
"learning_rate": 1.7501920122887866e-05,
"loss": 0.0058,
"step": 68200
},
{
"epoch": 13.05178673800879,
"grad_norm": 0.9433317184448242,
"learning_rate": 1.7453917050691245e-05,
"loss": 0.0057,
"step": 68300
},
{
"epoch": 13.070896235429009,
"grad_norm": 0.01894897036254406,
"learning_rate": 1.7405913978494627e-05,
"loss": 0.0051,
"step": 68400
},
{
"epoch": 13.090005732849226,
"grad_norm": 0.9866701364517212,
"learning_rate": 1.7357910906298002e-05,
"loss": 0.006,
"step": 68500
},
{
"epoch": 13.109115230269444,
"grad_norm": 0.014975917525589466,
"learning_rate": 1.7309907834101384e-05,
"loss": 0.0045,
"step": 68600
},
{
"epoch": 13.128224727689663,
"grad_norm": 0.016140179708600044,
"learning_rate": 1.7261904761904763e-05,
"loss": 0.0046,
"step": 68700
},
{
"epoch": 13.14733422510988,
"grad_norm": 0.42392489314079285,
"learning_rate": 1.7213901689708142e-05,
"loss": 0.0073,
"step": 68800
},
{
"epoch": 13.166443722530097,
"grad_norm": 0.44540801644325256,
"learning_rate": 1.7165898617511524e-05,
"loss": 0.0061,
"step": 68900
},
{
"epoch": 13.185553219950314,
"grad_norm": 0.3063792884349823,
"learning_rate": 1.7117895545314903e-05,
"loss": 0.0062,
"step": 69000
},
{
"epoch": 13.185553219950314,
"eval_events-synergy/entsum_processed_loss": 0.013066727668046951,
"eval_events-synergy/entsum_processed_runtime": 84.3501,
"eval_events-synergy/entsum_processed_samples_per_second": 27.576,
"eval_events-synergy/entsum_processed_steps_per_second": 6.9,
"step": 69000
},
{
"epoch": 13.204662717370534,
"grad_norm": 0.09294837713241577,
"learning_rate": 1.7069892473118278e-05,
"loss": 0.0056,
"step": 69100
},
{
"epoch": 13.22377221479075,
"grad_norm": 0.07464176416397095,
"learning_rate": 1.702188940092166e-05,
"loss": 0.0058,
"step": 69200
},
{
"epoch": 13.242881712210968,
"grad_norm": 0.06809643656015396,
"learning_rate": 1.697388632872504e-05,
"loss": 0.005,
"step": 69300
},
{
"epoch": 13.261991209631187,
"grad_norm": 0.007226106245070696,
"learning_rate": 1.6925883256528418e-05,
"loss": 0.0076,
"step": 69400
},
{
"epoch": 13.281100707051404,
"grad_norm": 0.2170906960964203,
"learning_rate": 1.68778801843318e-05,
"loss": 0.0063,
"step": 69500
},
{
"epoch": 13.300210204471622,
"grad_norm": 0.01999014802277088,
"learning_rate": 1.682987711213518e-05,
"loss": 0.0065,
"step": 69600
},
{
"epoch": 13.31931970189184,
"grad_norm": 0.0101222088560462,
"learning_rate": 1.6781874039938557e-05,
"loss": 0.0054,
"step": 69700
},
{
"epoch": 13.338429199312058,
"grad_norm": 0.7783103585243225,
"learning_rate": 1.6733870967741936e-05,
"loss": 0.0057,
"step": 69800
},
{
"epoch": 13.357538696732275,
"grad_norm": 0.17000514268875122,
"learning_rate": 1.6685867895545315e-05,
"loss": 0.0058,
"step": 69900
},
{
"epoch": 13.376648194152494,
"grad_norm": 0.2570934295654297,
"learning_rate": 1.6637864823348693e-05,
"loss": 0.006,
"step": 70000
},
{
"epoch": 13.376648194152494,
"eval_events-synergy/entsum_processed_loss": 0.01279551349580288,
"eval_events-synergy/entsum_processed_runtime": 86.2214,
"eval_events-synergy/entsum_processed_samples_per_second": 26.977,
"eval_events-synergy/entsum_processed_steps_per_second": 6.75,
"step": 70000
},
{
"epoch": 13.395757691572712,
"grad_norm": 0.8089013695716858,
"learning_rate": 1.6589861751152075e-05,
"loss": 0.0061,
"step": 70100
},
{
"epoch": 13.414867188992929,
"grad_norm": 0.6852768063545227,
"learning_rate": 1.6541858678955454e-05,
"loss": 0.0044,
"step": 70200
},
{
"epoch": 13.433976686413148,
"grad_norm": 0.15175899863243103,
"learning_rate": 1.6493855606758833e-05,
"loss": 0.0061,
"step": 70300
},
{
"epoch": 13.453086183833365,
"grad_norm": 0.05197775736451149,
"learning_rate": 1.6445852534562212e-05,
"loss": 0.0063,
"step": 70400
},
{
"epoch": 13.472195681253583,
"grad_norm": 0.5456545948982239,
"learning_rate": 1.639784946236559e-05,
"loss": 0.0058,
"step": 70500
},
{
"epoch": 13.491305178673802,
"grad_norm": 0.16188669204711914,
"learning_rate": 1.6349846390168973e-05,
"loss": 0.0064,
"step": 70600
},
{
"epoch": 13.510414676094019,
"grad_norm": 0.8403982520103455,
"learning_rate": 1.630184331797235e-05,
"loss": 0.0053,
"step": 70700
},
{
"epoch": 13.529524173514236,
"grad_norm": 0.00283870380371809,
"learning_rate": 1.625384024577573e-05,
"loss": 0.0064,
"step": 70800
},
{
"epoch": 13.548633670934455,
"grad_norm": 0.05162491649389267,
"learning_rate": 1.6205837173579112e-05,
"loss": 0.0054,
"step": 70900
},
{
"epoch": 13.567743168354673,
"grad_norm": 0.5664875507354736,
"learning_rate": 1.6157834101382488e-05,
"loss": 0.0054,
"step": 71000
},
{
"epoch": 13.567743168354673,
"eval_events-synergy/entsum_processed_loss": 0.012747672386467457,
"eval_events-synergy/entsum_processed_runtime": 86.3771,
"eval_events-synergy/entsum_processed_samples_per_second": 26.928,
"eval_events-synergy/entsum_processed_steps_per_second": 6.738,
"step": 71000
},
{
"epoch": 13.58685266577489,
"grad_norm": 0.2854113280773163,
"learning_rate": 1.6109831029185866e-05,
"loss": 0.0047,
"step": 71100
},
{
"epoch": 13.605962163195109,
"grad_norm": 0.2796891927719116,
"learning_rate": 1.606182795698925e-05,
"loss": 0.0059,
"step": 71200
},
{
"epoch": 13.625071660615326,
"grad_norm": 0.4434114098548889,
"learning_rate": 1.6013824884792627e-05,
"loss": 0.0088,
"step": 71300
},
{
"epoch": 13.644181158035543,
"grad_norm": 0.11509229242801666,
"learning_rate": 1.5965821812596006e-05,
"loss": 0.0082,
"step": 71400
},
{
"epoch": 13.66329065545576,
"grad_norm": 0.034501317888498306,
"learning_rate": 1.5917818740399388e-05,
"loss": 0.0094,
"step": 71500
},
{
"epoch": 13.68240015287598,
"grad_norm": 0.483247846364975,
"learning_rate": 1.5869815668202767e-05,
"loss": 0.005,
"step": 71600
},
{
"epoch": 13.701509650296197,
"grad_norm": 0.17208580672740936,
"learning_rate": 1.5821812596006145e-05,
"loss": 0.0051,
"step": 71700
},
{
"epoch": 13.720619147716414,
"grad_norm": 0.42245280742645264,
"learning_rate": 1.5773809523809524e-05,
"loss": 0.0052,
"step": 71800
},
{
"epoch": 13.739728645136633,
"grad_norm": 0.5766732096672058,
"learning_rate": 1.5725806451612903e-05,
"loss": 0.0056,
"step": 71900
},
{
"epoch": 13.75883814255685,
"grad_norm": 0.1782081127166748,
"learning_rate": 1.5677803379416285e-05,
"loss": 0.0067,
"step": 72000
},
{
"epoch": 13.75883814255685,
"eval_events-synergy/entsum_processed_loss": 0.012470792047679424,
"eval_events-synergy/entsum_processed_runtime": 86.3378,
"eval_events-synergy/entsum_processed_samples_per_second": 26.941,
"eval_events-synergy/entsum_processed_steps_per_second": 6.741,
"step": 72000
},
{
"epoch": 13.777947639977068,
"grad_norm": 0.27173900604248047,
"learning_rate": 1.5629800307219664e-05,
"loss": 0.0042,
"step": 72100
},
{
"epoch": 13.797057137397287,
"grad_norm": 0.9423706531524658,
"learning_rate": 1.5581797235023043e-05,
"loss": 0.0061,
"step": 72200
},
{
"epoch": 13.816166634817504,
"grad_norm": 0.27326226234436035,
"learning_rate": 1.553379416282642e-05,
"loss": 0.005,
"step": 72300
},
{
"epoch": 13.835276132237722,
"grad_norm": 0.2667993903160095,
"learning_rate": 1.54857910906298e-05,
"loss": 0.0055,
"step": 72400
},
{
"epoch": 13.85438562965794,
"grad_norm": 0.4120780825614929,
"learning_rate": 1.543778801843318e-05,
"loss": 0.0043,
"step": 72500
},
{
"epoch": 13.873495127078158,
"grad_norm": 0.6277773976325989,
"learning_rate": 1.538978494623656e-05,
"loss": 0.0077,
"step": 72600
},
{
"epoch": 13.892604624498375,
"grad_norm": 0.4718801975250244,
"learning_rate": 1.534178187403994e-05,
"loss": 0.0056,
"step": 72700
},
{
"epoch": 13.911714121918594,
"grad_norm": 0.18229953944683075,
"learning_rate": 1.529377880184332e-05,
"loss": 0.0081,
"step": 72800
},
{
"epoch": 13.930823619338812,
"grad_norm": 0.08315181732177734,
"learning_rate": 1.5245775729646697e-05,
"loss": 0.0073,
"step": 72900
},
{
"epoch": 13.949933116759029,
"grad_norm": 0.006552177015691996,
"learning_rate": 1.5197772657450078e-05,
"loss": 0.0055,
"step": 73000
},
{
"epoch": 13.949933116759029,
"eval_events-synergy/entsum_processed_loss": 0.012344392947852612,
"eval_events-synergy/entsum_processed_runtime": 86.3295,
"eval_events-synergy/entsum_processed_samples_per_second": 26.943,
"eval_events-synergy/entsum_processed_steps_per_second": 6.742,
"step": 73000
},
{
"epoch": 13.969042614179248,
"grad_norm": 0.150691419839859,
"learning_rate": 1.5149769585253456e-05,
"loss": 0.0067,
"step": 73100
},
{
"epoch": 13.988152111599465,
"grad_norm": 0.7237725257873535,
"learning_rate": 1.5101766513056837e-05,
"loss": 0.0067,
"step": 73200
},
{
"epoch": 14.007261609019682,
"grad_norm": 0.07672421634197235,
"learning_rate": 1.5053763440860215e-05,
"loss": 0.0053,
"step": 73300
},
{
"epoch": 14.026371106439901,
"grad_norm": 0.2431519776582718,
"learning_rate": 1.5005760368663596e-05,
"loss": 0.0063,
"step": 73400
},
{
"epoch": 14.045480603860119,
"grad_norm": 0.048979759216308594,
"learning_rate": 1.4957757296466976e-05,
"loss": 0.005,
"step": 73500
},
{
"epoch": 14.064590101280336,
"grad_norm": 0.9289604425430298,
"learning_rate": 1.4909754224270353e-05,
"loss": 0.0036,
"step": 73600
},
{
"epoch": 14.083699598700553,
"grad_norm": 0.06590162217617035,
"learning_rate": 1.4861751152073732e-05,
"loss": 0.004,
"step": 73700
},
{
"epoch": 14.102809096120772,
"grad_norm": 0.3335033357143402,
"learning_rate": 1.4813748079877113e-05,
"loss": 0.005,
"step": 73800
},
{
"epoch": 14.12191859354099,
"grad_norm": 0.08950705081224442,
"learning_rate": 1.4765745007680493e-05,
"loss": 0.0045,
"step": 73900
},
{
"epoch": 14.141028090961207,
"grad_norm": 0.7303321957588196,
"learning_rate": 1.4717741935483872e-05,
"loss": 0.0071,
"step": 74000
},
{
"epoch": 14.141028090961207,
"eval_events-synergy/entsum_processed_loss": 0.012597361579537392,
"eval_events-synergy/entsum_processed_runtime": 86.4655,
"eval_events-synergy/entsum_processed_samples_per_second": 26.901,
"eval_events-synergy/entsum_processed_steps_per_second": 6.731,
"step": 74000
},
{
"epoch": 14.160137588381426,
"grad_norm": 0.31180980801582336,
"learning_rate": 1.4669738863287252e-05,
"loss": 0.0055,
"step": 74100
},
{
"epoch": 14.179247085801643,
"grad_norm": 0.019986744970083237,
"learning_rate": 1.462173579109063e-05,
"loss": 0.005,
"step": 74200
},
{
"epoch": 14.19835658322186,
"grad_norm": 0.6268077492713928,
"learning_rate": 1.457373271889401e-05,
"loss": 0.0046,
"step": 74300
},
{
"epoch": 14.21746608064208,
"grad_norm": 0.18772102892398834,
"learning_rate": 1.4525729646697388e-05,
"loss": 0.0052,
"step": 74400
},
{
"epoch": 14.236575578062297,
"grad_norm": 0.014273080974817276,
"learning_rate": 1.4477726574500769e-05,
"loss": 0.0052,
"step": 74500
},
{
"epoch": 14.255685075482514,
"grad_norm": 0.04534445330500603,
"learning_rate": 1.442972350230415e-05,
"loss": 0.0078,
"step": 74600
},
{
"epoch": 14.274794572902733,
"grad_norm": 0.06626222282648087,
"learning_rate": 1.4381720430107528e-05,
"loss": 0.0049,
"step": 74700
},
{
"epoch": 14.29390407032295,
"grad_norm": 0.05039865896105766,
"learning_rate": 1.4333717357910905e-05,
"loss": 0.0062,
"step": 74800
},
{
"epoch": 14.313013567743168,
"grad_norm": 0.0062214708887040615,
"learning_rate": 1.4285714285714285e-05,
"loss": 0.0043,
"step": 74900
},
{
"epoch": 14.332123065163387,
"grad_norm": 0.08620163798332214,
"learning_rate": 1.4237711213517666e-05,
"loss": 0.0057,
"step": 75000
},
{
"epoch": 14.332123065163387,
"eval_events-synergy/entsum_processed_loss": 0.012711184099316597,
"eval_events-synergy/entsum_processed_runtime": 86.4444,
"eval_events-synergy/entsum_processed_samples_per_second": 26.907,
"eval_events-synergy/entsum_processed_steps_per_second": 6.733,
"step": 75000
},
{
"epoch": 14.351232562583604,
"grad_norm": 0.04931287094950676,
"learning_rate": 1.4189708141321045e-05,
"loss": 0.0083,
"step": 75100
},
{
"epoch": 14.370342060003821,
"grad_norm": 0.01421100553125143,
"learning_rate": 1.4141705069124425e-05,
"loss": 0.004,
"step": 75200
},
{
"epoch": 14.38945155742404,
"grad_norm": 0.011329934932291508,
"learning_rate": 1.4093701996927805e-05,
"loss": 0.0049,
"step": 75300
},
{
"epoch": 14.408561054844258,
"grad_norm": 0.15495194494724274,
"learning_rate": 1.4045698924731183e-05,
"loss": 0.0032,
"step": 75400
},
{
"epoch": 14.427670552264475,
"grad_norm": 0.05016593262553215,
"learning_rate": 1.3997695852534561e-05,
"loss": 0.0055,
"step": 75500
},
{
"epoch": 14.446780049684694,
"grad_norm": 1.6086093187332153,
"learning_rate": 1.3949692780337942e-05,
"loss": 0.005,
"step": 75600
},
{
"epoch": 14.465889547104911,
"grad_norm": 0.4085695743560791,
"learning_rate": 1.3901689708141322e-05,
"loss": 0.0054,
"step": 75700
},
{
"epoch": 14.484999044525129,
"grad_norm": 0.1665886640548706,
"learning_rate": 1.3853686635944701e-05,
"loss": 0.0047,
"step": 75800
},
{
"epoch": 14.504108541945346,
"grad_norm": 0.1481260061264038,
"learning_rate": 1.3805683563748081e-05,
"loss": 0.0043,
"step": 75900
},
{
"epoch": 14.523218039365565,
"grad_norm": 0.22774682939052582,
"learning_rate": 1.3757680491551462e-05,
"loss": 0.0037,
"step": 76000
},
{
"epoch": 14.523218039365565,
"eval_events-synergy/entsum_processed_loss": 0.012892031110823154,
"eval_events-synergy/entsum_processed_runtime": 86.4183,
"eval_events-synergy/entsum_processed_samples_per_second": 26.916,
"eval_events-synergy/entsum_processed_steps_per_second": 6.735,
"step": 76000
},
{
"epoch": 14.542327536785782,
"grad_norm": 0.34367671608924866,
"learning_rate": 1.3709677419354839e-05,
"loss": 0.0054,
"step": 76100
},
{
"epoch": 14.561437034206,
"grad_norm": 0.7251562476158142,
"learning_rate": 1.3661674347158218e-05,
"loss": 0.0062,
"step": 76200
},
{
"epoch": 14.580546531626219,
"grad_norm": 0.1083359494805336,
"learning_rate": 1.3613671274961598e-05,
"loss": 0.005,
"step": 76300
},
{
"epoch": 14.599656029046436,
"grad_norm": 0.08868356049060822,
"learning_rate": 1.3565668202764978e-05,
"loss": 0.004,
"step": 76400
},
{
"epoch": 14.618765526466653,
"grad_norm": 0.4599528908729553,
"learning_rate": 1.3517665130568357e-05,
"loss": 0.0045,
"step": 76500
},
{
"epoch": 14.637875023886872,
"grad_norm": 0.5329752564430237,
"learning_rate": 1.3469662058371738e-05,
"loss": 0.0052,
"step": 76600
},
{
"epoch": 14.65698452130709,
"grad_norm": 0.050466131418943405,
"learning_rate": 1.3421658986175115e-05,
"loss": 0.0049,
"step": 76700
},
{
"epoch": 14.676094018727307,
"grad_norm": 0.2970987856388092,
"learning_rate": 1.3373655913978495e-05,
"loss": 0.0042,
"step": 76800
},
{
"epoch": 14.695203516147526,
"grad_norm": 0.10357856750488281,
"learning_rate": 1.3325652841781874e-05,
"loss": 0.005,
"step": 76900
},
{
"epoch": 14.714313013567743,
"grad_norm": 0.05402665585279465,
"learning_rate": 1.3277649769585254e-05,
"loss": 0.0047,
"step": 77000
},
{
"epoch": 14.714313013567743,
"eval_events-synergy/entsum_processed_loss": 0.012962962500751019,
"eval_events-synergy/entsum_processed_runtime": 86.4889,
"eval_events-synergy/entsum_processed_samples_per_second": 26.894,
"eval_events-synergy/entsum_processed_steps_per_second": 6.729,
"step": 77000
},
{
"epoch": 14.73342251098796,
"grad_norm": 0.15317828953266144,
"learning_rate": 1.3229646697388635e-05,
"loss": 0.0054,
"step": 77100
},
{
"epoch": 14.75253200840818,
"grad_norm": 0.1756182163953781,
"learning_rate": 1.3181643625192013e-05,
"loss": 0.0055,
"step": 77200
},
{
"epoch": 14.771641505828397,
"grad_norm": 1.027187466621399,
"learning_rate": 1.313364055299539e-05,
"loss": 0.0034,
"step": 77300
},
{
"epoch": 14.790751003248614,
"grad_norm": 0.005386768840253353,
"learning_rate": 1.308563748079877e-05,
"loss": 0.0043,
"step": 77400
},
{
"epoch": 14.809860500668833,
"grad_norm": 0.005857468582689762,
"learning_rate": 1.3037634408602151e-05,
"loss": 0.0045,
"step": 77500
},
{
"epoch": 14.82896999808905,
"grad_norm": 0.19254671037197113,
"learning_rate": 1.298963133640553e-05,
"loss": 0.0041,
"step": 77600
},
{
"epoch": 14.848079495509268,
"grad_norm": 0.00358038698323071,
"learning_rate": 1.294162826420891e-05,
"loss": 0.0045,
"step": 77700
},
{
"epoch": 14.867188992929487,
"grad_norm": 0.019394252449274063,
"learning_rate": 1.2893625192012291e-05,
"loss": 0.0069,
"step": 77800
},
{
"epoch": 14.886298490349704,
"grad_norm": 0.03039630688726902,
"learning_rate": 1.284562211981567e-05,
"loss": 0.0039,
"step": 77900
},
{
"epoch": 14.905407987769921,
"grad_norm": 0.20222346484661102,
"learning_rate": 1.2797619047619047e-05,
"loss": 0.005,
"step": 78000
},
{
"epoch": 14.905407987769921,
"eval_events-synergy/entsum_processed_loss": 0.012617943808436394,
"eval_events-synergy/entsum_processed_runtime": 86.5195,
"eval_events-synergy/entsum_processed_samples_per_second": 26.884,
"eval_events-synergy/entsum_processed_steps_per_second": 6.727,
"step": 78000
},
{
"epoch": 14.92451748519014,
"grad_norm": 0.049767039716243744,
"learning_rate": 1.2749615975422427e-05,
"loss": 0.0049,
"step": 78100
},
{
"epoch": 14.943626982610358,
"grad_norm": 0.0789194330573082,
"learning_rate": 1.2701612903225808e-05,
"loss": 0.0049,
"step": 78200
},
{
"epoch": 14.962736480030575,
"grad_norm": 0.013039722107350826,
"learning_rate": 1.2653609831029186e-05,
"loss": 0.0049,
"step": 78300
},
{
"epoch": 14.981845977450792,
"grad_norm": 0.00856608897447586,
"learning_rate": 1.2605606758832567e-05,
"loss": 0.0045,
"step": 78400
},
{
"epoch": 15.000955474871011,
"grad_norm": 0.22470723092556,
"learning_rate": 1.2557603686635947e-05,
"loss": 0.0042,
"step": 78500
},
{
"epoch": 15.020064972291229,
"grad_norm": 0.03667416051030159,
"learning_rate": 1.2509600614439324e-05,
"loss": 0.0044,
"step": 78600
},
{
"epoch": 15.039174469711446,
"grad_norm": 0.2480105310678482,
"learning_rate": 1.2461597542242705e-05,
"loss": 0.004,
"step": 78700
},
{
"epoch": 15.058283967131665,
"grad_norm": 0.24030384421348572,
"learning_rate": 1.2413594470046083e-05,
"loss": 0.0048,
"step": 78800
},
{
"epoch": 15.077393464551882,
"grad_norm": 0.891026496887207,
"learning_rate": 1.2365591397849464e-05,
"loss": 0.0047,
"step": 78900
},
{
"epoch": 15.0965029619721,
"grad_norm": 0.666159987449646,
"learning_rate": 1.2317588325652843e-05,
"loss": 0.0042,
"step": 79000
},
{
"epoch": 15.0965029619721,
"eval_events-synergy/entsum_processed_loss": 0.013020485639572144,
"eval_events-synergy/entsum_processed_runtime": 86.4768,
"eval_events-synergy/entsum_processed_samples_per_second": 26.897,
"eval_events-synergy/entsum_processed_steps_per_second": 6.73,
"step": 79000
},
{
"epoch": 15.115612459392318,
"grad_norm": 0.4374198913574219,
"learning_rate": 1.2269585253456221e-05,
"loss": 0.0046,
"step": 79100
},
{
"epoch": 15.134721956812536,
"grad_norm": 0.13781531155109406,
"learning_rate": 1.2221582181259602e-05,
"loss": 0.0035,
"step": 79200
},
{
"epoch": 15.153831454232753,
"grad_norm": 0.008838070556521416,
"learning_rate": 1.217357910906298e-05,
"loss": 0.0046,
"step": 79300
},
{
"epoch": 15.172940951652972,
"grad_norm": 0.04405039921402931,
"learning_rate": 1.212557603686636e-05,
"loss": 0.0035,
"step": 79400
},
{
"epoch": 15.19205044907319,
"grad_norm": 0.386900395154953,
"learning_rate": 1.207757296466974e-05,
"loss": 0.0042,
"step": 79500
},
{
"epoch": 15.211159946493407,
"grad_norm": 0.06320745497941971,
"learning_rate": 1.202956989247312e-05,
"loss": 0.0049,
"step": 79600
},
{
"epoch": 15.230269443913626,
"grad_norm": 0.6713384985923767,
"learning_rate": 1.1981566820276497e-05,
"loss": 0.0047,
"step": 79700
},
{
"epoch": 15.249378941333843,
"grad_norm": 0.23006173968315125,
"learning_rate": 1.1933563748079878e-05,
"loss": 0.0042,
"step": 79800
},
{
"epoch": 15.26848843875406,
"grad_norm": 0.24073895812034607,
"learning_rate": 1.1885560675883258e-05,
"loss": 0.0055,
"step": 79900
},
{
"epoch": 15.28759793617428,
"grad_norm": 0.2813238799571991,
"learning_rate": 1.1837557603686637e-05,
"loss": 0.0035,
"step": 80000
},
{
"epoch": 15.28759793617428,
"eval_events-synergy/entsum_processed_loss": 0.012684877961874008,
"eval_events-synergy/entsum_processed_runtime": 86.376,
"eval_events-synergy/entsum_processed_samples_per_second": 26.929,
"eval_events-synergy/entsum_processed_steps_per_second": 6.738,
"step": 80000
},
{
"epoch": 15.306707433594497,
"grad_norm": 0.05870668590068817,
"learning_rate": 1.1789554531490015e-05,
"loss": 0.0052,
"step": 80100
},
{
"epoch": 15.325816931014714,
"grad_norm": 0.383779913187027,
"learning_rate": 1.1741551459293396e-05,
"loss": 0.0056,
"step": 80200
},
{
"epoch": 15.344926428434933,
"grad_norm": 0.18271517753601074,
"learning_rate": 1.1693548387096775e-05,
"loss": 0.0042,
"step": 80300
},
{
"epoch": 15.36403592585515,
"grad_norm": 0.049872301518917084,
"learning_rate": 1.1645545314900153e-05,
"loss": 0.0041,
"step": 80400
},
{
"epoch": 15.383145423275367,
"grad_norm": 0.16262871026992798,
"learning_rate": 1.1597542242703534e-05,
"loss": 0.0045,
"step": 80500
},
{
"epoch": 15.402254920695587,
"grad_norm": 0.12631197273731232,
"learning_rate": 1.1549539170506914e-05,
"loss": 0.0086,
"step": 80600
},
{
"epoch": 15.421364418115804,
"grad_norm": 0.02645757794380188,
"learning_rate": 1.1501536098310293e-05,
"loss": 0.0044,
"step": 80700
},
{
"epoch": 15.440473915536021,
"grad_norm": 0.4429394006729126,
"learning_rate": 1.1453533026113672e-05,
"loss": 0.0041,
"step": 80800
},
{
"epoch": 15.459583412956238,
"grad_norm": 0.4982936382293701,
"learning_rate": 1.1405529953917052e-05,
"loss": 0.0041,
"step": 80900
},
{
"epoch": 15.478692910376457,
"grad_norm": 0.03018922545015812,
"learning_rate": 1.135752688172043e-05,
"loss": 0.0041,
"step": 81000
},
{
"epoch": 15.478692910376457,
"eval_events-synergy/entsum_processed_loss": 0.012665621004998684,
"eval_events-synergy/entsum_processed_runtime": 86.4282,
"eval_events-synergy/entsum_processed_samples_per_second": 26.913,
"eval_events-synergy/entsum_processed_steps_per_second": 6.734,
"step": 81000
},
{
"epoch": 15.497802407796675,
"grad_norm": 0.04739225283265114,
"learning_rate": 1.130952380952381e-05,
"loss": 0.0035,
"step": 81100
},
{
"epoch": 15.516911905216892,
"grad_norm": 0.16332654654979706,
"learning_rate": 1.126152073732719e-05,
"loss": 0.0042,
"step": 81200
},
{
"epoch": 15.536021402637111,
"grad_norm": 0.24561795592308044,
"learning_rate": 1.1213517665130569e-05,
"loss": 0.0038,
"step": 81300
},
{
"epoch": 15.555130900057328,
"grad_norm": 0.6066825985908508,
"learning_rate": 1.1165514592933947e-05,
"loss": 0.0066,
"step": 81400
},
{
"epoch": 15.574240397477546,
"grad_norm": 0.12053509801626205,
"learning_rate": 1.1117511520737328e-05,
"loss": 0.0028,
"step": 81500
},
{
"epoch": 15.593349894897765,
"grad_norm": 0.21273471415042877,
"learning_rate": 1.1069508448540707e-05,
"loss": 0.004,
"step": 81600
},
{
"epoch": 15.612459392317982,
"grad_norm": 0.05035385861992836,
"learning_rate": 1.1021505376344087e-05,
"loss": 0.0029,
"step": 81700
},
{
"epoch": 15.6315688897382,
"grad_norm": 0.4434753656387329,
"learning_rate": 1.0973502304147466e-05,
"loss": 0.0037,
"step": 81800
},
{
"epoch": 15.650678387158418,
"grad_norm": 0.05798697844147682,
"learning_rate": 1.0925499231950845e-05,
"loss": 0.0037,
"step": 81900
},
{
"epoch": 15.669787884578636,
"grad_norm": 0.04873861372470856,
"learning_rate": 1.0877496159754225e-05,
"loss": 0.0041,
"step": 82000
},
{
"epoch": 15.669787884578636,
"eval_events-synergy/entsum_processed_loss": 0.012837884947657585,
"eval_events-synergy/entsum_processed_runtime": 86.4918,
"eval_events-synergy/entsum_processed_samples_per_second": 26.893,
"eval_events-synergy/entsum_processed_steps_per_second": 6.729,
"step": 82000
},
{
"epoch": 15.688897381998853,
"grad_norm": 0.0296151302754879,
"learning_rate": 1.0829493087557604e-05,
"loss": 0.0049,
"step": 82100
},
{
"epoch": 15.708006879419072,
"grad_norm": 0.43933552503585815,
"learning_rate": 1.0781490015360982e-05,
"loss": 0.0044,
"step": 82200
},
{
"epoch": 15.72711637683929,
"grad_norm": 0.016396446153521538,
"learning_rate": 1.0733486943164363e-05,
"loss": 0.0048,
"step": 82300
},
{
"epoch": 15.746225874259506,
"grad_norm": 0.027049915865063667,
"learning_rate": 1.0685483870967743e-05,
"loss": 0.0054,
"step": 82400
},
{
"epoch": 15.765335371679726,
"grad_norm": 0.16051483154296875,
"learning_rate": 1.0637480798771122e-05,
"loss": 0.0036,
"step": 82500
},
{
"epoch": 15.784444869099943,
"grad_norm": 0.24747684597969055,
"learning_rate": 1.05894777265745e-05,
"loss": 0.0042,
"step": 82600
},
{
"epoch": 15.80355436652016,
"grad_norm": 0.14130574464797974,
"learning_rate": 1.0541474654377881e-05,
"loss": 0.0037,
"step": 82700
},
{
"epoch": 15.82266386394038,
"grad_norm": 1.0581270456314087,
"learning_rate": 1.049347158218126e-05,
"loss": 0.0044,
"step": 82800
},
{
"epoch": 15.841773361360596,
"grad_norm": 0.25455254316329956,
"learning_rate": 1.0445468509984639e-05,
"loss": 0.004,
"step": 82900
},
{
"epoch": 15.860882858780814,
"grad_norm": 0.01451896596699953,
"learning_rate": 1.039746543778802e-05,
"loss": 0.0038,
"step": 83000
},
{
"epoch": 15.860882858780814,
"eval_events-synergy/entsum_processed_loss": 0.012506287544965744,
"eval_events-synergy/entsum_processed_runtime": 86.5196,
"eval_events-synergy/entsum_processed_samples_per_second": 26.884,
"eval_events-synergy/entsum_processed_steps_per_second": 6.727,
"step": 83000
},
{
"epoch": 15.879992356201033,
"grad_norm": 0.1098129004240036,
"learning_rate": 1.03494623655914e-05,
"loss": 0.0038,
"step": 83100
},
{
"epoch": 15.89910185362125,
"grad_norm": 0.017855707556009293,
"learning_rate": 1.0301459293394777e-05,
"loss": 0.0039,
"step": 83200
},
{
"epoch": 15.918211351041467,
"grad_norm": 0.3735440671443939,
"learning_rate": 1.0253456221198157e-05,
"loss": 0.005,
"step": 83300
},
{
"epoch": 15.937320848461685,
"grad_norm": 0.3725346028804779,
"learning_rate": 1.0205453149001538e-05,
"loss": 0.0043,
"step": 83400
},
{
"epoch": 15.956430345881904,
"grad_norm": 0.717900812625885,
"learning_rate": 1.0157450076804916e-05,
"loss": 0.0043,
"step": 83500
},
{
"epoch": 15.975539843302121,
"grad_norm": 0.006927446462213993,
"learning_rate": 1.0109447004608295e-05,
"loss": 0.0028,
"step": 83600
},
{
"epoch": 15.994649340722338,
"grad_norm": 0.23666119575500488,
"learning_rate": 1.0061443932411675e-05,
"loss": 0.0036,
"step": 83700
},
{
"epoch": 16.013758838142557,
"grad_norm": 0.07709264755249023,
"learning_rate": 1.0013440860215054e-05,
"loss": 0.0029,
"step": 83800
},
{
"epoch": 16.032868335562775,
"grad_norm": 0.2303076684474945,
"learning_rate": 9.965437788018433e-06,
"loss": 0.0036,
"step": 83900
},
{
"epoch": 16.051977832982992,
"grad_norm": 0.4032735228538513,
"learning_rate": 9.917434715821813e-06,
"loss": 0.0035,
"step": 84000
},
{
"epoch": 16.051977832982992,
"eval_events-synergy/entsum_processed_loss": 0.012511693872511387,
"eval_events-synergy/entsum_processed_runtime": 86.3726,
"eval_events-synergy/entsum_processed_samples_per_second": 26.93,
"eval_events-synergy/entsum_processed_steps_per_second": 6.738,
"step": 84000
},
{
"epoch": 16.07108733040321,
"grad_norm": 0.020075472071766853,
"learning_rate": 9.869431643625192e-06,
"loss": 0.0035,
"step": 84100
},
{
"epoch": 16.09019682782343,
"grad_norm": 0.008200431242585182,
"learning_rate": 9.821428571428573e-06,
"loss": 0.0033,
"step": 84200
},
{
"epoch": 16.109306325243647,
"grad_norm": 0.5671067237854004,
"learning_rate": 9.773425499231951e-06,
"loss": 0.0031,
"step": 84300
},
{
"epoch": 16.128415822663865,
"grad_norm": 0.5184391140937805,
"learning_rate": 9.725422427035332e-06,
"loss": 0.0034,
"step": 84400
},
{
"epoch": 16.147525320084082,
"grad_norm": 0.8578518033027649,
"learning_rate": 9.67741935483871e-06,
"loss": 0.004,
"step": 84500
},
{
"epoch": 16.1666348175043,
"grad_norm": 0.0992913693189621,
"learning_rate": 9.62941628264209e-06,
"loss": 0.0037,
"step": 84600
},
{
"epoch": 16.185744314924516,
"grad_norm": 0.2536514699459076,
"learning_rate": 9.58141321044547e-06,
"loss": 0.0041,
"step": 84700
},
{
"epoch": 16.204853812344734,
"grad_norm": 0.03721817210316658,
"learning_rate": 9.533410138248848e-06,
"loss": 0.0026,
"step": 84800
},
{
"epoch": 16.223963309764954,
"grad_norm": 0.008737379685044289,
"learning_rate": 9.485407066052227e-06,
"loss": 0.0034,
"step": 84900
},
{
"epoch": 16.24307280718517,
"grad_norm": 0.04447716847062111,
"learning_rate": 9.437403993855607e-06,
"loss": 0.0034,
"step": 85000
},
{
"epoch": 16.24307280718517,
"eval_events-synergy/entsum_processed_loss": 0.012916664592921734,
"eval_events-synergy/entsum_processed_runtime": 85.6566,
"eval_events-synergy/entsum_processed_samples_per_second": 27.155,
"eval_events-synergy/entsum_processed_steps_per_second": 6.795,
"step": 85000
},
{
"epoch": 16.26218230460539,
"grad_norm": 0.015641506761312485,
"learning_rate": 9.389400921658986e-06,
"loss": 0.0027,
"step": 85100
},
{
"epoch": 16.281291802025606,
"grad_norm": 0.8110576272010803,
"learning_rate": 9.341397849462367e-06,
"loss": 0.0041,
"step": 85200
},
{
"epoch": 16.300401299445824,
"grad_norm": 0.005773237440735102,
"learning_rate": 9.293394777265745e-06,
"loss": 0.0037,
"step": 85300
},
{
"epoch": 16.31951079686604,
"grad_norm": 0.003712354227900505,
"learning_rate": 9.245391705069124e-06,
"loss": 0.0036,
"step": 85400
},
{
"epoch": 16.33862029428626,
"grad_norm": 0.31228092312812805,
"learning_rate": 9.197388632872505e-06,
"loss": 0.0037,
"step": 85500
},
{
"epoch": 16.35772979170648,
"grad_norm": 0.2536914646625519,
"learning_rate": 9.149385560675883e-06,
"loss": 0.004,
"step": 85600
},
{
"epoch": 16.376839289126696,
"grad_norm": 0.18308839201927185,
"learning_rate": 9.101382488479262e-06,
"loss": 0.0032,
"step": 85700
},
{
"epoch": 16.395948786546914,
"grad_norm": 0.008483047597110271,
"learning_rate": 9.053379416282642e-06,
"loss": 0.0031,
"step": 85800
},
{
"epoch": 16.41505828396713,
"grad_norm": 0.4126696288585663,
"learning_rate": 9.005376344086023e-06,
"loss": 0.0029,
"step": 85900
},
{
"epoch": 16.434167781387348,
"grad_norm": 0.24156266450881958,
"learning_rate": 8.9573732718894e-06,
"loss": 0.0057,
"step": 86000
},
{
"epoch": 16.434167781387348,
"eval_events-synergy/entsum_processed_loss": 0.012442585080862045,
"eval_events-synergy/entsum_processed_runtime": 86.8846,
"eval_events-synergy/entsum_processed_samples_per_second": 26.771,
"eval_events-synergy/entsum_processed_steps_per_second": 6.699,
"step": 86000
},
{ |
|
"epoch": 16.45327727880757, |
|
"grad_norm": 0.011875503696501255, |
|
"learning_rate": 8.90937019969278e-06, |
|
"loss": 0.004, |
|
"step": 86100 |
|
}, |
|
{ |
|
"epoch": 16.472386776227786, |
|
"grad_norm": 0.02928953431546688, |
|
"learning_rate": 8.86136712749616e-06, |
|
"loss": 0.0049, |
|
"step": 86200 |
|
}, |
|
{ |
|
"epoch": 16.491496273648004, |
|
"grad_norm": 0.04371361806988716, |
|
"learning_rate": 8.81336405529954e-06, |
|
"loss": 0.0039, |
|
"step": 86300 |
|
}, |
|
{ |
|
"epoch": 16.51060577106822, |
|
"grad_norm": 0.023907937109470367, |
|
"learning_rate": 8.765360983102918e-06, |
|
"loss": 0.0025, |
|
"step": 86400 |
|
}, |
|
{ |
|
"epoch": 16.529715268488438, |
|
"grad_norm": 1.1203492879867554, |
|
"learning_rate": 8.717357910906299e-06, |
|
"loss": 0.0038, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 16.548824765908655, |
|
"grad_norm": 0.01931983418762684, |
|
"learning_rate": 8.66935483870968e-06, |
|
"loss": 0.0046, |
|
"step": 86600 |
|
}, |
|
{ |
|
"epoch": 16.567934263328873, |
|
"grad_norm": 0.05801007151603699, |
|
"learning_rate": 8.621351766513056e-06, |
|
"loss": 0.0039, |
|
"step": 86700 |
|
}, |
|
{ |
|
"epoch": 16.587043760749093, |
|
"grad_norm": 0.3823702335357666, |
|
"learning_rate": 8.573348694316437e-06, |
|
"loss": 0.0031, |
|
"step": 86800 |
|
}, |
|
{ |
|
"epoch": 16.60615325816931, |
|
"grad_norm": 0.1722560077905655, |
|
"learning_rate": 8.525345622119817e-06, |
|
"loss": 0.0032, |
|
"step": 86900 |
|
}, |
|
{ |
|
"epoch": 16.625262755589528, |
|
"grad_norm": 1.4988058805465698, |
|
"learning_rate": 8.477342549923196e-06, |
|
"loss": 0.0039, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 16.625262755589528, |
|
"eval_events-synergy/entsum_processed_loss": 0.012560435570776463, |
|
"eval_events-synergy/entsum_processed_runtime": 86.7389, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.816, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.71, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 16.644372253009745, |
|
"grad_norm": 0.15429021418094635, |
|
"learning_rate": 8.429339477726575e-06, |
|
"loss": 0.0034, |
|
"step": 87100 |
|
}, |
|
{ |
|
"epoch": 16.663481750429963, |
|
"grad_norm": 0.11306668817996979, |
|
"learning_rate": 8.381336405529955e-06, |
|
"loss": 0.0034, |
|
"step": 87200 |
|
}, |
|
{ |
|
"epoch": 16.68259124785018, |
|
"grad_norm": 0.7127687335014343, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 0.0046, |
|
"step": 87300 |
|
}, |
|
{ |
|
"epoch": 16.7017007452704, |
|
"grad_norm": 0.12723377346992493, |
|
"learning_rate": 8.285330261136712e-06, |
|
"loss": 0.0037, |
|
"step": 87400 |
|
}, |
|
{ |
|
"epoch": 16.720810242690618, |
|
"grad_norm": 0.08388692140579224, |
|
"learning_rate": 8.237327188940093e-06, |
|
"loss": 0.0024, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 16.739919740110835, |
|
"grad_norm": 0.024774732068181038, |
|
"learning_rate": 8.189324116743472e-06, |
|
"loss": 0.0042, |
|
"step": 87600 |
|
}, |
|
{ |
|
"epoch": 16.759029237531053, |
|
"grad_norm": 0.021856360137462616, |
|
"learning_rate": 8.141321044546852e-06, |
|
"loss": 0.0033, |
|
"step": 87700 |
|
}, |
|
{ |
|
"epoch": 16.77813873495127, |
|
"grad_norm": 0.09436067193746567, |
|
"learning_rate": 8.09331797235023e-06, |
|
"loss": 0.0041, |
|
"step": 87800 |
|
}, |
|
{ |
|
"epoch": 16.797248232371487, |
|
"grad_norm": 1.1164593696594238, |
|
"learning_rate": 8.04531490015361e-06, |
|
"loss": 0.0034, |
|
"step": 87900 |
|
}, |
|
{ |
|
"epoch": 16.816357729791708, |
|
"grad_norm": 0.2703384757041931, |
|
"learning_rate": 7.99731182795699e-06, |
|
"loss": 0.003, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 16.816357729791708, |
|
"eval_events-synergy/entsum_processed_loss": 0.012502993457019329, |
|
"eval_events-synergy/entsum_processed_runtime": 86.9088, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.764, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.697, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 16.835467227211925, |
|
"grad_norm": 0.09698379784822464, |
|
"learning_rate": 7.949308755760369e-06, |
|
"loss": 0.0035, |
|
"step": 88100 |
|
}, |
|
{ |
|
"epoch": 16.854576724632143, |
|
"grad_norm": 0.10032069683074951, |
|
"learning_rate": 7.901305683563747e-06, |
|
"loss": 0.0034, |
|
"step": 88200 |
|
}, |
|
{ |
|
"epoch": 16.87368622205236, |
|
"grad_norm": 0.2772189974784851, |
|
"learning_rate": 7.853302611367128e-06, |
|
"loss": 0.0032, |
|
"step": 88300 |
|
}, |
|
{ |
|
"epoch": 16.892795719472577, |
|
"grad_norm": 1.200387716293335, |
|
"learning_rate": 7.805299539170508e-06, |
|
"loss": 0.004, |
|
"step": 88400 |
|
}, |
|
{ |
|
"epoch": 16.911905216892794, |
|
"grad_norm": 0.41922566294670105, |
|
"learning_rate": 7.757296466973887e-06, |
|
"loss": 0.0037, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 16.931014714313015, |
|
"grad_norm": 0.030310144647955894, |
|
"learning_rate": 7.709293394777266e-06, |
|
"loss": 0.0038, |
|
"step": 88600 |
|
}, |
|
{ |
|
"epoch": 16.950124211733232, |
|
"grad_norm": 0.7613725066184998, |
|
"learning_rate": 7.661290322580646e-06, |
|
"loss": 0.0027, |
|
"step": 88700 |
|
}, |
|
{ |
|
"epoch": 16.96923370915345, |
|
"grad_norm": 0.3432716727256775, |
|
"learning_rate": 7.613287250384026e-06, |
|
"loss": 0.0038, |
|
"step": 88800 |
|
}, |
|
{ |
|
"epoch": 16.988343206573667, |
|
"grad_norm": 0.07921448349952698, |
|
"learning_rate": 7.565284178187404e-06, |
|
"loss": 0.0031, |
|
"step": 88900 |
|
}, |
|
{ |
|
"epoch": 17.007452703993884, |
|
"grad_norm": 1.4262216091156006, |
|
"learning_rate": 7.517281105990784e-06, |
|
"loss": 0.0031, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 17.007452703993884, |
|
"eval_events-synergy/entsum_processed_loss": 0.01249308604747057, |
|
"eval_events-synergy/entsum_processed_runtime": 84.734, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 27.451, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.869, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 17.0265622014141, |
|
"grad_norm": 0.5929850339889526, |
|
"learning_rate": 7.469278033794164e-06, |
|
"loss": 0.0033, |
|
"step": 89100 |
|
}, |
|
{ |
|
"epoch": 17.045671698834322, |
|
"grad_norm": 0.01337860431522131, |
|
"learning_rate": 7.4212749615975425e-06, |
|
"loss": 0.0031, |
|
"step": 89200 |
|
}, |
|
{ |
|
"epoch": 17.06478119625454, |
|
"grad_norm": 0.436662882566452, |
|
"learning_rate": 7.373271889400922e-06, |
|
"loss": 0.0033, |
|
"step": 89300 |
|
}, |
|
{ |
|
"epoch": 17.083890693674757, |
|
"grad_norm": 0.031468212604522705, |
|
"learning_rate": 7.325268817204302e-06, |
|
"loss": 0.0025, |
|
"step": 89400 |
|
}, |
|
{ |
|
"epoch": 17.103000191094974, |
|
"grad_norm": 0.12954148650169373, |
|
"learning_rate": 7.27726574500768e-06, |
|
"loss": 0.0029, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 17.12210968851519, |
|
"grad_norm": 0.04875854402780533, |
|
"learning_rate": 7.22926267281106e-06, |
|
"loss": 0.0031, |
|
"step": 89600 |
|
}, |
|
{ |
|
"epoch": 17.14121918593541, |
|
"grad_norm": 0.20842742919921875, |
|
"learning_rate": 7.18125960061444e-06, |
|
"loss": 0.0048, |
|
"step": 89700 |
|
}, |
|
{ |
|
"epoch": 17.160328683355626, |
|
"grad_norm": 0.022663576528429985, |
|
"learning_rate": 7.133256528417818e-06, |
|
"loss": 0.0052, |
|
"step": 89800 |
|
}, |
|
{ |
|
"epoch": 17.179438180775847, |
|
"grad_norm": 0.00754079595208168, |
|
"learning_rate": 7.085253456221199e-06, |
|
"loss": 0.0033, |
|
"step": 89900 |
|
}, |
|
{ |
|
"epoch": 17.198547678196064, |
|
"grad_norm": 0.06156223267316818, |
|
"learning_rate": 7.037250384024578e-06, |
|
"loss": 0.0031, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 17.198547678196064, |
|
"eval_events-synergy/entsum_processed_loss": 0.012280634604394436, |
|
"eval_events-synergy/entsum_processed_runtime": 85.4755, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 27.212, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.809, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 17.21765717561628, |
|
"grad_norm": 0.011858302168548107, |
|
"learning_rate": 6.989247311827957e-06, |
|
"loss": 0.0032, |
|
"step": 90100 |
|
}, |
|
{ |
|
"epoch": 17.2367666730365, |
|
"grad_norm": 0.23627856373786926, |
|
"learning_rate": 6.941244239631337e-06, |
|
"loss": 0.0037, |
|
"step": 90200 |
|
}, |
|
{ |
|
"epoch": 17.255876170456716, |
|
"grad_norm": 0.6709181666374207, |
|
"learning_rate": 6.893241167434716e-06, |
|
"loss": 0.0038, |
|
"step": 90300 |
|
}, |
|
{ |
|
"epoch": 17.274985667876933, |
|
"grad_norm": 0.13345150649547577, |
|
"learning_rate": 6.845238095238096e-06, |
|
"loss": 0.0032, |
|
"step": 90400 |
|
}, |
|
{ |
|
"epoch": 17.294095165297154, |
|
"grad_norm": 0.00865347869694233, |
|
"learning_rate": 6.7972350230414745e-06, |
|
"loss": 0.0033, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 17.31320466271737, |
|
"grad_norm": 0.2757847309112549, |
|
"learning_rate": 6.749231950844854e-06, |
|
"loss": 0.003, |
|
"step": 90600 |
|
}, |
|
{ |
|
"epoch": 17.33231416013759, |
|
"grad_norm": 0.2280185967683792, |
|
"learning_rate": 6.7012288786482346e-06, |
|
"loss": 0.0026, |
|
"step": 90700 |
|
}, |
|
{ |
|
"epoch": 17.351423657557806, |
|
"grad_norm": 0.3780621886253357, |
|
"learning_rate": 6.6532258064516125e-06, |
|
"loss": 0.0038, |
|
"step": 90800 |
|
}, |
|
{ |
|
"epoch": 17.370533154978023, |
|
"grad_norm": 0.2748869061470032, |
|
"learning_rate": 6.605222734254993e-06, |
|
"loss": 0.0029, |
|
"step": 90900 |
|
}, |
|
{ |
|
"epoch": 17.38964265239824, |
|
"grad_norm": 0.7025009393692017, |
|
"learning_rate": 6.5572196620583725e-06, |
|
"loss": 0.0033, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 17.38964265239824, |
|
"eval_events-synergy/entsum_processed_loss": 0.012524567544460297, |
|
"eval_events-synergy/entsum_processed_runtime": 85.033, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 27.354, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.844, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 17.40875214981846, |
|
"grad_norm": 0.07058654725551605, |
|
"learning_rate": 6.509216589861751e-06, |
|
"loss": 0.0024, |
|
"step": 91100 |
|
}, |
|
{ |
|
"epoch": 17.42786164723868, |
|
"grad_norm": 0.016350701451301575, |
|
"learning_rate": 6.461213517665131e-06, |
|
"loss": 0.0032, |
|
"step": 91200 |
|
}, |
|
{ |
|
"epoch": 17.446971144658896, |
|
"grad_norm": 1.0580590963363647, |
|
"learning_rate": 6.41321044546851e-06, |
|
"loss": 0.0033, |
|
"step": 91300 |
|
}, |
|
{ |
|
"epoch": 17.466080642079113, |
|
"grad_norm": 0.029303809627890587, |
|
"learning_rate": 6.365207373271889e-06, |
|
"loss": 0.0028, |
|
"step": 91400 |
|
}, |
|
{ |
|
"epoch": 17.48519013949933, |
|
"grad_norm": 0.45363864302635193, |
|
"learning_rate": 6.317204301075269e-06, |
|
"loss": 0.0046, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 17.504299636919548, |
|
"grad_norm": 0.15257449448108673, |
|
"learning_rate": 6.269201228878649e-06, |
|
"loss": 0.0024, |
|
"step": 91600 |
|
}, |
|
{ |
|
"epoch": 17.523409134339765, |
|
"grad_norm": 0.4897935688495636, |
|
"learning_rate": 6.221198156682028e-06, |
|
"loss": 0.0026, |
|
"step": 91700 |
|
}, |
|
{ |
|
"epoch": 17.542518631759986, |
|
"grad_norm": 0.09276042878627777, |
|
"learning_rate": 6.1731950844854075e-06, |
|
"loss": 0.0042, |
|
"step": 91800 |
|
}, |
|
{ |
|
"epoch": 17.561628129180203, |
|
"grad_norm": 0.053013380616903305, |
|
"learning_rate": 6.125192012288787e-06, |
|
"loss": 0.0034, |
|
"step": 91900 |
|
}, |
|
{ |
|
"epoch": 17.58073762660042, |
|
"grad_norm": 0.004806125536561012, |
|
"learning_rate": 6.077188940092167e-06, |
|
"loss": 0.0032, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 17.58073762660042, |
|
"eval_events-synergy/entsum_processed_loss": 0.012403319589793682, |
|
"eval_events-synergy/entsum_processed_runtime": 84.7299, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 27.452, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.869, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 17.599847124020638, |
|
"grad_norm": 0.5843932032585144, |
|
"learning_rate": 6.029185867895545e-06, |
|
"loss": 0.004, |
|
"step": 92100 |
|
}, |
|
{ |
|
"epoch": 17.618956621440855, |
|
"grad_norm": 0.1348479837179184, |
|
"learning_rate": 5.981182795698925e-06, |
|
"loss": 0.0024, |
|
"step": 92200 |
|
}, |
|
{ |
|
"epoch": 17.638066118861072, |
|
"grad_norm": 0.7288301587104797, |
|
"learning_rate": 5.9331797235023045e-06, |
|
"loss": 0.0039, |
|
"step": 92300 |
|
}, |
|
{ |
|
"epoch": 17.657175616281293, |
|
"grad_norm": 0.018965117633342743, |
|
"learning_rate": 5.885176651305683e-06, |
|
"loss": 0.0034, |
|
"step": 92400 |
|
}, |
|
{ |
|
"epoch": 17.67628511370151, |
|
"grad_norm": 0.21183770895004272, |
|
"learning_rate": 5.837173579109064e-06, |
|
"loss": 0.0029, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 17.695394611121728, |
|
"grad_norm": 0.28580865263938904, |
|
"learning_rate": 5.7891705069124425e-06, |
|
"loss": 0.0036, |
|
"step": 92600 |
|
}, |
|
{ |
|
"epoch": 17.714504108541945, |
|
"grad_norm": 0.09748672693967819, |
|
"learning_rate": 5.741167434715822e-06, |
|
"loss": 0.0038, |
|
"step": 92700 |
|
}, |
|
{ |
|
"epoch": 17.733613605962162, |
|
"grad_norm": 0.03202127292752266, |
|
"learning_rate": 5.693164362519202e-06, |
|
"loss": 0.0028, |
|
"step": 92800 |
|
}, |
|
{ |
|
"epoch": 17.75272310338238, |
|
"grad_norm": 0.5783613920211792, |
|
"learning_rate": 5.64516129032258e-06, |
|
"loss": 0.0025, |
|
"step": 92900 |
|
}, |
|
{ |
|
"epoch": 17.7718326008026, |
|
"grad_norm": 0.1477188616991043, |
|
"learning_rate": 5.597158218125961e-06, |
|
"loss": 0.0029, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 17.7718326008026, |
|
"eval_events-synergy/entsum_processed_loss": 0.012467662803828716, |
|
"eval_events-synergy/entsum_processed_runtime": 86.2702, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.962, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.746, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 17.790942098222818, |
|
"grad_norm": 0.029744425788521767, |
|
"learning_rate": 5.5491551459293395e-06, |
|
"loss": 0.0019, |
|
"step": 93100 |
|
}, |
|
{ |
|
"epoch": 17.810051595643035, |
|
"grad_norm": 0.006620775908231735, |
|
"learning_rate": 5.501152073732719e-06, |
|
"loss": 0.004, |
|
"step": 93200 |
|
}, |
|
{ |
|
"epoch": 17.829161093063252, |
|
"grad_norm": 0.03536098822951317, |
|
"learning_rate": 5.453149001536099e-06, |
|
"loss": 0.0029, |
|
"step": 93300 |
|
}, |
|
{ |
|
"epoch": 17.84827059048347, |
|
"grad_norm": 0.18313711881637573, |
|
"learning_rate": 5.405145929339478e-06, |
|
"loss": 0.0034, |
|
"step": 93400 |
|
}, |
|
{ |
|
"epoch": 17.867380087903687, |
|
"grad_norm": 0.11048083007335663, |
|
"learning_rate": 5.357142857142857e-06, |
|
"loss": 0.0032, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 17.886489585323908, |
|
"grad_norm": 0.02983025275170803, |
|
"learning_rate": 5.309139784946237e-06, |
|
"loss": 0.0028, |
|
"step": 93600 |
|
}, |
|
{ |
|
"epoch": 17.905599082744125, |
|
"grad_norm": 0.050138670951128006, |
|
"learning_rate": 5.261136712749616e-06, |
|
"loss": 0.0028, |
|
"step": 93700 |
|
}, |
|
{ |
|
"epoch": 17.924708580164342, |
|
"grad_norm": 0.015463109128177166, |
|
"learning_rate": 5.213133640552996e-06, |
|
"loss": 0.0023, |
|
"step": 93800 |
|
}, |
|
{ |
|
"epoch": 17.94381807758456, |
|
"grad_norm": 0.3205445408821106, |
|
"learning_rate": 5.165130568356375e-06, |
|
"loss": 0.0047, |
|
"step": 93900 |
|
}, |
|
{ |
|
"epoch": 17.962927575004777, |
|
"grad_norm": 0.02310132421553135, |
|
"learning_rate": 5.117127496159754e-06, |
|
"loss": 0.0019, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 17.962927575004777, |
|
"eval_events-synergy/entsum_processed_loss": 0.012523534707725048, |
|
"eval_events-synergy/entsum_processed_runtime": 84.462, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 27.539, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.891, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 17.982037072424994, |
|
"grad_norm": 0.09872489422559738, |
|
"learning_rate": 5.0691244239631346e-06, |
|
"loss": 0.0027, |
|
"step": 94100 |
|
}, |
|
{ |
|
"epoch": 18.00114656984521, |
|
"grad_norm": 0.009674746543169022, |
|
"learning_rate": 5.021121351766513e-06, |
|
"loss": 0.0025, |
|
"step": 94200 |
|
}, |
|
{ |
|
"epoch": 18.020256067265432, |
|
"grad_norm": 0.1077992394566536, |
|
"learning_rate": 4.973118279569893e-06, |
|
"loss": 0.003, |
|
"step": 94300 |
|
}, |
|
{ |
|
"epoch": 18.03936556468565, |
|
"grad_norm": 0.15195026993751526, |
|
"learning_rate": 4.9251152073732725e-06, |
|
"loss": 0.0035, |
|
"step": 94400 |
|
}, |
|
{ |
|
"epoch": 18.058475062105867, |
|
"grad_norm": 0.048992399126291275, |
|
"learning_rate": 4.877112135176651e-06, |
|
"loss": 0.0026, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 18.077584559526084, |
|
"grad_norm": 0.0054066977463662624, |
|
"learning_rate": 4.829109062980031e-06, |
|
"loss": 0.0027, |
|
"step": 94600 |
|
}, |
|
{ |
|
"epoch": 18.0966940569463, |
|
"grad_norm": 0.07554273307323456, |
|
"learning_rate": 4.78110599078341e-06, |
|
"loss": 0.0031, |
|
"step": 94700 |
|
}, |
|
{ |
|
"epoch": 18.11580355436652, |
|
"grad_norm": 0.184183269739151, |
|
"learning_rate": 4.73310291858679e-06, |
|
"loss": 0.0031, |
|
"step": 94800 |
|
}, |
|
{ |
|
"epoch": 18.13491305178674, |
|
"grad_norm": 0.5222034454345703, |
|
"learning_rate": 4.6850998463901695e-06, |
|
"loss": 0.0032, |
|
"step": 94900 |
|
}, |
|
{ |
|
"epoch": 18.154022549206957, |
|
"grad_norm": 0.07612292468547821, |
|
"learning_rate": 4.637096774193548e-06, |
|
"loss": 0.0025, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 18.154022549206957, |
|
"eval_events-synergy/entsum_processed_loss": 0.012190896086394787, |
|
"eval_events-synergy/entsum_processed_runtime": 84.6592, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 27.475, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.875, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 18.173132046627174, |
|
"grad_norm": 0.3161191940307617, |
|
"learning_rate": 4.589093701996928e-06, |
|
"loss": 0.0028, |
|
"step": 95100 |
|
}, |
|
{ |
|
"epoch": 18.19224154404739, |
|
"grad_norm": 0.0276503823697567, |
|
"learning_rate": 4.5410906298003075e-06, |
|
"loss": 0.0026, |
|
"step": 95200 |
|
}, |
|
{ |
|
"epoch": 18.21135104146761, |
|
"grad_norm": 0.03638790175318718, |
|
"learning_rate": 4.493087557603687e-06, |
|
"loss": 0.003, |
|
"step": 95300 |
|
}, |
|
{ |
|
"epoch": 18.230460538887826, |
|
"grad_norm": 0.11877056211233139, |
|
"learning_rate": 4.445084485407066e-06, |
|
"loss": 0.0036, |
|
"step": 95400 |
|
}, |
|
{ |
|
"epoch": 18.249570036308047, |
|
"grad_norm": 0.033486805856227875, |
|
"learning_rate": 4.397081413210446e-06, |
|
"loss": 0.0032, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 18.268679533728264, |
|
"grad_norm": 0.07359365373849869, |
|
"learning_rate": 4.349078341013825e-06, |
|
"loss": 0.0023, |
|
"step": 95600 |
|
}, |
|
{ |
|
"epoch": 18.28778903114848, |
|
"grad_norm": 0.07991454005241394, |
|
"learning_rate": 4.3010752688172045e-06, |
|
"loss": 0.0029, |
|
"step": 95700 |
|
}, |
|
{ |
|
"epoch": 18.3068985285687, |
|
"grad_norm": 0.0038367731031030416, |
|
"learning_rate": 4.253072196620584e-06, |
|
"loss": 0.0022, |
|
"step": 95800 |
|
}, |
|
{ |
|
"epoch": 18.326008025988916, |
|
"grad_norm": 0.33883294463157654, |
|
"learning_rate": 4.205069124423963e-06, |
|
"loss": 0.0035, |
|
"step": 95900 |
|
}, |
|
{ |
|
"epoch": 18.345117523409133, |
|
"grad_norm": 0.02871253900229931, |
|
"learning_rate": 4.157066052227343e-06, |
|
"loss": 0.0033, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 18.345117523409133, |
|
"eval_events-synergy/entsum_processed_loss": 0.012162734754383564, |
|
"eval_events-synergy/entsum_processed_runtime": 86.128, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 27.006, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.757, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 18.364227020829354, |
|
"grad_norm": 0.3818151354789734, |
|
"learning_rate": 4.109062980030722e-06, |
|
"loss": 0.0024, |
|
"step": 96100 |
|
}, |
|
{ |
|
"epoch": 18.38333651824957, |
|
"grad_norm": 0.4807966649532318, |
|
"learning_rate": 4.061059907834102e-06, |
|
"loss": 0.0024, |
|
"step": 96200 |
|
}, |
|
{ |
|
"epoch": 18.40244601566979, |
|
"grad_norm": 0.5500592589378357, |
|
"learning_rate": 4.013056835637481e-06, |
|
"loss": 0.004, |
|
"step": 96300 |
|
}, |
|
{ |
|
"epoch": 18.421555513090006, |
|
"grad_norm": 0.03873921185731888, |
|
"learning_rate": 3.96505376344086e-06, |
|
"loss": 0.0033, |
|
"step": 96400 |
|
}, |
|
{ |
|
"epoch": 18.440665010510223, |
|
"grad_norm": 0.003130296478047967, |
|
"learning_rate": 3.9170506912442395e-06, |
|
"loss": 0.0031, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 18.45977450793044, |
|
"grad_norm": 0.04637152701616287, |
|
"learning_rate": 3.869047619047619e-06, |
|
"loss": 0.0043, |
|
"step": 96600 |
|
}, |
|
{ |
|
"epoch": 18.478884005350658, |
|
"grad_norm": 0.1211615651845932, |
|
"learning_rate": 3.821044546850999e-06, |
|
"loss": 0.0032, |
|
"step": 96700 |
|
}, |
|
{ |
|
"epoch": 18.49799350277088, |
|
"grad_norm": 0.17123758792877197, |
|
"learning_rate": 3.7730414746543783e-06, |
|
"loss": 0.002, |
|
"step": 96800 |
|
}, |
|
{ |
|
"epoch": 18.517103000191096, |
|
"grad_norm": 0.2270338535308838, |
|
"learning_rate": 3.7250384024577574e-06, |
|
"loss": 0.0029, |
|
"step": 96900 |
|
}, |
|
{ |
|
"epoch": 18.536212497611313, |
|
"grad_norm": 0.07671944797039032, |
|
"learning_rate": 3.6770353302611366e-06, |
|
"loss": 0.003, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 18.536212497611313, |
|
"eval_events-synergy/entsum_processed_loss": 0.012130416929721832, |
|
"eval_events-synergy/entsum_processed_runtime": 86.1559, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.998, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.755, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 18.55532199503153, |
|
"grad_norm": 0.5913010835647583, |
|
"learning_rate": 3.6290322580645166e-06, |
|
"loss": 0.0028, |
|
"step": 97100 |
|
}, |
|
{ |
|
"epoch": 18.574431492451748, |
|
"grad_norm": 1.1253652572631836, |
|
"learning_rate": 3.5810291858678958e-06, |
|
"loss": 0.0036, |
|
"step": 97200 |
|
}, |
|
{ |
|
"epoch": 18.593540989871965, |
|
"grad_norm": 0.029553644359111786, |
|
"learning_rate": 3.533026113671275e-06, |
|
"loss": 0.0034, |
|
"step": 97300 |
|
}, |
|
{ |
|
"epoch": 18.612650487292186, |
|
"grad_norm": 0.057064156979322433, |
|
"learning_rate": 3.4850230414746545e-06, |
|
"loss": 0.003, |
|
"step": 97400 |
|
}, |
|
{ |
|
"epoch": 18.631759984712403, |
|
"grad_norm": 0.08024504035711288, |
|
"learning_rate": 3.4370199692780337e-06, |
|
"loss": 0.0028, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 18.65086948213262, |
|
"grad_norm": 0.17282864451408386, |
|
"learning_rate": 3.389016897081413e-06, |
|
"loss": 0.0026, |
|
"step": 97600 |
|
}, |
|
{ |
|
"epoch": 18.669978979552837, |
|
"grad_norm": 0.046736475080251694, |
|
"learning_rate": 3.341013824884793e-06, |
|
"loss": 0.0031, |
|
"step": 97700 |
|
}, |
|
{ |
|
"epoch": 18.689088476973055, |
|
"grad_norm": 0.01335865817964077, |
|
"learning_rate": 3.293010752688172e-06, |
|
"loss": 0.0033, |
|
"step": 97800 |
|
}, |
|
{ |
|
"epoch": 18.708197974393272, |
|
"grad_norm": 0.24637210369110107, |
|
"learning_rate": 3.245007680491552e-06, |
|
"loss": 0.0021, |
|
"step": 97900 |
|
}, |
|
{ |
|
"epoch": 18.727307471813493, |
|
"grad_norm": 0.11811818182468414, |
|
"learning_rate": 3.197004608294931e-06, |
|
"loss": 0.0022, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 18.727307471813493, |
|
"eval_events-synergy/entsum_processed_loss": 0.012472053989768028, |
|
"eval_events-synergy/entsum_processed_runtime": 85.9829, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 27.052, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.769, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 18.74641696923371, |
|
"grad_norm": 0.2450024038553238, |
|
"learning_rate": 3.1490015360983104e-06, |
|
"loss": 0.003, |
|
"step": 98100 |
|
}, |
|
{ |
|
"epoch": 18.765526466653927, |
|
"grad_norm": 0.06988529115915298, |
|
"learning_rate": 3.1009984639016895e-06, |
|
"loss": 0.0029, |
|
"step": 98200 |
|
}, |
|
{ |
|
"epoch": 18.784635964074145, |
|
"grad_norm": 0.05243882164359093, |
|
"learning_rate": 3.052995391705069e-06, |
|
"loss": 0.0028, |
|
"step": 98300 |
|
}, |
|
{ |
|
"epoch": 18.803745461494362, |
|
"grad_norm": 0.005393981467932463, |
|
"learning_rate": 3.0049923195084487e-06, |
|
"loss": 0.0033, |
|
"step": 98400 |
|
}, |
|
{ |
|
"epoch": 18.82285495891458, |
|
"grad_norm": 1.3344509601593018, |
|
"learning_rate": 2.9569892473118283e-06, |
|
"loss": 0.0029, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 18.8419644563348, |
|
"grad_norm": 0.0016993408789858222, |
|
"learning_rate": 2.9089861751152074e-06, |
|
"loss": 0.0017, |
|
"step": 98600 |
|
}, |
|
{ |
|
"epoch": 18.861073953755017, |
|
"grad_norm": 0.010616108775138855, |
|
"learning_rate": 2.860983102918587e-06, |
|
"loss": 0.0042, |
|
"step": 98700 |
|
}, |
|
{ |
|
"epoch": 18.880183451175235, |
|
"grad_norm": 0.01788509450852871, |
|
"learning_rate": 2.8129800307219666e-06, |
|
"loss": 0.0028, |
|
"step": 98800 |
|
}, |
|
{ |
|
"epoch": 18.899292948595452, |
|
"grad_norm": 0.07264339178800583, |
|
"learning_rate": 2.7649769585253458e-06, |
|
"loss": 0.0028, |
|
"step": 98900 |
|
}, |
|
{ |
|
"epoch": 18.91840244601567, |
|
"grad_norm": 0.6919617652893066, |
|
"learning_rate": 2.716973886328725e-06, |
|
"loss": 0.0022, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 18.91840244601567, |
|
"eval_events-synergy/entsum_processed_loss": 0.012552082538604736, |
|
"eval_events-synergy/entsum_processed_runtime": 86.508, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.888, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.728, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 18.937511943435887, |
|
"grad_norm": 0.13486118614673615, |
|
"learning_rate": 2.6689708141321045e-06, |
|
"loss": 0.0025, |
|
"step": 99100 |
|
}, |
|
{ |
|
"epoch": 18.956621440856104, |
|
"grad_norm": 0.36750730872154236, |
|
"learning_rate": 2.620967741935484e-06, |
|
"loss": 0.0029, |
|
"step": 99200 |
|
}, |
|
{ |
|
"epoch": 18.975730938276325, |
|
"grad_norm": 0.01782979629933834, |
|
"learning_rate": 2.5729646697388633e-06, |
|
"loss": 0.002, |
|
"step": 99300 |
|
}, |
|
{ |
|
"epoch": 18.994840435696542, |
|
"grad_norm": 0.012732760980725288, |
|
"learning_rate": 2.524961597542243e-06, |
|
"loss": 0.0019, |
|
"step": 99400 |
|
}, |
|
{ |
|
"epoch": 19.01394993311676, |
|
"grad_norm": 0.8101866245269775, |
|
"learning_rate": 2.4769585253456224e-06, |
|
"loss": 0.003, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 19.033059430536976, |
|
"grad_norm": 0.06456371396780014, |
|
"learning_rate": 2.4289554531490016e-06, |
|
"loss": 0.0021, |
|
"step": 99600 |
|
}, |
|
{ |
|
"epoch": 19.052168927957194, |
|
"grad_norm": 0.18568381667137146, |
|
"learning_rate": 2.3809523809523808e-06, |
|
"loss": 0.003, |
|
"step": 99700 |
|
}, |
|
{ |
|
"epoch": 19.07127842537741, |
|
"grad_norm": 0.025632772594690323, |
|
"learning_rate": 2.3329493087557603e-06, |
|
"loss": 0.0033, |
|
"step": 99800 |
|
}, |
|
{ |
|
"epoch": 19.090387922797632, |
|
"grad_norm": 0.4020284414291382, |
|
"learning_rate": 2.28494623655914e-06, |
|
"loss": 0.0024, |
|
"step": 99900 |
|
}, |
|
{ |
|
"epoch": 19.10949742021785, |
|
"grad_norm": 0.02960309013724327, |
|
"learning_rate": 2.2369431643625195e-06, |
|
"loss": 0.002, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 19.10949742021785, |
|
"eval_events-synergy/entsum_processed_loss": 0.01250072568655014, |
|
"eval_events-synergy/entsum_processed_runtime": 86.4178, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.916, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.735, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 19.128606917638066, |
|
"grad_norm": 0.0692773312330246, |
|
"learning_rate": 2.1889400921658987e-06, |
|
"loss": 0.0019, |
|
"step": 100100 |
|
}, |
|
{ |
|
"epoch": 19.147716415058284, |
|
"grad_norm": 1.2192124128341675, |
|
"learning_rate": 2.1409370199692783e-06, |
|
"loss": 0.003, |
|
"step": 100200 |
|
}, |
|
{ |
|
"epoch": 19.1668259124785, |
|
"grad_norm": 0.027012521401047707, |
|
"learning_rate": 2.0929339477726574e-06, |
|
"loss": 0.0028, |
|
"step": 100300 |
|
}, |
|
{ |
|
"epoch": 19.18593540989872, |
|
"grad_norm": 1.0921802520751953, |
|
"learning_rate": 2.044930875576037e-06, |
|
"loss": 0.003, |
|
"step": 100400 |
|
}, |
|
{ |
|
"epoch": 19.20504490731894, |
|
"grad_norm": 0.047722671180963516, |
|
"learning_rate": 1.996927803379416e-06, |
|
"loss": 0.0034, |
|
"step": 100500 |
|
}, |
|
{ |
|
"epoch": 19.224154404739156, |
|
"grad_norm": 0.14935222268104553, |
|
"learning_rate": 1.9489247311827958e-06, |
|
"loss": 0.0036, |
|
"step": 100600 |
|
}, |
|
{ |
|
"epoch": 19.243263902159374, |
|
"grad_norm": 0.35229289531707764, |
|
"learning_rate": 1.9009216589861754e-06, |
|
"loss": 0.0026, |
|
"step": 100700 |
|
}, |
|
{ |
|
"epoch": 19.26237339957959, |
|
"grad_norm": 0.09162531048059464, |
|
"learning_rate": 1.8529185867895545e-06, |
|
"loss": 0.0029, |
|
"step": 100800 |
|
}, |
|
{ |
|
"epoch": 19.281482896999808, |
|
"grad_norm": 0.10979507118463516, |
|
"learning_rate": 1.8049155145929339e-06, |
|
"loss": 0.0027, |
|
"step": 100900 |
|
}, |
|
{ |
|
"epoch": 19.300592394420025, |
|
"grad_norm": 0.11148755252361298, |
|
"learning_rate": 1.7569124423963135e-06, |
|
"loss": 0.0029, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 19.300592394420025, |
|
"eval_events-synergy/entsum_processed_loss": 0.012415879406034946, |
|
"eval_events-synergy/entsum_processed_runtime": 86.4215, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.915, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.734, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 19.319701891840246, |
|
"grad_norm": 0.345984548330307, |
|
"learning_rate": 1.708909370199693e-06, |
|
"loss": 0.0025, |
|
"step": 101100 |
|
}, |
|
{ |
|
"epoch": 19.338811389260464, |
|
"grad_norm": 0.08527397364377975, |
|
"learning_rate": 1.6609062980030722e-06, |
|
"loss": 0.0021, |
|
"step": 101200 |
|
}, |
|
{ |
|
"epoch": 19.35792088668068, |
|
"grad_norm": 0.4920309782028198, |
|
"learning_rate": 1.6129032258064516e-06, |
|
"loss": 0.0025, |
|
"step": 101300 |
|
}, |
|
{ |
|
"epoch": 19.377030384100898, |
|
"grad_norm": 0.28533655405044556, |
|
"learning_rate": 1.5649001536098312e-06, |
|
"loss": 0.0024, |
|
"step": 101400 |
|
}, |
|
{ |
|
"epoch": 19.396139881521115, |
|
"grad_norm": 0.008158371783792973, |
|
"learning_rate": 1.5168970814132106e-06, |
|
"loss": 0.0019, |
|
"step": 101500 |
|
}, |
|
{ |
|
"epoch": 19.415249378941333, |
|
"grad_norm": 0.00896093063056469, |
|
"learning_rate": 1.46889400921659e-06, |
|
"loss": 0.0028, |
|
"step": 101600 |
|
}, |
|
{ |
|
"epoch": 19.43435887636155, |
|
"grad_norm": 0.11068089306354523, |
|
"learning_rate": 1.4208909370199693e-06, |
|
"loss": 0.0036, |
|
"step": 101700 |
|
}, |
|
{ |
|
"epoch": 19.45346837378177, |
|
"grad_norm": 0.029200438410043716, |
|
"learning_rate": 1.3728878648233489e-06, |
|
"loss": 0.0029, |
|
"step": 101800 |
|
}, |
|
{ |
|
"epoch": 19.472577871201988, |
|
"grad_norm": 0.06840227544307709, |
|
"learning_rate": 1.3248847926267283e-06, |
|
"loss": 0.0029, |
|
"step": 101900 |
|
}, |
|
{ |
|
"epoch": 19.491687368622205, |
|
"grad_norm": 0.23425918817520142, |
|
"learning_rate": 1.2768817204301076e-06, |
|
"loss": 0.0022, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 19.491687368622205, |
|
"eval_events-synergy/entsum_processed_loss": 0.012387688271701336, |
|
"eval_events-synergy/entsum_processed_runtime": 86.7065, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.826, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.712, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 19.510796866042423, |
|
"grad_norm": 0.08164483308792114, |
|
"learning_rate": 1.228878648233487e-06, |
|
"loss": 0.0026, |
|
"step": 102100 |
|
}, |
|
{ |
|
"epoch": 19.52990636346264, |
|
"grad_norm": 0.4924757778644562, |
|
"learning_rate": 1.1808755760368664e-06, |
|
"loss": 0.003, |
|
"step": 102200 |
|
}, |
|
{ |
|
"epoch": 19.549015860882857, |
|
"grad_norm": 0.2224261462688446, |
|
"learning_rate": 1.1328725038402458e-06, |
|
"loss": 0.0022, |
|
"step": 102300 |
|
}, |
|
{ |
|
"epoch": 19.568125358303078, |
|
"grad_norm": 0.08271826058626175, |
|
"learning_rate": 1.0848694316436251e-06, |
|
"loss": 0.0035, |
|
"step": 102400 |
|
}, |
|
{ |
|
"epoch": 19.587234855723295, |
|
"grad_norm": 0.09882861375808716, |
|
"learning_rate": 1.0368663594470047e-06, |
|
"loss": 0.0029, |
|
"step": 102500 |
|
}, |
|
{ |
|
"epoch": 19.606344353143513, |
|
"grad_norm": 0.054669950157403946, |
|
"learning_rate": 9.88863287250384e-07, |
|
"loss": 0.0027, |
|
"step": 102600 |
|
}, |
|
{ |
|
"epoch": 19.62545385056373, |
|
"grad_norm": 1.1425254344940186, |
|
"learning_rate": 9.408602150537635e-07, |
|
"loss": 0.0034, |
|
"step": 102700 |
|
}, |
|
{ |
|
"epoch": 19.644563347983947, |
|
"grad_norm": 0.0010029702680185437, |
|
"learning_rate": 8.928571428571428e-07, |
|
"loss": 0.0023, |
|
"step": 102800 |
|
}, |
|
{ |
|
"epoch": 19.663672845404164, |
|
"grad_norm": 0.025402860715985298, |
|
"learning_rate": 8.448540706605223e-07, |
|
"loss": 0.0029, |
|
"step": 102900 |
|
}, |
|
{ |
|
"epoch": 19.682782342824385, |
|
"grad_norm": 0.2910292446613312, |
|
"learning_rate": 7.968509984639017e-07, |
|
"loss": 0.0039, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 19.682782342824385, |
|
"eval_events-synergy/entsum_processed_loss": 0.012337877415120602, |
|
"eval_events-synergy/entsum_processed_runtime": 86.2586, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.965, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.747, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 19.701891840244603, |
|
"grad_norm": 0.0035727862268686295, |
|
"learning_rate": 7.488479262672811e-07, |
|
"loss": 0.0029, |
|
"step": 103100 |
|
}, |
|
{ |
|
"epoch": 19.72100133766482, |
|
"grad_norm": 0.25086501240730286, |
|
"learning_rate": 7.008448540706605e-07, |
|
"loss": 0.0028, |
|
"step": 103200 |
|
}, |
|
{ |
|
"epoch": 19.740110835085037, |
|
"grad_norm": 0.00701789790764451, |
|
"learning_rate": 6.528417818740399e-07, |
|
"loss": 0.0031, |
|
"step": 103300 |
|
}, |
|
{ |
|
"epoch": 19.759220332505254, |
|
"grad_norm": 0.8931819200515747, |
|
"learning_rate": 6.048387096774194e-07, |
|
"loss": 0.0037, |
|
"step": 103400 |
|
}, |
|
{ |
|
"epoch": 19.77832982992547, |
|
"grad_norm": 0.021326079964637756, |
|
"learning_rate": 5.568356374807988e-07, |
|
"loss": 0.0023, |
|
"step": 103500 |
|
}, |
|
{ |
|
"epoch": 19.797439327345693, |
|
"grad_norm": 0.01552259735763073, |
|
"learning_rate": 5.088325652841783e-07, |
|
"loss": 0.0022, |
|
"step": 103600 |
|
}, |
|
{ |
|
"epoch": 19.81654882476591, |
|
"grad_norm": 0.026935463771224022, |
|
"learning_rate": 4.6082949308755763e-07, |
|
"loss": 0.0023, |
|
"step": 103700 |
|
}, |
|
{ |
|
"epoch": 19.835658322186127, |
|
"grad_norm": 0.013973666355013847, |
|
"learning_rate": 4.1282642089093706e-07, |
|
"loss": 0.0028, |
|
"step": 103800 |
|
}, |
|
{ |
|
"epoch": 19.854767819606344, |
|
"grad_norm": 0.001906028133817017, |
|
"learning_rate": 3.648233486943165e-07, |
|
"loss": 0.0025, |
|
"step": 103900 |
|
}, |
|
{ |
|
"epoch": 19.87387731702656, |
|
"grad_norm": 0.6687213778495789, |
|
"learning_rate": 3.168202764976959e-07, |
|
"loss": 0.0028, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 19.87387731702656, |
|
"eval_events-synergy/entsum_processed_loss": 0.012370930053293705, |
|
"eval_events-synergy/entsum_processed_runtime": 86.4336, |
|
"eval_events-synergy/entsum_processed_samples_per_second": 26.911, |
|
"eval_events-synergy/entsum_processed_steps_per_second": 6.733, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 19.89298681444678, |
|
"grad_norm": 0.019737957045435905, |
|
"learning_rate": 2.688172043010753e-07, |
|
"loss": 0.0023, |
|
"step": 104100 |
|
}, |
|
{ |
|
"epoch": 19.912096311866996, |
|
"grad_norm": 0.14768967032432556, |
|
"learning_rate": 2.2081413210445468e-07, |
|
"loss": 0.0031, |
|
"step": 104200 |
|
}, |
|
{ |
|
"epoch": 19.931205809287217, |
|
"grad_norm": 0.057144857943058014, |
|
"learning_rate": 1.728110599078341e-07, |
|
"loss": 0.0029, |
|
"step": 104300 |
|
}, |
|
{ |
|
"epoch": 19.950315306707434, |
|
"grad_norm": 0.320926696062088, |
|
"learning_rate": 1.248079877112135e-07, |
|
"loss": 0.0022, |
|
"step": 104400 |
|
}, |
|
{ |
|
"epoch": 19.96942480412765, |
|
"grad_norm": 0.519444465637207, |
|
"learning_rate": 7.680491551459294e-08, |
|
"loss": 0.0025, |
|
"step": 104500 |
|
}, |
|
{ |
|
"epoch": 19.98853430154787, |
|
"grad_norm": 0.07837866246700287, |
|
"learning_rate": 2.8801843317972352e-08, |
|
"loss": 0.0024, |
|
"step": 104600 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 104660, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.647758642249728e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|