| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.125, |
| "eval_steps": 2000, |
| "global_step": 5000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.00025, |
| "grad_norm": 39.5, |
| "learning_rate": 0.0001, |
| "loss": 7.8298, |
| "loss/crossentropy": 2.313796639442444, |
| "loss/hidden": 3.414453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21518087349832057, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0005, |
| "grad_norm": 31.5, |
| "grad_norm_var": 5.698893229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.8693, |
| "loss/crossentropy": 2.1564369201660156, |
| "loss/hidden": 3.587109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21401480734348297, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.00075, |
| "grad_norm": 36.0, |
| "grad_norm_var": 6.930143229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.8779, |
| "loss/crossentropy": 2.179039953649044, |
| "loss/hidden": 3.709375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22207003347575666, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.001, |
| "grad_norm": 32.75, |
| "grad_norm_var": 40.942708333333336, |
| "learning_rate": 0.0001, |
| "loss": 7.7653, |
| "loss/crossentropy": 2.074952059984207, |
| "loss/hidden": 3.55625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20403100922703743, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.00125, |
| "grad_norm": 35.75, |
| "grad_norm_var": 94.25729166666666, |
| "learning_rate": 0.0001, |
| "loss": 7.8641, |
| "loss/crossentropy": 2.087546107172966, |
| "loss/hidden": 3.50546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19412125833332539, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.0015, |
| "grad_norm": 30.25, |
| "grad_norm_var": 110.89140625, |
| "learning_rate": 0.0001, |
| "loss": 7.8652, |
| "loss/crossentropy": 2.2259810894727705, |
| "loss/hidden": 3.528125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21063638497143983, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.00175, |
| "grad_norm": 36.0, |
| "grad_norm_var": 62.02805989583333, |
| "learning_rate": 0.0001, |
| "loss": 7.751, |
| "loss/crossentropy": 2.164659637212753, |
| "loss/hidden": 3.47109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19977533183991908, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.002, |
| "grad_norm": 34.25, |
| "grad_norm_var": 8.6759765625, |
| "learning_rate": 0.0001, |
| "loss": 7.7596, |
| "loss/crossentropy": 2.097026476264, |
| "loss/hidden": 3.478125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20290330462157727, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.00225, |
| "grad_norm": 39.75, |
| "grad_norm_var": 71.35598958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.8106, |
| "loss/crossentropy": 2.1291788890957832, |
| "loss/hidden": 3.491796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19771635457873343, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.0025, |
| "grad_norm": 34.25, |
| "grad_norm_var": 9.158072916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.7473, |
| "loss/crossentropy": 2.147798593342304, |
| "loss/hidden": 3.558203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20517258979380132, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.00275, |
| "grad_norm": 31.625, |
| "grad_norm_var": 9.737239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.7738, |
| "loss/crossentropy": 2.1884776622056963, |
| "loss/hidden": 3.458203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20624704901129007, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.003, |
| "grad_norm": 37.75, |
| "grad_norm_var": 335.18170572916665, |
| "learning_rate": 0.0001, |
| "loss": 7.8546, |
| "loss/crossentropy": 2.2224678859114646, |
| "loss/hidden": 3.5296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22259013392031193, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.00325, |
| "grad_norm": 136.0, |
| "grad_norm_var": 1014.1374348958333, |
| "learning_rate": 0.0001, |
| "loss": 7.7227, |
| "loss/crossentropy": 2.135145714879036, |
| "loss/hidden": 3.459765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21895913481712342, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.0035, |
| "grad_norm": 36.25, |
| "grad_norm_var": 663.2072916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6794, |
| "loss/crossentropy": 2.2155070066452027, |
| "loss/hidden": 3.428515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18895817659795283, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.00375, |
| "grad_norm": 38.5, |
| "grad_norm_var": 54.0384765625, |
| "learning_rate": 0.0001, |
| "loss": 7.7461, |
| "loss/crossentropy": 2.1793935388326644, |
| "loss/hidden": 3.5140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19897108823060988, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.004, |
| "grad_norm": 62.0, |
| "grad_norm_var": 99.1447265625, |
| "learning_rate": 0.0001, |
| "loss": 7.7956, |
| "loss/crossentropy": 2.194957372546196, |
| "loss/hidden": 3.597265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22534323409199714, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.00425, |
| "grad_norm": 32.5, |
| "grad_norm_var": 62.60390625, |
| "learning_rate": 0.0001, |
| "loss": 7.7866, |
| "loss/crossentropy": 2.1939920127391814, |
| "loss/hidden": 3.529296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2311840608716011, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.0045, |
| "grad_norm": 33.5, |
| "grad_norm_var": 10.049934895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6691, |
| "loss/crossentropy": 2.1646964073181154, |
| "loss/hidden": 3.416015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2066217228770256, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.00475, |
| "grad_norm": 32.75, |
| "grad_norm_var": 13.889322916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.8529, |
| "loss/crossentropy": 2.135753521323204, |
| "loss/hidden": 3.64140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.23793395943939685, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.005, |
| "grad_norm": 30.375, |
| "grad_norm_var": 14.12265625, |
| "learning_rate": 0.0001, |
| "loss": 7.7357, |
| "loss/crossentropy": 2.1783783614635466, |
| "loss/hidden": 3.540234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2111268475651741, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.00525, |
| "grad_norm": 47.75, |
| "grad_norm_var": 167.33014322916668, |
| "learning_rate": 0.0001, |
| "loss": 7.8414, |
| "loss/crossentropy": 2.091558237373829, |
| "loss/hidden": 3.49765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2065280582755804, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.0055, |
| "grad_norm": 30.625, |
| "grad_norm_var": 185.38899739583334, |
| "learning_rate": 0.0001, |
| "loss": 7.7181, |
| "loss/crossentropy": 2.1866263896226883, |
| "loss/hidden": 3.434375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20596572011709213, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.00575, |
| "grad_norm": 30.125, |
| "grad_norm_var": 45.847330729166664, |
| "learning_rate": 0.0001, |
| "loss": 7.6276, |
| "loss/crossentropy": 2.1170753836631775, |
| "loss/hidden": 3.419140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20233637914061547, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.006, |
| "grad_norm": 33.75, |
| "grad_norm_var": 17.477083333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.7487, |
| "loss/crossentropy": 2.1388430804014207, |
| "loss/hidden": 3.55703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20581382531672715, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.00625, |
| "grad_norm": 31.75, |
| "grad_norm_var": 1.54765625, |
| "learning_rate": 0.0001, |
| "loss": 7.6568, |
| "loss/crossentropy": 2.2856020241975785, |
| "loss/hidden": 3.4375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2351478708907962, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.0065, |
| "grad_norm": 28.375, |
| "grad_norm_var": 28.54375, |
| "learning_rate": 0.0001, |
| "loss": 7.6993, |
| "loss/crossentropy": 2.0653378486633303, |
| "loss/hidden": 3.493359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19755732025951148, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.00675, |
| "grad_norm": 33.25, |
| "grad_norm_var": 28.384375, |
| "learning_rate": 0.0001, |
| "loss": 7.7075, |
| "loss/crossentropy": 2.1598333328962327, |
| "loss/hidden": 3.44765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19795978404581546, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.007, |
| "grad_norm": 32.5, |
| "grad_norm_var": 20.862955729166668, |
| "learning_rate": 0.0001, |
| "loss": 7.6852, |
| "loss/crossentropy": 2.138056221604347, |
| "loss/hidden": 3.43671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1999417580664158, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.00725, |
| "grad_norm": 42.0, |
| "grad_norm_var": 20.856705729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.8621, |
| "loss/crossentropy": 2.1779348880052565, |
| "loss/hidden": 3.42421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1947902340441942, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.0075, |
| "grad_norm": 31.75, |
| "grad_norm_var": 8.949739583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6542, |
| "loss/crossentropy": 2.174051034450531, |
| "loss/hidden": 3.49375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2129627451300621, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.00775, |
| "grad_norm": 33.5, |
| "grad_norm_var": 4.620247395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6721, |
| "loss/crossentropy": 2.0598735958337784, |
| "loss/hidden": 3.54921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20318429488688708, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.008, |
| "grad_norm": 35.5, |
| "grad_norm_var": 2.059375, |
| "learning_rate": 0.0001, |
| "loss": 7.6655, |
| "loss/crossentropy": 2.1254130959510804, |
| "loss/hidden": 3.446875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19875272288918494, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.00825, |
| "grad_norm": 35.0, |
| "grad_norm_var": 1.9639973958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.7461, |
| "loss/crossentropy": 2.1635933369398117, |
| "loss/hidden": 3.41640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2012931451201439, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.0085, |
| "grad_norm": 33.75, |
| "grad_norm_var": 2.255989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6974, |
| "loss/crossentropy": 2.214476653933525, |
| "loss/hidden": 3.39453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1954287003725767, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.00875, |
| "grad_norm": 30.625, |
| "grad_norm_var": 2.4942057291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6918, |
| "loss/crossentropy": 2.216859245300293, |
| "loss/hidden": 3.393359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20083636604249477, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.009, |
| "grad_norm": 31.625, |
| "grad_norm_var": 1.7905598958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6896, |
| "loss/crossentropy": 2.2161539107561112, |
| "loss/hidden": 3.405078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19574192687869071, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.00925, |
| "grad_norm": 29.75, |
| "grad_norm_var": 7.141080729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.8109, |
| "loss/crossentropy": 2.153403599560261, |
| "loss/hidden": 3.559765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21339697316288947, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.0095, |
| "grad_norm": 37.5, |
| "grad_norm_var": 10.9244140625, |
| "learning_rate": 0.0001, |
| "loss": 7.7615, |
| "loss/crossentropy": 2.253763607144356, |
| "loss/hidden": 3.494140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2074073076248169, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.00975, |
| "grad_norm": 33.75, |
| "grad_norm_var": 13.8400390625, |
| "learning_rate": 0.0001, |
| "loss": 7.7209, |
| "loss/crossentropy": 2.1363648414611816, |
| "loss/hidden": 3.478125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20775138661265374, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 31.5, |
| "grad_norm_var": 14.397330729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6574, |
| "loss/crossentropy": 2.1789979085326197, |
| "loss/hidden": 3.5109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20811444334685802, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.01025, |
| "grad_norm": 33.0, |
| "grad_norm_var": 9.6837890625, |
| "learning_rate": 0.0001, |
| "loss": 7.7272, |
| "loss/crossentropy": 2.232848098874092, |
| "loss/hidden": 3.5328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21815686002373696, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.0105, |
| "grad_norm": 30.0, |
| "grad_norm_var": 73.00201822916667, |
| "learning_rate": 0.0001, |
| "loss": 7.7767, |
| "loss/crossentropy": 2.064501041173935, |
| "loss/hidden": 3.616796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21888567861169578, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.01075, |
| "grad_norm": 29.625, |
| "grad_norm_var": 73.21015625, |
| "learning_rate": 0.0001, |
| "loss": 7.7084, |
| "loss/crossentropy": 2.1248373448848725, |
| "loss/hidden": 3.48828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20307110175490378, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.011, |
| "grad_norm": 31.875, |
| "grad_norm_var": 8.3259765625, |
| "learning_rate": 0.0001, |
| "loss": 7.6488, |
| "loss/crossentropy": 2.1684874832630157, |
| "loss/hidden": 3.371875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18734447471797466, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.01125, |
| "grad_norm": 40.75, |
| "grad_norm_var": 27.13125, |
| "learning_rate": 0.0001, |
| "loss": 7.831, |
| "loss/crossentropy": 2.1310232520103454, |
| "loss/hidden": 3.631640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20724854618310928, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.0115, |
| "grad_norm": 36.0, |
| "grad_norm_var": 27.680208333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.7446, |
| "loss/crossentropy": 2.134530597925186, |
| "loss/hidden": 3.43984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21913636103272438, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.01175, |
| "grad_norm": 33.0, |
| "grad_norm_var": 7.48125, |
| "learning_rate": 0.0001, |
| "loss": 7.6288, |
| "loss/crossentropy": 2.2641385555267335, |
| "loss/hidden": 3.348828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1849798161536455, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.012, |
| "grad_norm": 31.75, |
| "grad_norm_var": 11.4837890625, |
| "learning_rate": 0.0001, |
| "loss": 7.6493, |
| "loss/crossentropy": 2.2282994374632836, |
| "loss/hidden": 3.443359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2014446135610342, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.01225, |
| "grad_norm": 37.25, |
| "grad_norm_var": 6.677083333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.7612, |
| "loss/crossentropy": 2.1222758114337923, |
| "loss/hidden": 3.5625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2013396628201008, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.0125, |
| "grad_norm": 32.5, |
| "grad_norm_var": 4.7744140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6421, |
| "loss/crossentropy": 2.069608175754547, |
| "loss/hidden": 3.45078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1966784244403243, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.01275, |
| "grad_norm": 41.5, |
| "grad_norm_var": 10.3697265625, |
| "learning_rate": 0.0001, |
| "loss": 7.6818, |
| "loss/crossentropy": 2.1589883297681807, |
| "loss/hidden": 3.355078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18451723456382751, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.013, |
| "grad_norm": 31.125, |
| "grad_norm_var": 6.667122395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.7136, |
| "loss/crossentropy": 2.149793979898095, |
| "loss/hidden": 3.598828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2093389181420207, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.01325, |
| "grad_norm": 35.25, |
| "grad_norm_var": 20.768489583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.7677, |
| "loss/crossentropy": 2.195904017984867, |
| "loss/hidden": 3.425390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19832278694957495, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.0135, |
| "grad_norm": 35.5, |
| "grad_norm_var": 17.8619140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6679, |
| "loss/crossentropy": 2.1850160747766494, |
| "loss/hidden": 3.403125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20512696355581284, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.01375, |
| "grad_norm": 35.25, |
| "grad_norm_var": 25.736393229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.6426, |
| "loss/crossentropy": 2.1822438329458236, |
| "loss/hidden": 3.441796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20032773297280074, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.014, |
| "grad_norm": 35.25, |
| "grad_norm_var": 30.77890625, |
| "learning_rate": 0.0001, |
| "loss": 7.7347, |
| "loss/crossentropy": 2.1990185409784315, |
| "loss/hidden": 3.410546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19729668814688922, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.01425, |
| "grad_norm": 41.25, |
| "grad_norm_var": 46.00149739583333, |
| "learning_rate": 0.0001, |
| "loss": 7.6662, |
| "loss/crossentropy": 2.0567948162555694, |
| "loss/hidden": 3.487109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18916799686849117, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.0145, |
| "grad_norm": 31.875, |
| "grad_norm_var": 18.383072916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.662, |
| "loss/crossentropy": 2.1589747786521913, |
| "loss/hidden": 3.6359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21064655482769012, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.01475, |
| "grad_norm": 34.0, |
| "grad_norm_var": 7.5025390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6739, |
| "loss/crossentropy": 2.053135275095701, |
| "loss/hidden": 3.4828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20297051095403731, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.015, |
| "grad_norm": 42.0, |
| "grad_norm_var": 58.718684895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.6386, |
| "loss/crossentropy": 2.0670476451516153, |
| "loss/hidden": 3.590234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.208550613373518, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.01525, |
| "grad_norm": 29.75, |
| "grad_norm_var": 57.89993489583333, |
| "learning_rate": 0.0001, |
| "loss": 7.6461, |
| "loss/crossentropy": 2.1219205021858216, |
| "loss/hidden": 3.4953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20205040834844112, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.0155, |
| "grad_norm": 29.25, |
| "grad_norm_var": 10.6228515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5641, |
| "loss/crossentropy": 2.114127852022648, |
| "loss/hidden": 3.4171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19217339344322681, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.01575, |
| "grad_norm": 30.375, |
| "grad_norm_var": 14.542643229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.6319, |
| "loss/crossentropy": 2.2160026699304582, |
| "loss/hidden": 3.392578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19281109217554332, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.016, |
| "grad_norm": 34.0, |
| "grad_norm_var": 8.1869140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6546, |
| "loss/crossentropy": 2.1914512276649476, |
| "loss/hidden": 3.47109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2007790008559823, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.01625, |
| "grad_norm": 30.375, |
| "grad_norm_var": 5.4587890625, |
| "learning_rate": 0.0001, |
| "loss": 7.573, |
| "loss/crossentropy": 2.0619212985038757, |
| "loss/hidden": 3.440625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19594881720840931, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.0165, |
| "grad_norm": 34.75, |
| "grad_norm_var": 6.121809895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.7448, |
| "loss/crossentropy": 2.1046764492988586, |
| "loss/hidden": 3.5921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20083924774080514, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.01675, |
| "grad_norm": 34.5, |
| "grad_norm_var": 5.715559895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.8327, |
| "loss/crossentropy": 2.2835423797369003, |
| "loss/hidden": 3.604296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.23483402598649264, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.017, |
| "grad_norm": 31.875, |
| "grad_norm_var": 10.7650390625, |
| "learning_rate": 0.0001, |
| "loss": 7.8138, |
| "loss/crossentropy": 2.0907129019498827, |
| "loss/hidden": 3.518359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1924523524940014, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.01725, |
| "grad_norm": 33.0, |
| "grad_norm_var": 1.42265625, |
| "learning_rate": 0.0001, |
| "loss": 7.6162, |
| "loss/crossentropy": 2.127697338163853, |
| "loss/hidden": 3.50859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2057236723601818, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.0175, |
| "grad_norm": 31.0, |
| "grad_norm_var": 8.9822265625, |
| "learning_rate": 0.0001, |
| "loss": 7.727, |
| "loss/crossentropy": 2.088846719264984, |
| "loss/hidden": 3.487890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21837750263512135, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.01775, |
| "grad_norm": 31.125, |
| "grad_norm_var": 1.7385416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.504, |
| "loss/crossentropy": 2.1813240855932237, |
| "loss/hidden": 3.376171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18769481666386129, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.018, |
| "grad_norm": 34.0, |
| "grad_norm_var": 2.569205729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6713, |
| "loss/crossentropy": 2.127785587310791, |
| "loss/hidden": 3.45546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20503429286181926, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.01825, |
| "grad_norm": 29.875, |
| "grad_norm_var": 12.484375, |
| "learning_rate": 0.0001, |
| "loss": 7.6481, |
| "loss/crossentropy": 2.171098938584328, |
| "loss/hidden": 3.474609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20801318623125553, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.0185, |
| "grad_norm": 37.25, |
| "grad_norm_var": 14.9978515625, |
| "learning_rate": 0.0001, |
| "loss": 7.6067, |
| "loss/crossentropy": 2.1487890854477882, |
| "loss/hidden": 3.427734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18985433727502823, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.01875, |
| "grad_norm": 32.0, |
| "grad_norm_var": 7.299739583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4877, |
| "loss/crossentropy": 2.2428383469581603, |
| "loss/hidden": 3.24375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18733534589409828, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.019, |
| "grad_norm": 30.0, |
| "grad_norm_var": 3.379622395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6509, |
| "loss/crossentropy": 2.1872796684503557, |
| "loss/hidden": 3.4203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21135813258588315, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.01925, |
| "grad_norm": 30.5, |
| "grad_norm_var": 46.3072265625, |
| "learning_rate": 0.0001, |
| "loss": 7.7384, |
| "loss/crossentropy": 2.2427982538938522, |
| "loss/hidden": 3.369921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1998496226966381, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.0195, |
| "grad_norm": 41.5, |
| "grad_norm_var": 45.49264322916667, |
| "learning_rate": 0.0001, |
| "loss": 7.6688, |
| "loss/crossentropy": 2.207463192939758, |
| "loss/hidden": 3.321875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18665656447410583, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.01975, |
| "grad_norm": 31.625, |
| "grad_norm_var": 53.53951822916667, |
| "learning_rate": 0.0001, |
| "loss": 7.6327, |
| "loss/crossentropy": 2.075051838159561, |
| "loss/hidden": 3.37265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19726874344050885, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 35.25, |
| "grad_norm_var": 15.802083333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6833, |
| "loss/crossentropy": 2.0811705768108366, |
| "loss/hidden": 3.40078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19689124524593354, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.02025, |
| "grad_norm": 36.25, |
| "grad_norm_var": 2.7916015625, |
| "learning_rate": 0.0001, |
| "loss": 7.6998, |
| "loss/crossentropy": 2.139931133389473, |
| "loss/hidden": 3.56640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20113225914537908, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.0205, |
| "grad_norm": 33.25, |
| "grad_norm_var": 3.21640625, |
| "learning_rate": 0.0001, |
| "loss": 7.6125, |
| "loss/crossentropy": 2.3070268869400024, |
| "loss/hidden": 3.4, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19928287118673324, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.02075, |
| "grad_norm": 31.375, |
| "grad_norm_var": 4.70390625, |
| "learning_rate": 0.0001, |
| "loss": 7.7169, |
| "loss/crossentropy": 2.1359834372997284, |
| "loss/hidden": 3.6421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22523897737264634, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.021, |
| "grad_norm": 33.75, |
| "grad_norm_var": 7.06015625, |
| "learning_rate": 0.0001, |
| "loss": 7.6629, |
| "loss/crossentropy": 2.1498879536986353, |
| "loss/hidden": 3.599609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21073536314070224, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.02125, |
| "grad_norm": 31.125, |
| "grad_norm_var": 11.855143229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.7246, |
| "loss/crossentropy": 2.154731386899948, |
| "loss/hidden": 3.379296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18697260301560165, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.0215, |
| "grad_norm": 28.0, |
| "grad_norm_var": 3.8988932291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4616, |
| "loss/crossentropy": 2.209418597817421, |
| "loss/hidden": 3.441015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19536950960755348, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.02175, |
| "grad_norm": 36.25, |
| "grad_norm_var": 28.367708333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6221, |
| "loss/crossentropy": 2.106307029724121, |
| "loss/hidden": 3.477734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20511649739928545, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.022, |
| "grad_norm": 30.875, |
| "grad_norm_var": 25.5978515625, |
| "learning_rate": 0.0001, |
| "loss": 7.57, |
| "loss/crossentropy": 2.170385852456093, |
| "loss/hidden": 3.419140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18977888114750385, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.02225, |
| "grad_norm": 31.0, |
| "grad_norm_var": 4.178580729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.659, |
| "loss/crossentropy": 2.022993338108063, |
| "loss/hidden": 3.580859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2017082829028368, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.0225, |
| "grad_norm": 30.25, |
| "grad_norm_var": 4.118684895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.6471, |
| "loss/crossentropy": 2.1982390731573105, |
| "loss/hidden": 3.4078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.186830697581172, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.02275, |
| "grad_norm": 36.0, |
| "grad_norm_var": 9.886393229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5929, |
| "loss/crossentropy": 2.1351534157991408, |
| "loss/hidden": 3.44296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19507032372057437, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.023, |
| "grad_norm": 30.25, |
| "grad_norm_var": 69.10182291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.7006, |
| "loss/crossentropy": 2.1805424720048903, |
| "loss/hidden": 3.60546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2387762701138854, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.02325, |
| "grad_norm": 29.625, |
| "grad_norm_var": 148.09765625, |
| "learning_rate": 0.0001, |
| "loss": 7.603, |
| "loss/crossentropy": 2.1993222564458845, |
| "loss/hidden": 3.5421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2252051206305623, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.0235, |
| "grad_norm": 33.0, |
| "grad_norm_var": 149.05670572916668, |
| "learning_rate": 0.0001, |
| "loss": 7.6101, |
| "loss/crossentropy": 2.132229286432266, |
| "loss/hidden": 3.434765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19997731409966946, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.02375, |
| "grad_norm": 33.25, |
| "grad_norm_var": 2.372330729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6309, |
| "loss/crossentropy": 2.057620918750763, |
| "loss/hidden": 3.525, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19510896243155001, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.024, |
| "grad_norm": 30.125, |
| "grad_norm_var": 2.9291666666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5599, |
| "loss/crossentropy": 2.1666407614946364, |
| "loss/hidden": 3.38828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19047823324799537, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.02425, |
| "grad_norm": 34.0, |
| "grad_norm_var": 750.4874348958333, |
| "learning_rate": 0.0001, |
| "loss": 7.667, |
| "loss/crossentropy": 2.199223425984383, |
| "loss/hidden": 3.561328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22954254262149335, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.0245, |
| "grad_norm": 31.0, |
| "grad_norm_var": 736.4518229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.6648, |
| "loss/crossentropy": 2.052691954374313, |
| "loss/hidden": 3.612109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20743414014577866, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.02475, |
| "grad_norm": 29.625, |
| "grad_norm_var": 13.204166666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.587, |
| "loss/crossentropy": 2.2363356560468675, |
| "loss/hidden": 3.383984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19663754627108573, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.025, |
| "grad_norm": 29.5, |
| "grad_norm_var": 8.623958333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5999, |
| "loss/crossentropy": 2.096450260281563, |
| "loss/hidden": 3.437890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1896633107215166, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.02525, |
| "grad_norm": 30.0, |
| "grad_norm_var": 26.456184895833335, |
| "learning_rate": 0.0001, |
| "loss": 7.6469, |
| "loss/crossentropy": 2.219489449262619, |
| "loss/hidden": 3.461328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20298976600170135, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.0255, |
| "grad_norm": 32.75, |
| "grad_norm_var": 22.0525390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6634, |
| "loss/crossentropy": 2.157027468085289, |
| "loss/hidden": 3.641015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21785753238946198, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.02575, |
| "grad_norm": 33.0, |
| "grad_norm_var": 31.347330729166668, |
| "learning_rate": 0.0001, |
| "loss": 7.6449, |
| "loss/crossentropy": 2.1728423804044725, |
| "loss/hidden": 3.41015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19077841471880674, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.026, |
| "grad_norm": 74.5, |
| "grad_norm_var": 122.20045572916666, |
| "learning_rate": 0.0001, |
| "loss": 7.6132, |
| "loss/crossentropy": 2.1822386175394057, |
| "loss/hidden": 3.39453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1930880568921566, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.02625, |
| "grad_norm": 67.5, |
| "grad_norm_var": 178.43125, |
| "learning_rate": 0.0001, |
| "loss": 7.6429, |
| "loss/crossentropy": 2.2600297421216964, |
| "loss/hidden": 3.458203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20767511576414108, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.0265, |
| "grad_norm": 31.875, |
| "grad_norm_var": 92.06087239583333, |
| "learning_rate": 0.0001, |
| "loss": 7.5718, |
| "loss/crossentropy": 2.118990848958492, |
| "loss/hidden": 3.37890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19327420592308045, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.02675, |
| "grad_norm": 30.875, |
| "grad_norm_var": 35.283268229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.603, |
| "loss/crossentropy": 2.2320737928152083, |
| "loss/hidden": 3.335546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18573360554873944, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.027, |
| "grad_norm": 35.25, |
| "grad_norm_var": 3795.8796223958334, |
| "learning_rate": 0.0001, |
| "loss": 7.6632, |
| "loss/crossentropy": 2.1329027831554415, |
| "loss/hidden": 3.500390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.25382886435836555, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.02725, |
| "grad_norm": 41.0, |
| "grad_norm_var": 3810.5869140625, |
| "learning_rate": 0.0001, |
| "loss": 7.591, |
| "loss/crossentropy": 2.147709222137928, |
| "loss/hidden": 3.409375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19441522471606731, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.0275, |
| "grad_norm": 34.0, |
| "grad_norm_var": 10.347916666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.459, |
| "loss/crossentropy": 2.134738603234291, |
| "loss/hidden": 3.45, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18917258959263564, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.02775, |
| "grad_norm": 30.5, |
| "grad_norm_var": 5.362239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4625, |
| "loss/crossentropy": 2.072269695997238, |
| "loss/hidden": 3.453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19563193432986736, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.028, |
| "grad_norm": 29.375, |
| "grad_norm_var": 14.4478515625, |
| "learning_rate": 0.0001, |
| "loss": 7.514, |
| "loss/crossentropy": 2.131034165620804, |
| "loss/hidden": 3.44453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18194433208554983, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.02825, |
| "grad_norm": 33.0, |
| "grad_norm_var": 23.925, |
| "learning_rate": 0.0001, |
| "loss": 7.5884, |
| "loss/crossentropy": 2.023801653087139, |
| "loss/hidden": 3.637109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20569879673421382, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.0285, |
| "grad_norm": 32.0, |
| "grad_norm_var": 7.2744140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6524, |
| "loss/crossentropy": 2.1517456393688916, |
| "loss/hidden": 3.537890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2079196309670806, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.02875, |
| "grad_norm": 35.75, |
| "grad_norm_var": 7.99140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6074, |
| "loss/crossentropy": 2.004653300344944, |
| "loss/hidden": 3.583984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19787274841219188, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.029, |
| "grad_norm": 32.75, |
| "grad_norm_var": 4.120572916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6348, |
| "loss/crossentropy": 2.1528601229190825, |
| "loss/hidden": 3.361328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1927174234762788, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.02925, |
| "grad_norm": 36.25, |
| "grad_norm_var": 7.069205729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6205, |
| "loss/crossentropy": 2.05783154964447, |
| "loss/hidden": 3.614453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22374887801706791, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.0295, |
| "grad_norm": 31.0, |
| "grad_norm_var": 41.484309895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.7356, |
| "loss/crossentropy": 2.126041141152382, |
| "loss/hidden": 3.5234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2197611417621374, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.02975, |
| "grad_norm": 38.5, |
| "grad_norm_var": 43.91223958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6157, |
| "loss/crossentropy": 2.2476495057344437, |
| "loss/hidden": 3.462109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19447711408138274, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 47.5, |
| "grad_norm_var": 95.72057291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.7237, |
| "loss/crossentropy": 2.092088536918163, |
| "loss/hidden": 3.433203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19047515615820884, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.03025, |
| "grad_norm": 31.5, |
| "grad_norm_var": 99.6822265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5678, |
| "loss/crossentropy": 2.1249007523059844, |
| "loss/hidden": 3.505078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19545839354395866, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.0305, |
| "grad_norm": 30.625, |
| "grad_norm_var": 6.409309895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5862, |
| "loss/crossentropy": 2.2506365835666657, |
| "loss/hidden": 3.39921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19677093252539635, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.03075, |
| "grad_norm": 33.25, |
| "grad_norm_var": 6.676822916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5504, |
| "loss/crossentropy": 2.133887434005737, |
| "loss/hidden": 3.459375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1896925836801529, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.031, |
| "grad_norm": 38.0, |
| "grad_norm_var": 5.2337890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5956, |
| "loss/crossentropy": 2.0669597774744033, |
| "loss/hidden": 3.475390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19292720556259155, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.03125, |
| "grad_norm": 30.625, |
| "grad_norm_var": 3.97265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5722, |
| "loss/crossentropy": 2.261713761091232, |
| "loss/hidden": 3.346875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19570228308439255, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.0315, |
| "grad_norm": 29.25, |
| "grad_norm_var": 4.916080729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5566, |
| "loss/crossentropy": 2.0476513862609864, |
| "loss/hidden": 3.668359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2093046260997653, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.03175, |
| "grad_norm": 34.75, |
| "grad_norm_var": 10.875455729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6172, |
| "loss/crossentropy": 2.1128817319869997, |
| "loss/hidden": 3.5703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21670667603611946, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.032, |
| "grad_norm": 33.5, |
| "grad_norm_var": 3.824934895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6787, |
| "loss/crossentropy": 2.2115501552820205, |
| "loss/hidden": 3.417578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1827129926532507, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.03225, |
| "grad_norm": 30.375, |
| "grad_norm_var": 13.828125, |
| "learning_rate": 0.0001, |
| "loss": 7.6339, |
| "loss/crossentropy": 2.176504462957382, |
| "loss/hidden": 3.5, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2160520726814866, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.0325, |
| "grad_norm": 32.25, |
| "grad_norm_var": 5.916080729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6438, |
| "loss/crossentropy": 2.173138880729675, |
| "loss/hidden": 3.529296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2056989949196577, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.03275, |
| "grad_norm": 32.25, |
| "grad_norm_var": 5.78515625, |
| "learning_rate": 0.0001, |
| "loss": 7.7146, |
| "loss/crossentropy": 2.247766065597534, |
| "loss/hidden": 3.488671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20310762114822864, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.033, |
| "grad_norm": 32.75, |
| "grad_norm_var": 4.51640625, |
| "learning_rate": 0.0001, |
| "loss": 7.6452, |
| "loss/crossentropy": 2.0862443327903746, |
| "loss/hidden": 3.406640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18912406917661428, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.03325, |
| "grad_norm": 34.5, |
| "grad_norm_var": 7.220833333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.7715, |
| "loss/crossentropy": 2.093398702144623, |
| "loss/hidden": 3.570703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2104920681566, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.0335, |
| "grad_norm": 38.0, |
| "grad_norm_var": 9.108268229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.7368, |
| "loss/crossentropy": 2.17246213555336, |
| "loss/hidden": 3.576953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21665989980101585, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.03375, |
| "grad_norm": 33.5, |
| "grad_norm_var": 4.794205729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5892, |
| "loss/crossentropy": 2.1238946616649628, |
| "loss/hidden": 3.48828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2172164160758257, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.034, |
| "grad_norm": 32.5, |
| "grad_norm_var": 101.1431640625, |
| "learning_rate": 0.0001, |
| "loss": 7.6341, |
| "loss/crossentropy": 2.194270025193691, |
| "loss/hidden": 3.4859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19632596522569656, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.03425, |
| "grad_norm": 33.25, |
| "grad_norm_var": 3.9119140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5496, |
| "loss/crossentropy": 2.1282873928546904, |
| "loss/hidden": 3.343359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17983752395957708, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.0345, |
| "grad_norm": 34.5, |
| "grad_norm_var": 4.324934895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.6348, |
| "loss/crossentropy": 2.140147662162781, |
| "loss/hidden": 3.471484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20302014388144016, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.03475, |
| "grad_norm": 30.75, |
| "grad_norm_var": 2.818489583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.7271, |
| "loss/crossentropy": 2.128489089012146, |
| "loss/hidden": 3.4359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19602114744484425, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.035, |
| "grad_norm": 31.25, |
| "grad_norm_var": 2.1458333333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.6417, |
| "loss/crossentropy": 2.1306474581360817, |
| "loss/hidden": 3.582421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19735200479626655, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.03525, |
| "grad_norm": 31.25, |
| "grad_norm_var": 9.2009765625, |
| "learning_rate": 0.0001, |
| "loss": 7.7002, |
| "loss/crossentropy": 2.173697289824486, |
| "loss/hidden": 3.480859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20366120263934134, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.0355, |
| "grad_norm": 31.75, |
| "grad_norm_var": 9.913997395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6577, |
| "loss/crossentropy": 2.26003720164299, |
| "loss/hidden": 3.385546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20924863480031491, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.03575, |
| "grad_norm": 30.25, |
| "grad_norm_var": 26.66015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5718, |
| "loss/crossentropy": 2.2352112770080566, |
| "loss/hidden": 3.419140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19357634484767913, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.036, |
| "grad_norm": 34.0, |
| "grad_norm_var": 30.602083333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5861, |
| "loss/crossentropy": 2.0770506739616392, |
| "loss/hidden": 3.5453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21514309681951999, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.03625, |
| "grad_norm": 37.5, |
| "grad_norm_var": 12.3875, |
| "learning_rate": 0.0001, |
| "loss": 7.4986, |
| "loss/crossentropy": 2.0542123883962633, |
| "loss/hidden": 3.510546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19684152901172638, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.0365, |
| "grad_norm": 32.75, |
| "grad_norm_var": 8.548372395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6006, |
| "loss/crossentropy": 2.175110411643982, |
| "loss/hidden": 3.37578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19031002502888442, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.03675, |
| "grad_norm": 34.25, |
| "grad_norm_var": 2.8429676028135214e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.7702, |
| "loss/crossentropy": 2.1691948026418686, |
| "loss/hidden": 3.5859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22843880020081997, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.037, |
| "grad_norm": 36.25, |
| "grad_norm_var": 2.842967603101565e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.6046, |
| "loss/crossentropy": 2.0826233722269536, |
| "loss/hidden": 3.520703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1938928204588592, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.03725, |
| "grad_norm": 32.75, |
| "grad_norm_var": 8.939518229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6265, |
| "loss/crossentropy": 2.2077848985791206, |
| "loss/hidden": 3.435546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1943045362830162, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.0375, |
| "grad_norm": 34.25, |
| "grad_norm_var": 7.7125, |
| "learning_rate": 0.0001, |
| "loss": 7.6217, |
| "loss/crossentropy": 2.1079602181911468, |
| "loss/hidden": 3.395703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19446163363754748, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.03775, |
| "grad_norm": 34.25, |
| "grad_norm_var": 5.7931640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5893, |
| "loss/crossentropy": 2.078600898385048, |
| "loss/hidden": 3.5734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21464722994714974, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.038, |
| "grad_norm": 35.75, |
| "grad_norm_var": 6.186458333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6365, |
| "loss/crossentropy": 2.1014960765838624, |
| "loss/hidden": 3.53125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19028044641017913, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.03825, |
| "grad_norm": 36.0, |
| "grad_norm_var": 4.455989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5414, |
| "loss/crossentropy": 2.1112293377518654, |
| "loss/hidden": 3.350390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18414278626441954, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.0385, |
| "grad_norm": 32.25, |
| "grad_norm_var": 3.7249348958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5681, |
| "loss/crossentropy": 2.166412356495857, |
| "loss/hidden": 3.44453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19420933350920677, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.03875, |
| "grad_norm": 31.625, |
| "grad_norm_var": 5.330989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6663, |
| "loss/crossentropy": 2.0857026129961014, |
| "loss/hidden": 3.574609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2175652377307415, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.039, |
| "grad_norm": 30.75, |
| "grad_norm_var": 7.401822916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5661, |
| "loss/crossentropy": 2.1806214213371278, |
| "loss/hidden": 3.562890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21507157981395722, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.03925, |
| "grad_norm": 37.5, |
| "grad_norm_var": 9.043684895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5649, |
| "loss/crossentropy": 2.073585295677185, |
| "loss/hidden": 3.520703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19737527389079332, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.0395, |
| "grad_norm": 32.5, |
| "grad_norm_var": 4.607747395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6893, |
| "loss/crossentropy": 2.262183803319931, |
| "loss/hidden": 3.425390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22450251020491124, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.03975, |
| "grad_norm": 31.25, |
| "grad_norm_var": 1.3983723958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.7101, |
| "loss/crossentropy": 2.1200410187244416, |
| "loss/hidden": 3.464453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20633359774947166, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 32.25, |
| "grad_norm_var": 28.868684895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6821, |
| "loss/crossentropy": 2.1520946115255355, |
| "loss/hidden": 3.616796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20241751577705144, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.04025, |
| "grad_norm": 34.5, |
| "grad_norm_var": 24.308072916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6482, |
| "loss/crossentropy": 2.109182408452034, |
| "loss/hidden": 3.491015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19619097150862216, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.0405, |
| "grad_norm": 33.25, |
| "grad_norm_var": 1.83125, |
| "learning_rate": 0.0001, |
| "loss": 7.614, |
| "loss/crossentropy": 2.220561644434929, |
| "loss/hidden": 3.44921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20333079397678375, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.04075, |
| "grad_norm": 31.0, |
| "grad_norm_var": 7.183072916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.7748, |
| "loss/crossentropy": 2.2026446878910066, |
| "loss/hidden": 3.472265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20862093791365624, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.041, |
| "grad_norm": 36.25, |
| "grad_norm_var": 8.080989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6592, |
| "loss/crossentropy": 2.2313437908887863, |
| "loss/hidden": 3.32421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19155636206269264, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.04125, |
| "grad_norm": 31.5, |
| "grad_norm_var": 3.8863932291666665, |
| "learning_rate": 0.0001, |
| "loss": 7.6964, |
| "loss/crossentropy": 2.0529640942811964, |
| "loss/hidden": 3.658984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.23474433943629264, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.0415, |
| "grad_norm": 33.25, |
| "grad_norm_var": 1.0374348958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6483, |
| "loss/crossentropy": 2.175355441868305, |
| "loss/hidden": 3.41953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19280093312263488, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.04175, |
| "grad_norm": 29.625, |
| "grad_norm_var": 2.4268229166666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6277, |
| "loss/crossentropy": 2.121490114927292, |
| "loss/hidden": 3.4421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20306031554937362, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.042, |
| "grad_norm": 36.75, |
| "grad_norm_var": 186.4869140625, |
| "learning_rate": 0.0001, |
| "loss": 7.7912, |
| "loss/crossentropy": 2.123810574412346, |
| "loss/hidden": 3.596875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20700039602816106, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.04225, |
| "grad_norm": 33.0, |
| "grad_norm_var": 194.72708333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5507, |
| "loss/crossentropy": 2.198003688454628, |
| "loss/hidden": 3.384375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19452486634254457, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.0425, |
| "grad_norm": 32.25, |
| "grad_norm_var": 3.842643229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6641, |
| "loss/crossentropy": 2.1328449815511705, |
| "loss/hidden": 3.508984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20609250776469706, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.04275, |
| "grad_norm": 36.5, |
| "grad_norm_var": 23.064322916666665, |
| "learning_rate": 0.0001, |
| "loss": 7.5838, |
| "loss/crossentropy": 2.1690568923950195, |
| "loss/hidden": 3.459765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19301791079342365, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.043, |
| "grad_norm": 30.0, |
| "grad_norm_var": 5.9712890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5441, |
| "loss/crossentropy": 2.1149508744478225, |
| "loss/hidden": 3.327734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19253603778779507, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.04325, |
| "grad_norm": 33.0, |
| "grad_norm_var": 3.82890625, |
| "learning_rate": 0.0001, |
| "loss": 7.6644, |
| "loss/crossentropy": 2.070442554354668, |
| "loss/hidden": 3.5234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19407737776637077, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.0435, |
| "grad_norm": 30.625, |
| "grad_norm_var": 1.5947265625, |
| "learning_rate": 0.0001, |
| "loss": 7.6468, |
| "loss/crossentropy": 2.2249585568904875, |
| "loss/hidden": 3.437890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20079109650105237, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.04375, |
| "grad_norm": 32.25, |
| "grad_norm_var": 4.1384765625, |
| "learning_rate": 0.0001, |
| "loss": 7.6522, |
| "loss/crossentropy": 2.1712467283010484, |
| "loss/hidden": 3.481640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21240621842443944, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.044, |
| "grad_norm": 30.75, |
| "grad_norm_var": 2.7018229166666665, |
| "learning_rate": 0.0001, |
| "loss": 7.6326, |
| "loss/crossentropy": 2.1433209091424943, |
| "loss/hidden": 3.50546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19783576354384422, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.04425, |
| "grad_norm": 32.25, |
| "grad_norm_var": 10.2634765625, |
| "learning_rate": 0.0001, |
| "loss": 7.7252, |
| "loss/crossentropy": 2.1377856612205504, |
| "loss/hidden": 3.462109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19519764352589847, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.0445, |
| "grad_norm": 32.25, |
| "grad_norm_var": 9.817122395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.6165, |
| "loss/crossentropy": 2.2387378960847855, |
| "loss/hidden": 3.465234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2087532427161932, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.04475, |
| "grad_norm": 31.0, |
| "grad_norm_var": 1.9650390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6701, |
| "loss/crossentropy": 2.280033028125763, |
| "loss/hidden": 3.31875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1907376278191805, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.045, |
| "grad_norm": 33.75, |
| "grad_norm_var": 2.4197265625, |
| "learning_rate": 0.0001, |
| "loss": 7.6553, |
| "loss/crossentropy": 2.205285739898682, |
| "loss/hidden": 3.448828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1980523556470871, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.04525, |
| "grad_norm": 31.0, |
| "grad_norm_var": 3.2462890625, |
| "learning_rate": 0.0001, |
| "loss": 7.6001, |
| "loss/crossentropy": 2.047496220469475, |
| "loss/hidden": 3.548046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19389633461833, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.0455, |
| "grad_norm": 31.125, |
| "grad_norm_var": 2.562239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.615, |
| "loss/crossentropy": 2.174453580379486, |
| "loss/hidden": 3.516796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20545508041977883, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.04575, |
| "grad_norm": 34.25, |
| "grad_norm_var": 3.4296223958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.638, |
| "loss/crossentropy": 2.0722746759653092, |
| "loss/hidden": 3.437109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19747158586978913, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.046, |
| "grad_norm": 34.75, |
| "grad_norm_var": 3.0666015625, |
| "learning_rate": 0.0001, |
| "loss": 7.7087, |
| "loss/crossentropy": 2.1196924835443496, |
| "loss/hidden": 3.622265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20298538953065873, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.04625, |
| "grad_norm": 29.75, |
| "grad_norm_var": 2.6119140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6036, |
| "loss/crossentropy": 2.1688392132520677, |
| "loss/hidden": 3.323046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17962730433791876, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.0465, |
| "grad_norm": 33.75, |
| "grad_norm_var": 1.3374348958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.7051, |
| "loss/crossentropy": 2.1148360162973403, |
| "loss/hidden": 3.422265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2195219134911895, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.04675, |
| "grad_norm": 33.5, |
| "grad_norm_var": 2.45625, |
| "learning_rate": 0.0001, |
| "loss": 7.6397, |
| "loss/crossentropy": 2.016439202427864, |
| "loss/hidden": 3.529296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2061541959643364, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.047, |
| "grad_norm": 29.75, |
| "grad_norm_var": 4.173893229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.7054, |
| "loss/crossentropy": 2.1470705419778824, |
| "loss/hidden": 3.3828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21371309272944927, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.04725, |
| "grad_norm": 32.5, |
| "grad_norm_var": 4.054622395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5959, |
| "loss/crossentropy": 2.265937978029251, |
| "loss/hidden": 3.31328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1929181769490242, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.0475, |
| "grad_norm": 30.125, |
| "grad_norm_var": 6.6197265625, |
| "learning_rate": 0.0001, |
| "loss": 7.6012, |
| "loss/crossentropy": 2.0853475779294968, |
| "loss/hidden": 3.40625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20215214397758247, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.04775, |
| "grad_norm": 34.0, |
| "grad_norm_var": 23.695572916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.7544, |
| "loss/crossentropy": 2.162308484315872, |
| "loss/hidden": 3.4796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2013952497392893, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.048, |
| "grad_norm": 30.75, |
| "grad_norm_var": 4.120768229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6092, |
| "loss/crossentropy": 2.088267083466053, |
| "loss/hidden": 3.5234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2007219024002552, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.04825, |
| "grad_norm": 35.75, |
| "grad_norm_var": 2.97890625, |
| "learning_rate": 0.0001, |
| "loss": 7.7141, |
| "loss/crossentropy": 2.0617689430713653, |
| "loss/hidden": 3.46875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1928685350343585, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.0485, |
| "grad_norm": 35.0, |
| "grad_norm_var": 6.073893229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.7068, |
| "loss/crossentropy": 2.1201131522655485, |
| "loss/hidden": 3.501953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20757155679166317, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.04875, |
| "grad_norm": 31.5, |
| "grad_norm_var": 21.345572916666665, |
| "learning_rate": 0.0001, |
| "loss": 7.6198, |
| "loss/crossentropy": 2.235423868894577, |
| "loss/hidden": 3.36875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1959926813840866, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.049, |
| "grad_norm": 30.625, |
| "grad_norm_var": 28.99765625, |
| "learning_rate": 0.0001, |
| "loss": 7.6389, |
| "loss/crossentropy": 2.205905148386955, |
| "loss/hidden": 3.446484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2116202499717474, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.04925, |
| "grad_norm": 33.25, |
| "grad_norm_var": 9.9369140625, |
| "learning_rate": 0.0001, |
| "loss": 7.7121, |
| "loss/crossentropy": 2.163422483205795, |
| "loss/hidden": 3.40390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1981559544801712, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.0495, |
| "grad_norm": 29.0, |
| "grad_norm_var": 9.118489583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6772, |
| "loss/crossentropy": 2.1636913806200027, |
| "loss/hidden": 3.442578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1968079771846533, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.04975, |
| "grad_norm": 35.25, |
| "grad_norm_var": 4.377018229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5948, |
| "loss/crossentropy": 2.174500140547752, |
| "loss/hidden": 3.42578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1923616673797369, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 30.75, |
| "grad_norm_var": 6.15625, |
| "learning_rate": 0.0001, |
| "loss": 7.5639, |
| "loss/crossentropy": 2.1197956264019013, |
| "loss/hidden": 3.50703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20423812307417394, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.05025, |
| "grad_norm": 33.5, |
| "grad_norm_var": 4.725455729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6238, |
| "loss/crossentropy": 2.1442053347826002, |
| "loss/hidden": 3.359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20131820477545262, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.0505, |
| "grad_norm": 33.0, |
| "grad_norm_var": 3.2212890625, |
| "learning_rate": 0.0001, |
| "loss": 7.6024, |
| "loss/crossentropy": 2.1970301985740663, |
| "loss/hidden": 3.43203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19248049296438693, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.05075, |
| "grad_norm": 31.5, |
| "grad_norm_var": 2.2853515625, |
| "learning_rate": 0.0001, |
| "loss": 7.6279, |
| "loss/crossentropy": 2.0732986360788344, |
| "loss/hidden": 3.476953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19802382439374924, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.051, |
| "grad_norm": 35.25, |
| "grad_norm_var": 3.2129557291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.643, |
| "loss/crossentropy": 2.196815450489521, |
| "loss/hidden": 3.48203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20899684820324183, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.05125, |
| "grad_norm": 34.5, |
| "grad_norm_var": 4.093489583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6321, |
| "loss/crossentropy": 2.083095496892929, |
| "loss/hidden": 3.40390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18292178437113762, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.0515, |
| "grad_norm": 31.875, |
| "grad_norm_var": 19.478059895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5882, |
| "loss/crossentropy": 2.2153579622507094, |
| "loss/hidden": 3.360546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19245057981461286, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.05175, |
| "grad_norm": 34.5, |
| "grad_norm_var": 16.97265625, |
| "learning_rate": 0.0001, |
| "loss": 7.6006, |
| "loss/crossentropy": 2.2516845196485518, |
| "loss/hidden": 3.385546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1975632380694151, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.052, |
| "grad_norm": 29.75, |
| "grad_norm_var": 2.71015625, |
| "learning_rate": 0.0001, |
| "loss": 7.6393, |
| "loss/crossentropy": 2.1561204314231874, |
| "loss/hidden": 3.465234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2124529665336013, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.05225, |
| "grad_norm": 35.75, |
| "grad_norm_var": 36.984375, |
| "learning_rate": 0.0001, |
| "loss": 7.7289, |
| "loss/crossentropy": 2.2129232093691824, |
| "loss/hidden": 3.43125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1984808323904872, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.0525, |
| "grad_norm": 28.875, |
| "grad_norm_var": 38.076822916666664, |
| "learning_rate": 0.0001, |
| "loss": 7.5446, |
| "loss/crossentropy": 2.281945154070854, |
| "loss/hidden": 3.387109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1915616899728775, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.05275, |
| "grad_norm": 30.5, |
| "grad_norm_var": 2.10625, |
| "learning_rate": 0.0001, |
| "loss": 7.6215, |
| "loss/crossentropy": 2.0773366719484327, |
| "loss/hidden": 3.439453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18790210355073214, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.053, |
| "grad_norm": 31.25, |
| "grad_norm_var": 1.24765625, |
| "learning_rate": 0.0001, |
| "loss": 7.4968, |
| "loss/crossentropy": 2.186223568022251, |
| "loss/hidden": 3.36875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1900737203657627, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.05325, |
| "grad_norm": 30.75, |
| "grad_norm_var": 4.638541666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6574, |
| "loss/crossentropy": 2.2741902500391005, |
| "loss/hidden": 3.38359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1898935280740261, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.0535, |
| "grad_norm": 33.75, |
| "grad_norm_var": 18.001041666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6903, |
| "loss/crossentropy": 2.1332941919565203, |
| "loss/hidden": 3.42265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1954928996041417, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.05375, |
| "grad_norm": 34.5, |
| "grad_norm_var": 17.939583333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5786, |
| "loss/crossentropy": 2.2076333969831468, |
| "loss/hidden": 3.4828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20088096596300603, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.054, |
| "grad_norm": 33.5, |
| "grad_norm_var": 8.947916666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5995, |
| "loss/crossentropy": 2.201739010214806, |
| "loss/hidden": 3.394140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20021349862217902, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.05425, |
| "grad_norm": 30.0, |
| "grad_norm_var": 185.5900390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6214, |
| "loss/crossentropy": 2.1913442850112914, |
| "loss/hidden": 3.407421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1996122680604458, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.0545, |
| "grad_norm": 30.625, |
| "grad_norm_var": 186.84166666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.7375, |
| "loss/crossentropy": 2.173484447598457, |
| "loss/hidden": 3.428515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18794310167431832, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.05475, |
| "grad_norm": 31.375, |
| "grad_norm_var": 8.699739583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6023, |
| "loss/crossentropy": 2.207549235224724, |
| "loss/hidden": 3.494140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21360519118607044, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.055, |
| "grad_norm": 32.0, |
| "grad_norm_var": 5.228059895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5821, |
| "loss/crossentropy": 2.168141430988908, |
| "loss/hidden": 3.355078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1850608481094241, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.05525, |
| "grad_norm": 29.625, |
| "grad_norm_var": 10.708268229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.6191, |
| "loss/crossentropy": 2.26080215126276, |
| "loss/hidden": 3.440625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19979589320719243, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.0555, |
| "grad_norm": 31.25, |
| "grad_norm_var": 10.148958333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.613, |
| "loss/crossentropy": 2.2105998665094377, |
| "loss/hidden": 3.430859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20264392383396626, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.05575, |
| "grad_norm": 29.5, |
| "grad_norm_var": 9.128059895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5296, |
| "loss/crossentropy": 2.076467031240463, |
| "loss/hidden": 3.59296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22621012963354586, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.056, |
| "grad_norm": 34.5, |
| "grad_norm_var": 3.99375, |
| "learning_rate": 0.0001, |
| "loss": 7.6686, |
| "loss/crossentropy": 2.0320577889680864, |
| "loss/hidden": 3.575, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19981470778584481, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.05625, |
| "grad_norm": 30.75, |
| "grad_norm_var": 14.473893229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5723, |
| "loss/crossentropy": 2.084241083264351, |
| "loss/hidden": 3.545703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2100257944315672, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.0565, |
| "grad_norm": 31.5, |
| "grad_norm_var": 44.7041015625, |
| "learning_rate": 0.0001, |
| "loss": 7.6107, |
| "loss/crossentropy": 2.2695932418107985, |
| "loss/hidden": 3.371875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19781356416642665, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.05675, |
| "grad_norm": 30.875, |
| "grad_norm_var": 9.897916666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6187, |
| "loss/crossentropy": 2.188571906089783, |
| "loss/hidden": 3.43984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2015857521444559, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.057, |
| "grad_norm": 28.375, |
| "grad_norm_var": 3.490625, |
| "learning_rate": 0.0001, |
| "loss": 7.6139, |
| "loss/crossentropy": 2.1134210243821143, |
| "loss/hidden": 3.481640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22417646870017052, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.05725, |
| "grad_norm": 32.0, |
| "grad_norm_var": 6.479622395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6591, |
| "loss/crossentropy": 2.1894455403089523, |
| "loss/hidden": 3.465625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19847002141177655, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.0575, |
| "grad_norm": 31.25, |
| "grad_norm_var": 8.809309895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.657, |
| "loss/crossentropy": 2.1524556159973143, |
| "loss/hidden": 3.319921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1857963975518942, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.05775, |
| "grad_norm": 32.25, |
| "grad_norm_var": 3.121809895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6475, |
| "loss/crossentropy": 2.2037901908159254, |
| "loss/hidden": 3.44140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22191528491675855, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.058, |
| "grad_norm": 32.5, |
| "grad_norm_var": 2.6348307291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5846, |
| "loss/crossentropy": 2.214809921383858, |
| "loss/hidden": 3.3453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19421134144067764, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.05825, |
| "grad_norm": 29.375, |
| "grad_norm_var": 2.939322916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6893, |
| "loss/crossentropy": 2.2204942047595977, |
| "loss/hidden": 3.437890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21439925488084555, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.0585, |
| "grad_norm": 28.625, |
| "grad_norm_var": 3.8744140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6038, |
| "loss/crossentropy": 2.1540059238672256, |
| "loss/hidden": 3.56953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.24175845962017775, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.05875, |
| "grad_norm": 31.875, |
| "grad_norm_var": 2.1426432291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6379, |
| "loss/crossentropy": 2.1948168754577635, |
| "loss/hidden": 3.43046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2041913490742445, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.059, |
| "grad_norm": 32.5, |
| "grad_norm_var": 1.7869140625, |
| "learning_rate": 0.0001, |
| "loss": 7.7135, |
| "loss/crossentropy": 2.1938526153564455, |
| "loss/hidden": 3.434375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1992840923368931, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.05925, |
| "grad_norm": 33.75, |
| "grad_norm_var": 1.1343098958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6931, |
| "loss/crossentropy": 2.12769907861948, |
| "loss/hidden": 3.3796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18500677905976773, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.0595, |
| "grad_norm": 33.0, |
| "grad_norm_var": 3.1884765625, |
| "learning_rate": 0.0001, |
| "loss": 7.6297, |
| "loss/crossentropy": 2.1268584340810777, |
| "loss/hidden": 3.485546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20107861533761023, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.05975, |
| "grad_norm": 31.0, |
| "grad_norm_var": 5.368489583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.555, |
| "loss/crossentropy": 2.198070913553238, |
| "loss/hidden": 3.4171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19437791910022498, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 31.0, |
| "grad_norm_var": 3.218489583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.633, |
| "loss/crossentropy": 2.1521017968654634, |
| "loss/hidden": 3.472265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19696612432599067, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.06025, |
| "grad_norm": 31.625, |
| "grad_norm_var": 1.4098307291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6634, |
| "loss/crossentropy": 2.0935733556747436, |
| "loss/hidden": 3.46171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19025789983570576, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.0605, |
| "grad_norm": 31.75, |
| "grad_norm_var": 4.812434895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6523, |
| "loss/crossentropy": 2.206766763329506, |
| "loss/hidden": 3.421484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19223052635788918, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.06075, |
| "grad_norm": 32.0, |
| "grad_norm_var": 5.545247395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6703, |
| "loss/crossentropy": 2.2091148614883425, |
| "loss/hidden": 3.471875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2191623793914914, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.061, |
| "grad_norm": 31.875, |
| "grad_norm_var": 3.06640625, |
| "learning_rate": 0.0001, |
| "loss": 7.6194, |
| "loss/crossentropy": 2.2076220482587816, |
| "loss/hidden": 3.52265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21331611163914205, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.06125, |
| "grad_norm": 33.75, |
| "grad_norm_var": 3.753125, |
| "learning_rate": 0.0001, |
| "loss": 7.6143, |
| "loss/crossentropy": 2.1473243802785875, |
| "loss/hidden": 3.415625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20034591071307659, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.0615, |
| "grad_norm": 31.25, |
| "grad_norm_var": 4.09765625, |
| "learning_rate": 0.0001, |
| "loss": 7.6166, |
| "loss/crossentropy": 2.2176205784082414, |
| "loss/hidden": 3.4015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1890367180109024, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.06175, |
| "grad_norm": 32.75, |
| "grad_norm_var": 2.56015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5864, |
| "loss/crossentropy": 2.139193335175514, |
| "loss/hidden": 3.55234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19700367711484432, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.062, |
| "grad_norm": 30.625, |
| "grad_norm_var": 3.5400390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6061, |
| "loss/crossentropy": 2.101886364817619, |
| "loss/hidden": 3.543359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20392275378108024, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.06225, |
| "grad_norm": 31.625, |
| "grad_norm_var": 2.3353515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5912, |
| "loss/crossentropy": 2.1105535492300986, |
| "loss/hidden": 3.451953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20442402064800264, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.0625, |
| "grad_norm": 32.25, |
| "grad_norm_var": 2.183333333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6553, |
| "loss/crossentropy": 2.1315447479486465, |
| "loss/hidden": 3.471484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20164060425013303, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.06275, |
| "grad_norm": 33.25, |
| "grad_norm_var": 147.15149739583333, |
| "learning_rate": 0.0001, |
| "loss": 7.6542, |
| "loss/crossentropy": 2.0641630738973618, |
| "loss/hidden": 3.468359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20385651774704455, |
| "step": 2510 |
| }, |
| { |
| "epoch": 0.063, |
| "grad_norm": 30.5, |
| "grad_norm_var": 150.99166666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5568, |
| "loss/crossentropy": 2.191535955667496, |
| "loss/hidden": 3.402734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19768227599561214, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.06325, |
| "grad_norm": 29.0, |
| "grad_norm_var": 2.13515625, |
| "learning_rate": 0.0001, |
| "loss": 7.6544, |
| "loss/crossentropy": 2.199158227443695, |
| "loss/hidden": 3.45, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2097537014633417, |
| "step": 2530 |
| }, |
| { |
| "epoch": 0.0635, |
| "grad_norm": 31.5, |
| "grad_norm_var": 2.731705729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5063, |
| "loss/crossentropy": 2.1456793427467344, |
| "loss/hidden": 3.403515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19175102189183235, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.06375, |
| "grad_norm": 32.5, |
| "grad_norm_var": 6.859830729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5828, |
| "loss/crossentropy": 2.255453732609749, |
| "loss/hidden": 3.4203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19693338237702845, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.064, |
| "grad_norm": 31.625, |
| "grad_norm_var": 5.178125, |
| "learning_rate": 0.0001, |
| "loss": 7.5702, |
| "loss/crossentropy": 2.2270909011363984, |
| "loss/hidden": 3.43359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20889390334486962, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.06425, |
| "grad_norm": 33.5, |
| "grad_norm_var": 3.6372395833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5904, |
| "loss/crossentropy": 2.190132850408554, |
| "loss/hidden": 3.3953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20386817157268525, |
| "step": 2570 |
| }, |
| { |
| "epoch": 0.0645, |
| "grad_norm": 34.25, |
| "grad_norm_var": 10.79765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5854, |
| "loss/crossentropy": 2.07715407460928, |
| "loss/hidden": 3.581640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20797281824052333, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.06475, |
| "grad_norm": 33.5, |
| "grad_norm_var": 12.35625, |
| "learning_rate": 0.0001, |
| "loss": 7.6279, |
| "loss/crossentropy": 2.1247923612594604, |
| "loss/hidden": 3.434765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21620508767664431, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.065, |
| "grad_norm": 32.75, |
| "grad_norm_var": 5.094791666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5996, |
| "loss/crossentropy": 2.087959203124046, |
| "loss/hidden": 3.521875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19923710729926825, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.06525, |
| "grad_norm": 30.0, |
| "grad_norm_var": 7.6150390625, |
| "learning_rate": 0.0001, |
| "loss": 7.682, |
| "loss/crossentropy": 2.1805250465869905, |
| "loss/hidden": 3.38984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2021130472421646, |
| "step": 2610 |
| }, |
| { |
| "epoch": 0.0655, |
| "grad_norm": 34.75, |
| "grad_norm_var": 7.06015625, |
| "learning_rate": 0.0001, |
| "loss": 7.7244, |
| "loss/crossentropy": 2.1178730964660644, |
| "loss/hidden": 3.402734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19405451826751233, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.06575, |
| "grad_norm": 29.75, |
| "grad_norm_var": 3.065559895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6483, |
| "loss/crossentropy": 2.1593512505292893, |
| "loss/hidden": 3.381640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2096536297351122, |
| "step": 2630 |
| }, |
| { |
| "epoch": 0.066, |
| "grad_norm": 33.25, |
| "grad_norm_var": 4.623372395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5982, |
| "loss/crossentropy": 2.159625916182995, |
| "loss/hidden": 3.365234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18939675595611333, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.06625, |
| "grad_norm": 53.0, |
| "grad_norm_var": 49.99524739583333, |
| "learning_rate": 0.0001, |
| "loss": 7.6918, |
| "loss/crossentropy": 2.114516945183277, |
| "loss/hidden": 3.51484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2009023107588291, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.0665, |
| "grad_norm": 30.125, |
| "grad_norm_var": 38.81399739583333, |
| "learning_rate": 0.0001, |
| "loss": 7.5543, |
| "loss/crossentropy": 2.132766366004944, |
| "loss/hidden": 3.421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1965734062716365, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.06675, |
| "grad_norm": 30.875, |
| "grad_norm_var": 1.9559895833333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6338, |
| "loss/crossentropy": 2.1092610150575637, |
| "loss/hidden": 3.431640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17978871315717698, |
| "step": 2670 |
| }, |
| { |
| "epoch": 0.067, |
| "grad_norm": 29.875, |
| "grad_norm_var": 4.47890625, |
| "learning_rate": 0.0001, |
| "loss": 7.617, |
| "loss/crossentropy": 2.2271903961896897, |
| "loss/hidden": 3.461328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2073811784386635, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.06725, |
| "grad_norm": 30.125, |
| "grad_norm_var": 3.2249348958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.641, |
| "loss/crossentropy": 2.0155764549970625, |
| "loss/hidden": 3.603515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2049756994470954, |
| "step": 2690 |
| }, |
| { |
| "epoch": 0.0675, |
| "grad_norm": 33.5, |
| "grad_norm_var": 3.0061848958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.6253, |
| "loss/crossentropy": 2.221065053343773, |
| "loss/hidden": 3.482421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2112014289945364, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.06775, |
| "grad_norm": 32.25, |
| "grad_norm_var": 18.753125, |
| "learning_rate": 0.0001, |
| "loss": 7.6427, |
| "loss/crossentropy": 2.180001160502434, |
| "loss/hidden": 3.353125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.193130424618721, |
| "step": 2710 |
| }, |
| { |
| "epoch": 0.068, |
| "grad_norm": 32.5, |
| "grad_norm_var": 20.773893229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6163, |
| "loss/crossentropy": 2.283226564526558, |
| "loss/hidden": 3.48515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19388929307460784, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.06825, |
| "grad_norm": 31.25, |
| "grad_norm_var": 1.61640625, |
| "learning_rate": 0.0001, |
| "loss": 7.587, |
| "loss/crossentropy": 2.1378406554460527, |
| "loss/hidden": 3.4921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2115953892469406, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.0685, |
| "grad_norm": 32.5, |
| "grad_norm_var": 1.8684895833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6011, |
| "loss/crossentropy": 2.0985760882496836, |
| "loss/hidden": 3.48125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19036055766046048, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.06875, |
| "grad_norm": 32.25, |
| "grad_norm_var": 2.9535807291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6583, |
| "loss/crossentropy": 2.1665745437145234, |
| "loss/hidden": 3.36640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18649150040000678, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.069, |
| "grad_norm": 34.5, |
| "grad_norm_var": 6.343489583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5495, |
| "loss/crossentropy": 2.176983141899109, |
| "loss/hidden": 3.412890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20285341441631316, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.06925, |
| "grad_norm": 32.5, |
| "grad_norm_var": 4.972916666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6597, |
| "loss/crossentropy": 2.1060123026371, |
| "loss/hidden": 3.400390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19327255934476853, |
| "step": 2770 |
| }, |
| { |
| "epoch": 0.0695, |
| "grad_norm": 31.625, |
| "grad_norm_var": 32.33639322916667, |
| "learning_rate": 0.0001, |
| "loss": 7.5862, |
| "loss/crossentropy": 2.1663936868309976, |
| "loss/hidden": 3.512109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22126073129475116, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.06975, |
| "grad_norm": 32.5, |
| "grad_norm_var": 5.694791666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5124, |
| "loss/crossentropy": 2.225750984251499, |
| "loss/hidden": 3.350390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19473073966801166, |
| "step": 2790 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 31.5, |
| "grad_norm_var": 4.237434895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6174, |
| "loss/crossentropy": 2.0647315263748167, |
| "loss/hidden": 3.4734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2136565549299121, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.07025, |
| "grad_norm": 36.0, |
| "grad_norm_var": 4.792708333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6789, |
| "loss/crossentropy": 2.1971701353788378, |
| "loss/hidden": 3.440625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21544951274991037, |
| "step": 2810 |
| }, |
| { |
| "epoch": 0.0705, |
| "grad_norm": 31.125, |
| "grad_norm_var": 11.145247395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.7043, |
| "loss/crossentropy": 2.2537077218294144, |
| "loss/hidden": 3.395703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19961411394178868, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.07075, |
| "grad_norm": 30.5, |
| "grad_norm_var": 85.65182291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6427, |
| "loss/crossentropy": 2.0513558954000475, |
| "loss/hidden": 3.615234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.23545071221888064, |
| "step": 2830 |
| }, |
| { |
| "epoch": 0.071, |
| "grad_norm": 31.375, |
| "grad_norm_var": 64.77180989583333, |
| "learning_rate": 0.0001, |
| "loss": 7.6378, |
| "loss/crossentropy": 2.186201846599579, |
| "loss/hidden": 3.55078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20769538041204214, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.07125, |
| "grad_norm": 32.25, |
| "grad_norm_var": 2.0268229166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5525, |
| "loss/crossentropy": 2.161085495352745, |
| "loss/hidden": 3.27421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18488222286105155, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.0715, |
| "grad_norm": 33.0, |
| "grad_norm_var": 10.437434895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6376, |
| "loss/crossentropy": 2.09626332372427, |
| "loss/hidden": 3.33203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.179809108376503, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.07175, |
| "grad_norm": 33.25, |
| "grad_norm_var": 9.233072916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.62, |
| "loss/crossentropy": 2.2382488936185836, |
| "loss/hidden": 3.353515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20872681811451912, |
| "step": 2870 |
| }, |
| { |
| "epoch": 0.072, |
| "grad_norm": 28.5, |
| "grad_norm_var": 8.784375, |
| "learning_rate": 0.0001, |
| "loss": 7.6516, |
| "loss/crossentropy": 2.1699771240353583, |
| "loss/hidden": 3.470703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2100867312401533, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.07225, |
| "grad_norm": 33.75, |
| "grad_norm_var": 9.269791666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5839, |
| "loss/crossentropy": 2.1368533104658125, |
| "loss/hidden": 3.426953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21750828213989734, |
| "step": 2890 |
| }, |
| { |
| "epoch": 0.0725, |
| "grad_norm": 36.0, |
| "grad_norm_var": 5.518489583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6849, |
| "loss/crossentropy": 2.12222815155983, |
| "loss/hidden": 3.580859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21114687696099282, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.07275, |
| "grad_norm": 31.125, |
| "grad_norm_var": 5.622916666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5109, |
| "loss/crossentropy": 2.171084225177765, |
| "loss/hidden": 3.403125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1979156408458948, |
| "step": 2910 |
| }, |
| { |
| "epoch": 0.073, |
| "grad_norm": 31.125, |
| "grad_norm_var": 1.5619140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6895, |
| "loss/crossentropy": 2.164732736349106, |
| "loss/hidden": 3.404296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20426477529108525, |
| "step": 2920 |
| }, |
| { |
| "epoch": 0.07325, |
| "grad_norm": 29.375, |
| "grad_norm_var": 1.7854166666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5573, |
| "loss/crossentropy": 2.1073058575391768, |
| "loss/hidden": 3.50234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2097570365294814, |
| "step": 2930 |
| }, |
| { |
| "epoch": 0.0735, |
| "grad_norm": 30.875, |
| "grad_norm_var": 2.4955729166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5697, |
| "loss/crossentropy": 2.153279659152031, |
| "loss/hidden": 3.361328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1900124330073595, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.07375, |
| "grad_norm": 49.5, |
| "grad_norm_var": 22.75390625, |
| "learning_rate": 0.0001, |
| "loss": 7.671, |
| "loss/crossentropy": 2.2612457245588304, |
| "loss/hidden": 3.414453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18990697022527456, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.074, |
| "grad_norm": 32.0, |
| "grad_norm_var": 24.510416666666668, |
| "learning_rate": 0.0001, |
| "loss": 7.5814, |
| "loss/crossentropy": 2.123460465669632, |
| "loss/hidden": 3.496484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20658994875848294, |
| "step": 2960 |
| }, |
| { |
| "epoch": 0.07425, |
| "grad_norm": 30.125, |
| "grad_norm_var": 118.29837239583334, |
| "learning_rate": 0.0001, |
| "loss": 7.5481, |
| "loss/crossentropy": 2.2275219768285752, |
| "loss/hidden": 3.33515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18647960387170315, |
| "step": 2970 |
| }, |
| { |
| "epoch": 0.0745, |
| "grad_norm": 29.375, |
| "grad_norm_var": 21.989322916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5209, |
| "loss/crossentropy": 2.1731285482645033, |
| "loss/hidden": 3.4171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1889862149953842, |
| "step": 2980 |
| }, |
| { |
| "epoch": 0.07475, |
| "grad_norm": 31.125, |
| "grad_norm_var": 4.253059895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5815, |
| "loss/crossentropy": 2.2546483501791954, |
| "loss/hidden": 3.415625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18856723569333553, |
| "step": 2990 |
| }, |
| { |
| "epoch": 0.075, |
| "grad_norm": 32.0, |
| "grad_norm_var": 5.730143229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5835, |
| "loss/crossentropy": 2.092367857694626, |
| "loss/hidden": 3.52421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20502115599811077, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.07525, |
| "grad_norm": 29.5, |
| "grad_norm_var": 15.926822916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5849, |
| "loss/crossentropy": 2.109961675107479, |
| "loss/hidden": 3.432421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20341113824397325, |
| "step": 3010 |
| }, |
| { |
| "epoch": 0.0755, |
| "grad_norm": 32.5, |
| "grad_norm_var": 3.314322916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5896, |
| "loss/crossentropy": 2.1648701071739196, |
| "loss/hidden": 3.5, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20436643473803998, |
| "step": 3020 |
| }, |
| { |
| "epoch": 0.07575, |
| "grad_norm": 32.75, |
| "grad_norm_var": 1.4754557291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5959, |
| "loss/crossentropy": 2.2054502993822096, |
| "loss/hidden": 3.440234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20024821683764457, |
| "step": 3030 |
| }, |
| { |
| "epoch": 0.076, |
| "grad_norm": 30.875, |
| "grad_norm_var": 6.4931640625, |
| "learning_rate": 0.0001, |
| "loss": 7.6957, |
| "loss/crossentropy": 2.166448511183262, |
| "loss/hidden": 3.484765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20373598877340554, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.07625, |
| "grad_norm": 36.5, |
| "grad_norm_var": 9.442122395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5957, |
| "loss/crossentropy": 2.218970799446106, |
| "loss/hidden": 3.503515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20884830448776484, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.0765, |
| "grad_norm": 32.25, |
| "grad_norm_var": 6.062955729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5458, |
| "loss/crossentropy": 2.080473840236664, |
| "loss/hidden": 3.57109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20190774220973254, |
| "step": 3060 |
| }, |
| { |
| "epoch": 0.07675, |
| "grad_norm": 29.5, |
| "grad_norm_var": 2.6093098958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5646, |
| "loss/crossentropy": 2.1775156021118165, |
| "loss/hidden": 3.355859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19825822599232196, |
| "step": 3070 |
| }, |
| { |
| "epoch": 0.077, |
| "grad_norm": 30.125, |
| "grad_norm_var": 2.690625, |
| "learning_rate": 0.0001, |
| "loss": 7.7174, |
| "loss/crossentropy": 2.246141794323921, |
| "loss/hidden": 3.425390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20639744736254215, |
| "step": 3080 |
| }, |
| { |
| "epoch": 0.07725, |
| "grad_norm": 30.875, |
| "grad_norm_var": 4.271809895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6196, |
| "loss/crossentropy": 2.060313332080841, |
| "loss/hidden": 3.481640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21316638588905334, |
| "step": 3090 |
| }, |
| { |
| "epoch": 0.0775, |
| "grad_norm": 34.0, |
| "grad_norm_var": 2.873893229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5637, |
| "loss/crossentropy": 2.153154730796814, |
| "loss/hidden": 3.44453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20764457508921624, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.07775, |
| "grad_norm": 33.25, |
| "grad_norm_var": 2.2708333333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5924, |
| "loss/crossentropy": 2.2558963537216186, |
| "loss/hidden": 3.364453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19444480016827584, |
| "step": 3110 |
| }, |
| { |
| "epoch": 0.078, |
| "grad_norm": 29.5, |
| "grad_norm_var": 2.2301432291666665, |
| "learning_rate": 0.0001, |
| "loss": 7.7202, |
| "loss/crossentropy": 2.190881980955601, |
| "loss/hidden": 3.422265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2061827789992094, |
| "step": 3120 |
| }, |
| { |
| "epoch": 0.07825, |
| "grad_norm": 31.375, |
| "grad_norm_var": 2.991080729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5479, |
| "loss/crossentropy": 2.1357465982437134, |
| "loss/hidden": 3.394140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19807269163429736, |
| "step": 3130 |
| }, |
| { |
| "epoch": 0.0785, |
| "grad_norm": 29.125, |
| "grad_norm_var": 5.457291666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.641, |
| "loss/crossentropy": 2.166859371960163, |
| "loss/hidden": 3.398046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1900594387203455, |
| "step": 3140 |
| }, |
| { |
| "epoch": 0.07875, |
| "grad_norm": 32.75, |
| "grad_norm_var": 27.958072916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5223, |
| "loss/crossentropy": 2.1595762044191362, |
| "loss/hidden": 3.551171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22254167906939984, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.079, |
| "grad_norm": 29.75, |
| "grad_norm_var": 3.4056640625, |
| "learning_rate": 0.0001, |
| "loss": 7.6761, |
| "loss/crossentropy": 2.156154304742813, |
| "loss/hidden": 3.407421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2026256375014782, |
| "step": 3160 |
| }, |
| { |
| "epoch": 0.07925, |
| "grad_norm": 32.0, |
| "grad_norm_var": 7.246875, |
| "learning_rate": 0.0001, |
| "loss": 7.4683, |
| "loss/crossentropy": 2.1108324408531187, |
| "loss/hidden": 3.5015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18734413515776396, |
| "step": 3170 |
| }, |
| { |
| "epoch": 0.0795, |
| "grad_norm": 28.375, |
| "grad_norm_var": 3.9212890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5591, |
| "loss/crossentropy": 2.1108986347913743, |
| "loss/hidden": 3.487109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19466390162706376, |
| "step": 3180 |
| }, |
| { |
| "epoch": 0.07975, |
| "grad_norm": 32.5, |
| "grad_norm_var": 15.6962890625, |
| "learning_rate": 0.0001, |
| "loss": 7.6375, |
| "loss/crossentropy": 2.1180114537477492, |
| "loss/hidden": 3.478125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21690767258405685, |
| "step": 3190 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 31.875, |
| "grad_norm_var": 12.757747395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5977, |
| "loss/crossentropy": 2.1203838691115378, |
| "loss/hidden": 3.5421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20782926268875598, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.08025, |
| "grad_norm": 28.375, |
| "grad_norm_var": 3.3218098958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6303, |
| "loss/crossentropy": 2.1929849207401277, |
| "loss/hidden": 3.348828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19219291880726813, |
| "step": 3210 |
| }, |
| { |
| "epoch": 0.0805, |
| "grad_norm": 30.0, |
| "grad_norm_var": 3.1958333333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5282, |
| "loss/crossentropy": 2.2013367488980293, |
| "loss/hidden": 3.6, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21040805242955685, |
| "step": 3220 |
| }, |
| { |
| "epoch": 0.08075, |
| "grad_norm": 32.25, |
| "grad_norm_var": 1.8684895833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5331, |
| "loss/crossentropy": 2.184007254242897, |
| "loss/hidden": 3.371875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19979026056826116, |
| "step": 3230 |
| }, |
| { |
| "epoch": 0.081, |
| "grad_norm": 30.75, |
| "grad_norm_var": 2.130989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5964, |
| "loss/crossentropy": 2.2199858695268633, |
| "loss/hidden": 3.38515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21446770764887332, |
| "step": 3240 |
| }, |
| { |
| "epoch": 0.08125, |
| "grad_norm": 32.75, |
| "grad_norm_var": 2.6483723958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.602, |
| "loss/crossentropy": 2.1263694643974302, |
| "loss/hidden": 3.482421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19737922623753548, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.0815, |
| "grad_norm": 30.75, |
| "grad_norm_var": 3.207291666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5927, |
| "loss/crossentropy": 2.184669151902199, |
| "loss/hidden": 3.3359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18790993094444275, |
| "step": 3260 |
| }, |
| { |
| "epoch": 0.08175, |
| "grad_norm": 29.875, |
| "grad_norm_var": 2.857291666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.615, |
| "loss/crossentropy": 2.0831361666321753, |
| "loss/hidden": 3.48203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19330178536474704, |
| "step": 3270 |
| }, |
| { |
| "epoch": 0.082, |
| "grad_norm": 30.625, |
| "grad_norm_var": 15.702018229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6216, |
| "loss/crossentropy": 2.158697286248207, |
| "loss/hidden": 3.3765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18888361509889365, |
| "step": 3280 |
| }, |
| { |
| "epoch": 0.08225, |
| "grad_norm": 32.75, |
| "grad_norm_var": 18.211393229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.564, |
| "loss/crossentropy": 2.2913430631160736, |
| "loss/hidden": 3.4828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2058469709008932, |
| "step": 3290 |
| }, |
| { |
| "epoch": 0.0825, |
| "grad_norm": 35.5, |
| "grad_norm_var": 4.24140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5168, |
| "loss/crossentropy": 2.2065580666065214, |
| "loss/hidden": 3.3515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18786473274230958, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.08275, |
| "grad_norm": 32.5, |
| "grad_norm_var": 3.692643229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5099, |
| "loss/crossentropy": 2.1358665406703947, |
| "loss/hidden": 3.36875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18491616416722537, |
| "step": 3310 |
| }, |
| { |
| "epoch": 0.083, |
| "grad_norm": 31.75, |
| "grad_norm_var": 2.50390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6092, |
| "loss/crossentropy": 2.206757593154907, |
| "loss/hidden": 3.4328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20058272033929825, |
| "step": 3320 |
| }, |
| { |
| "epoch": 0.08325, |
| "grad_norm": 34.5, |
| "grad_norm_var": 1.7497395833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5493, |
| "loss/crossentropy": 2.0709328591823577, |
| "loss/hidden": 3.450390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1953151250258088, |
| "step": 3330 |
| }, |
| { |
| "epoch": 0.0835, |
| "grad_norm": 34.5, |
| "grad_norm_var": 2.9395182291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.7584, |
| "loss/crossentropy": 2.1559954971075057, |
| "loss/hidden": 3.5484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20604321975260972, |
| "step": 3340 |
| }, |
| { |
| "epoch": 0.08375, |
| "grad_norm": 33.75, |
| "grad_norm_var": 17.864322916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6969, |
| "loss/crossentropy": 2.1975975424051284, |
| "loss/hidden": 3.40078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2027706265449524, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.084, |
| "grad_norm": 33.25, |
| "grad_norm_var": 2.7997395833333334, |
| "learning_rate": 0.0001, |
| "loss": 7.61, |
| "loss/crossentropy": 2.018556122481823, |
| "loss/hidden": 3.403515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18029189426451922, |
| "step": 3360 |
| }, |
| { |
| "epoch": 0.08425, |
| "grad_norm": 33.0, |
| "grad_norm_var": 2.5994140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5397, |
| "loss/crossentropy": 2.1838466703891752, |
| "loss/hidden": 3.39453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2002351511269808, |
| "step": 3370 |
| }, |
| { |
| "epoch": 0.0845, |
| "grad_norm": 32.75, |
| "grad_norm_var": 2.912955729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5982, |
| "loss/crossentropy": 2.184953287243843, |
| "loss/hidden": 3.391015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19311312437057496, |
| "step": 3380 |
| }, |
| { |
| "epoch": 0.08475, |
| "grad_norm": 34.25, |
| "grad_norm_var": 3.309375, |
| "learning_rate": 0.0001, |
| "loss": 7.5841, |
| "loss/crossentropy": 2.2160476714372637, |
| "loss/hidden": 3.408984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1966065490618348, |
| "step": 3390 |
| }, |
| { |
| "epoch": 0.085, |
| "grad_norm": 31.75, |
| "grad_norm_var": 2.1936848958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.588, |
| "loss/crossentropy": 2.2071674168109894, |
| "loss/hidden": 3.423828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19414376243948936, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.08525, |
| "grad_norm": 33.0, |
| "grad_norm_var": 1.6301432291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5966, |
| "loss/crossentropy": 2.117925961315632, |
| "loss/hidden": 3.44609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21105701606720687, |
| "step": 3410 |
| }, |
| { |
| "epoch": 0.0855, |
| "grad_norm": 32.25, |
| "grad_norm_var": 5.82265625, |
| "learning_rate": 0.0001, |
| "loss": 7.62, |
| "loss/crossentropy": 2.0512605965137483, |
| "loss/hidden": 3.509375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20283049941062928, |
| "step": 3420 |
| }, |
| { |
| "epoch": 0.08575, |
| "grad_norm": 31.75, |
| "grad_norm_var": 6.539583333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5429, |
| "loss/crossentropy": 2.074887050688267, |
| "loss/hidden": 3.5, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18658901005983353, |
| "step": 3430 |
| }, |
| { |
| "epoch": 0.086, |
| "grad_norm": 33.5, |
| "grad_norm_var": 8.426497395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5759, |
| "loss/crossentropy": 2.1776267290115356, |
| "loss/hidden": 3.466796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20649599879980088, |
| "step": 3440 |
| }, |
| { |
| "epoch": 0.08625, |
| "grad_norm": 43.5, |
| "grad_norm_var": 14.962434895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6429, |
| "loss/crossentropy": 2.12281953394413, |
| "loss/hidden": 3.5265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20702828094363213, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.0865, |
| "grad_norm": 31.0, |
| "grad_norm_var": 196.96555989583334, |
| "learning_rate": 0.0001, |
| "loss": 7.7919, |
| "loss/crossentropy": 2.1028707295656206, |
| "loss/hidden": 3.536328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22825684808194638, |
| "step": 3460 |
| }, |
| { |
| "epoch": 0.08675, |
| "grad_norm": 32.25, |
| "grad_norm_var": 206.46979166666668, |
| "learning_rate": 0.0001, |
| "loss": 7.5698, |
| "loss/crossentropy": 2.1032180160284044, |
| "loss/hidden": 3.45625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1899772472679615, |
| "step": 3470 |
| }, |
| { |
| "epoch": 0.087, |
| "grad_norm": 37.0, |
| "grad_norm_var": 15.6322265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5884, |
| "loss/crossentropy": 2.0837722390890123, |
| "loss/hidden": 3.446484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20534363873302935, |
| "step": 3480 |
| }, |
| { |
| "epoch": 0.08725, |
| "grad_norm": 30.0, |
| "grad_norm_var": 16.8806640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5719, |
| "loss/crossentropy": 2.1673771381378173, |
| "loss/hidden": 3.45234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21180946305394172, |
| "step": 3490 |
| }, |
| { |
| "epoch": 0.0875, |
| "grad_norm": 33.0, |
| "grad_norm_var": 16.212955729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5171, |
| "loss/crossentropy": 2.2269717276096346, |
| "loss/hidden": 3.294140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18251859862357378, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.08775, |
| "grad_norm": 33.5, |
| "grad_norm_var": 395.25390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6825, |
| "loss/crossentropy": 2.2768601924180984, |
| "loss/hidden": 3.307421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17959882766008378, |
| "step": 3510 |
| }, |
| { |
| "epoch": 0.088, |
| "grad_norm": 31.875, |
| "grad_norm_var": 400.7280598958333, |
| "learning_rate": 0.0001, |
| "loss": 7.5387, |
| "loss/crossentropy": 2.174117147922516, |
| "loss/hidden": 3.325390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1909211568534374, |
| "step": 3520 |
| }, |
| { |
| "epoch": 0.08825, |
| "grad_norm": 34.0, |
| "grad_norm_var": 3.2527951689747005e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.5099, |
| "loss/crossentropy": 2.102574473619461, |
| "loss/hidden": 3.365625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18131749220192434, |
| "step": 3530 |
| }, |
| { |
| "epoch": 0.0885, |
| "grad_norm": 34.75, |
| "grad_norm_var": 3.252795168997245e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.5664, |
| "loss/crossentropy": 2.121107617020607, |
| "loss/hidden": 3.433203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18711038120090961, |
| "step": 3540 |
| }, |
| { |
| "epoch": 0.08875, |
| "grad_norm": 35.25, |
| "grad_norm_var": 26.5384765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5577, |
| "loss/crossentropy": 2.1354643225669863, |
| "loss/hidden": 3.372265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18915031235665083, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.089, |
| "grad_norm": 29.25, |
| "grad_norm_var": 39.177083333333336, |
| "learning_rate": 0.0001, |
| "loss": 7.5603, |
| "loss/crossentropy": 2.111011874675751, |
| "loss/hidden": 3.425390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20049102939665317, |
| "step": 3560 |
| }, |
| { |
| "epoch": 0.08925, |
| "grad_norm": 30.5, |
| "grad_norm_var": 24.4916015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5407, |
| "loss/crossentropy": 2.091498665511608, |
| "loss/hidden": 3.429296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19228591658174993, |
| "step": 3570 |
| }, |
| { |
| "epoch": 0.0895, |
| "grad_norm": 30.0, |
| "grad_norm_var": 21.582291666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5146, |
| "loss/crossentropy": 2.1603414684534075, |
| "loss/hidden": 3.586328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22721791528165342, |
| "step": 3580 |
| }, |
| { |
| "epoch": 0.08975, |
| "grad_norm": 29.25, |
| "grad_norm_var": 18.798893229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5322, |
| "loss/crossentropy": 2.1110543325543403, |
| "loss/hidden": 3.439453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19287437647581102, |
| "step": 3590 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 40.75, |
| "grad_norm_var": 15.987955729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.55, |
| "loss/crossentropy": 2.211816768348217, |
| "loss/hidden": 3.38984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19003268536180257, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.09025, |
| "grad_norm": 29.75, |
| "grad_norm_var": 14.333268229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.6044, |
| "loss/crossentropy": 2.2199724197387694, |
| "loss/hidden": 3.45234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19937946014106273, |
| "step": 3610 |
| }, |
| { |
| "epoch": 0.0905, |
| "grad_norm": 29.5, |
| "grad_norm_var": 7.994205729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5672, |
| "loss/crossentropy": 2.1754990458488463, |
| "loss/hidden": 3.455859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1872939633205533, |
| "step": 3620 |
| }, |
| { |
| "epoch": 0.09075, |
| "grad_norm": 29.625, |
| "grad_norm_var": 8.087955729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.443, |
| "loss/crossentropy": 2.2714238941669462, |
| "loss/hidden": 3.36484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1929878756403923, |
| "step": 3630 |
| }, |
| { |
| "epoch": 0.091, |
| "grad_norm": 30.25, |
| "grad_norm_var": 6.550455729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5694, |
| "loss/crossentropy": 2.1840985506772994, |
| "loss/hidden": 3.504296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19485698137432336, |
| "step": 3640 |
| }, |
| { |
| "epoch": 0.09125, |
| "grad_norm": 32.25, |
| "grad_norm_var": 7.7494140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5571, |
| "loss/crossentropy": 2.1566817820072175, |
| "loss/hidden": 3.503125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21431526727974415, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.0915, |
| "grad_norm": 34.75, |
| "grad_norm_var": 5.333268229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5377, |
| "loss/crossentropy": 2.0471107825636863, |
| "loss/hidden": 3.497265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20124074276536702, |
| "step": 3660 |
| }, |
| { |
| "epoch": 0.09175, |
| "grad_norm": 33.0, |
| "grad_norm_var": 7.4353515625, |
| "learning_rate": 0.0001, |
| "loss": 7.6125, |
| "loss/crossentropy": 2.1804106384515762, |
| "loss/hidden": 3.440625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22469761371612548, |
| "step": 3670 |
| }, |
| { |
| "epoch": 0.092, |
| "grad_norm": 33.5, |
| "grad_norm_var": 4.3572265625, |
| "learning_rate": 0.0001, |
| "loss": 7.6491, |
| "loss/crossentropy": 2.2595307737588883, |
| "loss/hidden": 3.352734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1894306108355522, |
| "step": 3680 |
| }, |
| { |
| "epoch": 0.09225, |
| "grad_norm": 36.25, |
| "grad_norm_var": 8.666666666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5857, |
| "loss/crossentropy": 2.0454846382141114, |
| "loss/hidden": 3.465234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1902542944997549, |
| "step": 3690 |
| }, |
| { |
| "epoch": 0.0925, |
| "grad_norm": 28.625, |
| "grad_norm_var": 6.204166666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6065, |
| "loss/crossentropy": 2.1835698932409286, |
| "loss/hidden": 3.4828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20310410112142563, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.09275, |
| "grad_norm": 35.25, |
| "grad_norm_var": 7.305989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6004, |
| "loss/crossentropy": 2.0759357810020447, |
| "loss/hidden": 3.446484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21512960288673638, |
| "step": 3710 |
| }, |
| { |
| "epoch": 0.093, |
| "grad_norm": 38.25, |
| "grad_norm_var": 19.737239583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6564, |
| "loss/crossentropy": 2.2961436778306963, |
| "loss/hidden": 3.318359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.198493617400527, |
| "step": 3720 |
| }, |
| { |
| "epoch": 0.09325, |
| "grad_norm": 30.5, |
| "grad_norm_var": 17.01015625, |
| "learning_rate": 0.0001, |
| "loss": 7.6998, |
| "loss/crossentropy": 2.1192551463842393, |
| "loss/hidden": 3.510546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19986802861094474, |
| "step": 3730 |
| }, |
| { |
| "epoch": 0.0935, |
| "grad_norm": 36.25, |
| "grad_norm_var": 10.20625, |
| "learning_rate": 0.0001, |
| "loss": 7.4855, |
| "loss/crossentropy": 2.0999212980270388, |
| "loss/hidden": 3.377734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19102167561650277, |
| "step": 3740 |
| }, |
| { |
| "epoch": 0.09375, |
| "grad_norm": 33.75, |
| "grad_norm_var": 7.556705729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6165, |
| "loss/crossentropy": 2.1783443093299866, |
| "loss/hidden": 3.418359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1862858783453703, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.094, |
| "grad_norm": 28.125, |
| "grad_norm_var": 5.3603515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5372, |
| "loss/crossentropy": 2.0993641003966332, |
| "loss/hidden": 3.405078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17717746701091527, |
| "step": 3760 |
| }, |
| { |
| "epoch": 0.09425, |
| "grad_norm": 30.625, |
| "grad_norm_var": 5.297330729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.614, |
| "loss/crossentropy": 2.1238688945770265, |
| "loss/hidden": 3.424609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20928110517561435, |
| "step": 3770 |
| }, |
| { |
| "epoch": 0.0945, |
| "grad_norm": 29.75, |
| "grad_norm_var": 4.573893229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5672, |
| "loss/crossentropy": 2.203542584180832, |
| "loss/hidden": 3.40546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19581303521990776, |
| "step": 3780 |
| }, |
| { |
| "epoch": 0.09475, |
| "grad_norm": 38.0, |
| "grad_norm_var": 5.322330729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5661, |
| "loss/crossentropy": 2.159272998571396, |
| "loss/hidden": 3.46953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1973846558481455, |
| "step": 3790 |
| }, |
| { |
| "epoch": 0.095, |
| "grad_norm": 30.75, |
| "grad_norm_var": 5.291666666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5495, |
| "loss/crossentropy": 2.188914805650711, |
| "loss/hidden": 3.375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2011850569397211, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.09525, |
| "grad_norm": 31.125, |
| "grad_norm_var": 6.112434895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5776, |
| "loss/crossentropy": 2.1599004954099654, |
| "loss/hidden": 3.464453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21134469993412494, |
| "step": 3810 |
| }, |
| { |
| "epoch": 0.0955, |
| "grad_norm": 35.25, |
| "grad_norm_var": 29.9431640625, |
| "learning_rate": 0.0001, |
| "loss": 7.6779, |
| "loss/crossentropy": 2.1196573287248612, |
| "loss/hidden": 3.576171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2261866919696331, |
| "step": 3820 |
| }, |
| { |
| "epoch": 0.09575, |
| "grad_norm": 37.5, |
| "grad_norm_var": 11.21640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4703, |
| "loss/crossentropy": 2.149521693587303, |
| "loss/hidden": 3.373046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1926643056795001, |
| "step": 3830 |
| }, |
| { |
| "epoch": 0.096, |
| "grad_norm": 32.0, |
| "grad_norm_var": 4.3494140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5513, |
| "loss/crossentropy": 2.1707202911376955, |
| "loss/hidden": 3.425, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19597616009414195, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.09625, |
| "grad_norm": 29.5, |
| "grad_norm_var": 630.7197916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6191, |
| "loss/crossentropy": 2.0859180808067324, |
| "loss/hidden": 3.4640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20257378201931714, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.0965, |
| "grad_norm": 59.5, |
| "grad_norm_var": 100.66223958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5774, |
| "loss/crossentropy": 2.1815837740898134, |
| "loss/hidden": 3.426953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18438388928771018, |
| "step": 3860 |
| }, |
| { |
| "epoch": 0.09675, |
| "grad_norm": 36.75, |
| "grad_norm_var": 66.29212239583333, |
| "learning_rate": 0.0001, |
| "loss": 7.5808, |
| "loss/crossentropy": 2.0505243610590695, |
| "loss/hidden": 3.440234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18692483827471734, |
| "step": 3870 |
| }, |
| { |
| "epoch": 0.097, |
| "grad_norm": 30.375, |
| "grad_norm_var": 4.266080729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5664, |
| "loss/crossentropy": 2.2033773183822634, |
| "loss/hidden": 3.4625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19963842574507, |
| "step": 3880 |
| }, |
| { |
| "epoch": 0.09725, |
| "grad_norm": 32.25, |
| "grad_norm_var": 5.0025390625, |
| "learning_rate": 0.0001, |
| "loss": 7.59, |
| "loss/crossentropy": 2.1328989803791045, |
| "loss/hidden": 3.417578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19634215533733368, |
| "step": 3890 |
| }, |
| { |
| "epoch": 0.0975, |
| "grad_norm": 34.0, |
| "grad_norm_var": 2.1322916666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6407, |
| "loss/crossentropy": 2.170455330610275, |
| "loss/hidden": 3.43828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19931643791496753, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.09775, |
| "grad_norm": 34.75, |
| "grad_norm_var": 3.558333333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5899, |
| "loss/crossentropy": 2.1301774442195893, |
| "loss/hidden": 3.468359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19248898830264807, |
| "step": 3910 |
| }, |
| { |
| "epoch": 0.098, |
| "grad_norm": 33.75, |
| "grad_norm_var": 3.3478515625, |
| "learning_rate": 0.0001, |
| "loss": 7.6526, |
| "loss/crossentropy": 2.1600559651851654, |
| "loss/hidden": 3.541796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22827934101223946, |
| "step": 3920 |
| }, |
| { |
| "epoch": 0.09825, |
| "grad_norm": 30.75, |
| "grad_norm_var": 6.117643229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5635, |
| "loss/crossentropy": 2.0728287249803543, |
| "loss/hidden": 3.534765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20216128267347813, |
| "step": 3930 |
| }, |
| { |
| "epoch": 0.0985, |
| "grad_norm": 33.25, |
| "grad_norm_var": 7.5087890625, |
| "learning_rate": 0.0001, |
| "loss": 7.7253, |
| "loss/crossentropy": 2.1514860481023788, |
| "loss/hidden": 3.530078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21096254773437978, |
| "step": 3940 |
| }, |
| { |
| "epoch": 0.09875, |
| "grad_norm": 34.25, |
| "grad_norm_var": 2.6207682291666665, |
| "learning_rate": 0.0001, |
| "loss": 7.538, |
| "loss/crossentropy": 2.169696259498596, |
| "loss/hidden": 3.48671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21556729041039943, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.099, |
| "grad_norm": 31.25, |
| "grad_norm_var": 5.3150390625, |
| "learning_rate": 0.0001, |
| "loss": 7.54, |
| "loss/crossentropy": 2.1874313950538635, |
| "loss/hidden": 3.399609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19146509394049643, |
| "step": 3960 |
| }, |
| { |
| "epoch": 0.09925, |
| "grad_norm": 32.0, |
| "grad_norm_var": 23.437239583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6292, |
| "loss/crossentropy": 2.165771406888962, |
| "loss/hidden": 3.592578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20717886611819267, |
| "step": 3970 |
| }, |
| { |
| "epoch": 0.0995, |
| "grad_norm": 31.375, |
| "grad_norm_var": 407.4603515625, |
| "learning_rate": 0.0001, |
| "loss": 7.7177, |
| "loss/crossentropy": 2.1150890797376634, |
| "loss/hidden": 3.501953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19676875434815883, |
| "step": 3980 |
| }, |
| { |
| "epoch": 0.09975, |
| "grad_norm": 33.5, |
| "grad_norm_var": 8.463997395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.6397, |
| "loss/crossentropy": 2.13585125207901, |
| "loss/hidden": 3.597265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2018281053751707, |
| "step": 3990 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 36.0, |
| "grad_norm_var": 8.787434895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.6957, |
| "loss/crossentropy": 2.062576304376125, |
| "loss/hidden": 3.556640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20351322293281554, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.10025, |
| "grad_norm": 32.25, |
| "grad_norm_var": 2.6931640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5171, |
| "loss/crossentropy": 2.1093045681715012, |
| "loss/hidden": 3.5, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19258121848106385, |
| "step": 4010 |
| }, |
| { |
| "epoch": 0.1005, |
| "grad_norm": 38.5, |
| "grad_norm_var": 6.976497395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.7046, |
| "loss/crossentropy": 2.1054726734757425, |
| "loss/hidden": 3.515234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18498583231121302, |
| "step": 4020 |
| }, |
| { |
| "epoch": 0.10075, |
| "grad_norm": 32.25, |
| "grad_norm_var": 16.50390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6242, |
| "loss/crossentropy": 2.0566830962896345, |
| "loss/hidden": 3.537890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21257028207182885, |
| "step": 4030 |
| }, |
| { |
| "epoch": 0.101, |
| "grad_norm": 30.125, |
| "grad_norm_var": 21.61640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5611, |
| "loss/crossentropy": 2.0847130313515665, |
| "loss/hidden": 3.378515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19317954257130623, |
| "step": 4040 |
| }, |
| { |
| "epoch": 0.10125, |
| "grad_norm": 31.0, |
| "grad_norm_var": 16.408268229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5456, |
| "loss/crossentropy": 2.116552269458771, |
| "loss/hidden": 3.444140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1943613938987255, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.1015, |
| "grad_norm": 31.25, |
| "grad_norm_var": 17.984375, |
| "learning_rate": 0.0001, |
| "loss": 7.6259, |
| "loss/crossentropy": 2.2868128657341003, |
| "loss/hidden": 3.494921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2375142715871334, |
| "step": 4060 |
| }, |
| { |
| "epoch": 0.10175, |
| "grad_norm": 29.625, |
| "grad_norm_var": 2.2025390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5637, |
| "loss/crossentropy": 2.092506285011768, |
| "loss/hidden": 3.466796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1903899708762765, |
| "step": 4070 |
| }, |
| { |
| "epoch": 0.102, |
| "grad_norm": 30.25, |
| "grad_norm_var": 55.06764322916667, |
| "learning_rate": 0.0001, |
| "loss": 7.6365, |
| "loss/crossentropy": 2.2538520216941835, |
| "loss/hidden": 3.4921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2204372201114893, |
| "step": 4080 |
| }, |
| { |
| "epoch": 0.10225, |
| "grad_norm": 48.0, |
| "grad_norm_var": 66.06555989583333, |
| "learning_rate": 0.0001, |
| "loss": 7.6539, |
| "loss/crossentropy": 2.198161965608597, |
| "loss/hidden": 3.3671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18744452036917209, |
| "step": 4090 |
| }, |
| { |
| "epoch": 0.1025, |
| "grad_norm": 31.625, |
| "grad_norm_var": 25.937239583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5872, |
| "loss/crossentropy": 2.161240801215172, |
| "loss/hidden": 3.548046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19324529767036439, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.10275, |
| "grad_norm": 31.125, |
| "grad_norm_var": 2.9613932291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5854, |
| "loss/crossentropy": 2.185439817607403, |
| "loss/hidden": 3.453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19476189762353896, |
| "step": 4110 |
| }, |
| { |
| "epoch": 0.103, |
| "grad_norm": 29.0, |
| "grad_norm_var": 5.4556640625, |
| "learning_rate": 0.0001, |
| "loss": 7.6728, |
| "loss/crossentropy": 2.1513148337602614, |
| "loss/hidden": 3.4875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20135847330093384, |
| "step": 4120 |
| }, |
| { |
| "epoch": 0.10325, |
| "grad_norm": 36.25, |
| "grad_norm_var": 4.3994140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6006, |
| "loss/crossentropy": 2.0776968479156492, |
| "loss/hidden": 3.49140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19831380508840085, |
| "step": 4130 |
| }, |
| { |
| "epoch": 0.1035, |
| "grad_norm": 33.0, |
| "grad_norm_var": 4.205143229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6157, |
| "loss/crossentropy": 2.0971890702843665, |
| "loss/hidden": 3.64140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2015662420541048, |
| "step": 4140 |
| }, |
| { |
| "epoch": 0.10375, |
| "grad_norm": 35.5, |
| "grad_norm_var": 23.512239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6638, |
| "loss/crossentropy": 2.128816670179367, |
| "loss/hidden": 3.420703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19698726907372474, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.104, |
| "grad_norm": 30.75, |
| "grad_norm_var": 22.026822916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6175, |
| "loss/crossentropy": 2.0965539067983627, |
| "loss/hidden": 3.455859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22271894477307796, |
| "step": 4160 |
| }, |
| { |
| "epoch": 0.10425, |
| "grad_norm": 32.5, |
| "grad_norm_var": 2.426041666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.555, |
| "loss/crossentropy": 2.215752348303795, |
| "loss/hidden": 3.364453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1962002281099558, |
| "step": 4170 |
| }, |
| { |
| "epoch": 0.1045, |
| "grad_norm": 39.5, |
| "grad_norm_var": 23.042708333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6246, |
| "loss/crossentropy": 2.0542988061904905, |
| "loss/hidden": 3.440625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19908196646720172, |
| "step": 4180 |
| }, |
| { |
| "epoch": 0.10475, |
| "grad_norm": 34.75, |
| "grad_norm_var": 6.342643229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.487, |
| "loss/crossentropy": 2.2133218079805372, |
| "loss/hidden": 3.382421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18823296912014484, |
| "step": 4190 |
| }, |
| { |
| "epoch": 0.105, |
| "grad_norm": 31.125, |
| "grad_norm_var": 173.98430989583332, |
| "learning_rate": 0.0001, |
| "loss": 7.643, |
| "loss/crossentropy": 2.1089532509446145, |
| "loss/hidden": 3.501953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20591201409697532, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.10525, |
| "grad_norm": 36.75, |
| "grad_norm_var": 7.351041666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5643, |
| "loss/crossentropy": 2.289369744062424, |
| "loss/hidden": 3.36640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19611021652817726, |
| "step": 4210 |
| }, |
| { |
| "epoch": 0.1055, |
| "grad_norm": 35.75, |
| "grad_norm_var": 6.539322916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6756, |
| "loss/crossentropy": 2.1963788866996765, |
| "loss/hidden": 3.49921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19495316371321678, |
| "step": 4220 |
| }, |
| { |
| "epoch": 0.10575, |
| "grad_norm": 31.875, |
| "grad_norm_var": 6.276041666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5552, |
| "loss/crossentropy": 2.078681927919388, |
| "loss/hidden": 3.441015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1941295877099037, |
| "step": 4230 |
| }, |
| { |
| "epoch": 0.106, |
| "grad_norm": 44.5, |
| "grad_norm_var": 34.80520833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6072, |
| "loss/crossentropy": 2.222626182436943, |
| "loss/hidden": 3.2828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1848284311592579, |
| "step": 4240 |
| }, |
| { |
| "epoch": 0.10625, |
| "grad_norm": 31.375, |
| "grad_norm_var": 35.084375, |
| "learning_rate": 0.0001, |
| "loss": 7.6435, |
| "loss/crossentropy": 2.152433153986931, |
| "loss/hidden": 3.3390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1829435657709837, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.1065, |
| "grad_norm": 31.625, |
| "grad_norm_var": 4.431705729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5977, |
| "loss/crossentropy": 2.1493207842111586, |
| "loss/hidden": 3.466015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19450047723948954, |
| "step": 4260 |
| }, |
| { |
| "epoch": 0.10675, |
| "grad_norm": 30.0, |
| "grad_norm_var": 8.024739583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5423, |
| "loss/crossentropy": 2.0623584628105163, |
| "loss/hidden": 3.42890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19236240349709988, |
| "step": 4270 |
| }, |
| { |
| "epoch": 0.107, |
| "grad_norm": 51.75, |
| "grad_norm_var": 105.76145833333334, |
| "learning_rate": 0.0001, |
| "loss": 7.577, |
| "loss/crossentropy": 2.103591626882553, |
| "loss/hidden": 3.369140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18201838787645103, |
| "step": 4280 |
| }, |
| { |
| "epoch": 0.10725, |
| "grad_norm": 33.25, |
| "grad_norm_var": 144.54973958333332, |
| "learning_rate": 0.0001, |
| "loss": 7.7078, |
| "loss/crossentropy": 2.1076686546206473, |
| "loss/hidden": 3.56328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19488887619227171, |
| "step": 4290 |
| }, |
| { |
| "epoch": 0.1075, |
| "grad_norm": 32.0, |
| "grad_norm_var": 190.62057291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5811, |
| "loss/crossentropy": 2.168550156056881, |
| "loss/hidden": 3.34453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1908732896670699, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.10775, |
| "grad_norm": 28.875, |
| "grad_norm_var": 149.1822265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5999, |
| "loss/crossentropy": 2.100285217165947, |
| "loss/hidden": 3.411328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19184609185904264, |
| "step": 4310 |
| }, |
| { |
| "epoch": 0.108, |
| "grad_norm": 44.25, |
| "grad_norm_var": 12.502018229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.7075, |
| "loss/crossentropy": 2.0968768775463102, |
| "loss/hidden": 3.51796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20168912429362534, |
| "step": 4320 |
| }, |
| { |
| "epoch": 0.10825, |
| "grad_norm": 31.25, |
| "grad_norm_var": 12.81875, |
| "learning_rate": 0.0001, |
| "loss": 7.5576, |
| "loss/crossentropy": 2.1037441343069077, |
| "loss/hidden": 3.370703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17891897186636924, |
| "step": 4330 |
| }, |
| { |
| "epoch": 0.1085, |
| "grad_norm": 33.5, |
| "grad_norm_var": 2.700455729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5097, |
| "loss/crossentropy": 2.210876139998436, |
| "loss/hidden": 3.36171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19316814988851547, |
| "step": 4340 |
| }, |
| { |
| "epoch": 0.10875, |
| "grad_norm": 31.125, |
| "grad_norm_var": 17.4666015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5755, |
| "loss/crossentropy": 2.1331328481435774, |
| "loss/hidden": 3.4609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20047767795622348, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.109, |
| "grad_norm": 32.75, |
| "grad_norm_var": 3.198372395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5385, |
| "loss/crossentropy": 2.153067779541016, |
| "loss/hidden": 3.476953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20173794813454152, |
| "step": 4360 |
| }, |
| { |
| "epoch": 0.10925, |
| "grad_norm": 32.0, |
| "grad_norm_var": 4.010416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5426, |
| "loss/crossentropy": 2.165090653300285, |
| "loss/hidden": 3.330859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18712956104427575, |
| "step": 4370 |
| }, |
| { |
| "epoch": 0.1095, |
| "grad_norm": 30.0, |
| "grad_norm_var": 1.77265625, |
| "learning_rate": 0.0001, |
| "loss": 7.605, |
| "loss/crossentropy": 2.217823189496994, |
| "loss/hidden": 3.37265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20003505125641824, |
| "step": 4380 |
| }, |
| { |
| "epoch": 0.10975, |
| "grad_norm": 29.875, |
| "grad_norm_var": 1.7603515625, |
| "learning_rate": 0.0001, |
| "loss": 7.507, |
| "loss/crossentropy": 2.138795481622219, |
| "loss/hidden": 3.48828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1892871480435133, |
| "step": 4390 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 30.375, |
| "grad_norm_var": 22.449739583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6965, |
| "loss/crossentropy": 2.22740375995636, |
| "loss/hidden": 3.5578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2264870759099722, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.11025, |
| "grad_norm": 29.5, |
| "grad_norm_var": 37.71666666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.604, |
| "loss/crossentropy": 2.1781785815954207, |
| "loss/hidden": 3.471875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2012148156762123, |
| "step": 4410 |
| }, |
| { |
| "epoch": 0.1105, |
| "grad_norm": 33.0, |
| "grad_norm_var": 25.305989583333332, |
| "learning_rate": 0.0001, |
| "loss": 7.5767, |
| "loss/crossentropy": 2.0333445832133292, |
| "loss/hidden": 3.55703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1852614250034094, |
| "step": 4420 |
| }, |
| { |
| "epoch": 0.11075, |
| "grad_norm": 32.0, |
| "grad_norm_var": 0.9760416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6218, |
| "loss/crossentropy": 2.2101993292570112, |
| "loss/hidden": 3.505078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2069159124046564, |
| "step": 4430 |
| }, |
| { |
| "epoch": 0.111, |
| "grad_norm": 30.5, |
| "grad_norm_var": 7.8837890625, |
| "learning_rate": 0.0001, |
| "loss": 7.6543, |
| "loss/crossentropy": 2.0182371377944945, |
| "loss/hidden": 3.475390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19055260960012674, |
| "step": 4440 |
| }, |
| { |
| "epoch": 0.11125, |
| "grad_norm": 29.0, |
| "grad_norm_var": 18.167643229166668, |
| "learning_rate": 0.0001, |
| "loss": 7.5559, |
| "loss/crossentropy": 2.209046494960785, |
| "loss/hidden": 3.4546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1926161792129278, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.1115, |
| "grad_norm": 30.375, |
| "grad_norm_var": 19.9634765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5527, |
| "loss/crossentropy": 2.2265418380498887, |
| "loss/hidden": 3.28671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17907681576907636, |
| "step": 4460 |
| }, |
| { |
| "epoch": 0.11175, |
| "grad_norm": 35.25, |
| "grad_norm_var": 3.0434895833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6046, |
| "loss/crossentropy": 2.1534146428108216, |
| "loss/hidden": 3.436328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19181067440658808, |
| "step": 4470 |
| }, |
| { |
| "epoch": 0.112, |
| "grad_norm": 31.375, |
| "grad_norm_var": 2.161393229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5151, |
| "loss/crossentropy": 2.2303753718733788, |
| "loss/hidden": 3.46796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19312014058232307, |
| "step": 4480 |
| }, |
| { |
| "epoch": 0.11225, |
| "grad_norm": 31.25, |
| "grad_norm_var": 2.071875, |
| "learning_rate": 0.0001, |
| "loss": 7.5733, |
| "loss/crossentropy": 2.2565354451537134, |
| "loss/hidden": 3.300390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19555974584072827, |
| "step": 4490 |
| }, |
| { |
| "epoch": 0.1125, |
| "grad_norm": 30.125, |
| "grad_norm_var": 6.21015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5721, |
| "loss/crossentropy": 2.1691703468561174, |
| "loss/hidden": 3.350390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19526711832731963, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.11275, |
| "grad_norm": 30.75, |
| "grad_norm_var": 34.985416666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6031, |
| "loss/crossentropy": 2.191486781835556, |
| "loss/hidden": 3.38515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2022854283452034, |
| "step": 4510 |
| }, |
| { |
| "epoch": 0.113, |
| "grad_norm": 32.75, |
| "grad_norm_var": 34.91041666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.566, |
| "loss/crossentropy": 2.07875557243824, |
| "loss/hidden": 3.510546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20517632961273194, |
| "step": 4520 |
| }, |
| { |
| "epoch": 0.11325, |
| "grad_norm": 30.25, |
| "grad_norm_var": 3.1869140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6204, |
| "loss/crossentropy": 2.1490323692560196, |
| "loss/hidden": 3.55390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20650937724858523, |
| "step": 4530 |
| }, |
| { |
| "epoch": 0.1135, |
| "grad_norm": 31.625, |
| "grad_norm_var": 3.6639973958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5605, |
| "loss/crossentropy": 2.19907369017601, |
| "loss/hidden": 3.3765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18378095962107183, |
| "step": 4540 |
| }, |
| { |
| "epoch": 0.11375, |
| "grad_norm": 33.0, |
| "grad_norm_var": 3.34140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5943, |
| "loss/crossentropy": 2.0509427756071092, |
| "loss/hidden": 3.46796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2012764386832714, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.114, |
| "grad_norm": 32.25, |
| "grad_norm_var": 55.52916666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5349, |
| "loss/crossentropy": 2.247987303137779, |
| "loss/hidden": 3.491015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2166461084038019, |
| "step": 4560 |
| }, |
| { |
| "epoch": 0.11425, |
| "grad_norm": 31.0, |
| "grad_norm_var": 59.064518229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5699, |
| "loss/crossentropy": 2.256947749853134, |
| "loss/hidden": 3.369921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19454225115478038, |
| "step": 4570 |
| }, |
| { |
| "epoch": 0.1145, |
| "grad_norm": 30.5, |
| "grad_norm_var": 12.089583333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5587, |
| "loss/crossentropy": 2.230518189072609, |
| "loss/hidden": 3.380078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19083393104374408, |
| "step": 4580 |
| }, |
| { |
| "epoch": 0.11475, |
| "grad_norm": 31.5, |
| "grad_norm_var": 18.3056640625, |
| "learning_rate": 0.0001, |
| "loss": 7.664, |
| "loss/crossentropy": 2.113222661614418, |
| "loss/hidden": 3.51640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20051947552710772, |
| "step": 4590 |
| }, |
| { |
| "epoch": 0.115, |
| "grad_norm": 29.25, |
| "grad_norm_var": 26.7837890625, |
| "learning_rate": 0.0001, |
| "loss": 7.6508, |
| "loss/crossentropy": 2.2963487923145296, |
| "loss/hidden": 3.430859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21824662014842033, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.11525, |
| "grad_norm": 31.75, |
| "grad_norm_var": 3.249739583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5491, |
| "loss/crossentropy": 2.2380147099494936, |
| "loss/hidden": 3.488671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2021485272794962, |
| "step": 4610 |
| }, |
| { |
| "epoch": 0.1155, |
| "grad_norm": 34.0, |
| "grad_norm_var": 3.2072265625, |
| "learning_rate": 0.0001, |
| "loss": 7.6035, |
| "loss/crossentropy": 2.1933206588029863, |
| "loss/hidden": 3.519921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2073045803233981, |
| "step": 4620 |
| }, |
| { |
| "epoch": 0.11575, |
| "grad_norm": 30.375, |
| "grad_norm_var": 25.005989583333335, |
| "learning_rate": 0.0001, |
| "loss": 7.577, |
| "loss/crossentropy": 2.3104471057653426, |
| "loss/hidden": 3.377734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20087463557720184, |
| "step": 4630 |
| }, |
| { |
| "epoch": 0.116, |
| "grad_norm": 33.5, |
| "grad_norm_var": 539.2056640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5764, |
| "loss/crossentropy": 2.1786745607852938, |
| "loss/hidden": 3.39453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19096632562577726, |
| "step": 4640 |
| }, |
| { |
| "epoch": 0.11625, |
| "grad_norm": 30.5, |
| "grad_norm_var": 132.26139322916666, |
| "learning_rate": 0.0001, |
| "loss": 7.7424, |
| "loss/crossentropy": 2.1628643572330475, |
| "loss/hidden": 3.526171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1994694285094738, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.1165, |
| "grad_norm": 40.25, |
| "grad_norm_var": 12.672330729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6456, |
| "loss/crossentropy": 2.0927803248167036, |
| "loss/hidden": 3.507421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20613454841077328, |
| "step": 4660 |
| }, |
| { |
| "epoch": 0.11675, |
| "grad_norm": 37.25, |
| "grad_norm_var": 7.458072916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.622, |
| "loss/crossentropy": 2.311227411031723, |
| "loss/hidden": 3.402734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21675110273063183, |
| "step": 4670 |
| }, |
| { |
| "epoch": 0.117, |
| "grad_norm": 34.5, |
| "grad_norm_var": 138.99524739583333, |
| "learning_rate": 0.0001, |
| "loss": 7.6633, |
| "loss/crossentropy": 2.1860357582569123, |
| "loss/hidden": 3.509375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20624178424477577, |
| "step": 4680 |
| }, |
| { |
| "epoch": 0.11725, |
| "grad_norm": 38.25, |
| "grad_norm_var": 11.161393229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.654, |
| "loss/crossentropy": 2.246461641788483, |
| "loss/hidden": 3.435546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19816880766302347, |
| "step": 4690 |
| }, |
| { |
| "epoch": 0.1175, |
| "grad_norm": 98.5, |
| "grad_norm_var": 275.63645833333334, |
| "learning_rate": 0.0001, |
| "loss": 7.6871, |
| "loss/crossentropy": 2.1662321478128432, |
| "loss/hidden": 3.52421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.23473294898867608, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.11775, |
| "grad_norm": 32.25, |
| "grad_norm_var": 273.496875, |
| "learning_rate": 0.0001, |
| "loss": 7.5664, |
| "loss/crossentropy": 2.1411470264196395, |
| "loss/hidden": 3.466015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20816716887056827, |
| "step": 4710 |
| }, |
| { |
| "epoch": 0.118, |
| "grad_norm": 30.625, |
| "grad_norm_var": 1.4697916666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5106, |
| "loss/crossentropy": 2.184460151195526, |
| "loss/hidden": 3.387890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19597234334796668, |
| "step": 4720 |
| }, |
| { |
| "epoch": 0.11825, |
| "grad_norm": 32.75, |
| "grad_norm_var": 151.39368489583333, |
| "learning_rate": 0.0001, |
| "loss": 7.7143, |
| "loss/crossentropy": 2.18603872358799, |
| "loss/hidden": 3.509375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21333505641669034, |
| "step": 4730 |
| }, |
| { |
| "epoch": 0.1185, |
| "grad_norm": 31.125, |
| "grad_norm_var": 41.80774739583333, |
| "learning_rate": 0.0001, |
| "loss": 7.5158, |
| "loss/crossentropy": 2.038896057009697, |
| "loss/hidden": 3.413671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18354782909154893, |
| "step": 4740 |
| }, |
| { |
| "epoch": 0.11875, |
| "grad_norm": 31.5, |
| "grad_norm_var": 20.727083333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.6044, |
| "loss/crossentropy": 2.07361024916172, |
| "loss/hidden": 3.505078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2144785810261965, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.119, |
| "grad_norm": 32.0, |
| "grad_norm_var": 14.9994140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5936, |
| "loss/crossentropy": 2.172766661643982, |
| "loss/hidden": 3.359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1904382836073637, |
| "step": 4760 |
| }, |
| { |
| "epoch": 0.11925, |
| "grad_norm": 33.0, |
| "grad_norm_var": 11.213541666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6024, |
| "loss/crossentropy": 2.2863214761018753, |
| "loss/hidden": 3.464453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20077989026904106, |
| "step": 4770 |
| }, |
| { |
| "epoch": 0.1195, |
| "grad_norm": 32.0, |
| "grad_norm_var": 23.308268229166668, |
| "learning_rate": 0.0001, |
| "loss": 7.5743, |
| "loss/crossentropy": 2.172411371767521, |
| "loss/hidden": 3.357421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1879224268719554, |
| "step": 4780 |
| }, |
| { |
| "epoch": 0.11975, |
| "grad_norm": 29.625, |
| "grad_norm_var": 27.662239583333335, |
| "learning_rate": 0.0001, |
| "loss": 7.6003, |
| "loss/crossentropy": 2.061740070581436, |
| "loss/hidden": 3.424609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.196690865047276, |
| "step": 4790 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 34.5, |
| "grad_norm_var": 9.820768229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5893, |
| "loss/crossentropy": 2.1725818127393723, |
| "loss/hidden": 3.33828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18248203694820403, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.12025, |
| "grad_norm": 31.375, |
| "grad_norm_var": 5.5478515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5542, |
| "loss/crossentropy": 2.0823758363723757, |
| "loss/hidden": 3.503125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19437449853867292, |
| "step": 4810 |
| }, |
| { |
| "epoch": 0.1205, |
| "grad_norm": 31.125, |
| "grad_norm_var": 2.8754557291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.7018, |
| "loss/crossentropy": 2.220066267251968, |
| "loss/hidden": 3.52734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20239269211888314, |
| "step": 4820 |
| }, |
| { |
| "epoch": 0.12075, |
| "grad_norm": 33.25, |
| "grad_norm_var": 6.868489583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5765, |
| "loss/crossentropy": 2.0765403911471365, |
| "loss/hidden": 3.435546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19827509336173535, |
| "step": 4830 |
| }, |
| { |
| "epoch": 0.121, |
| "grad_norm": 34.0, |
| "grad_norm_var": 27.168489583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.7248, |
| "loss/crossentropy": 2.1397932201623915, |
| "loss/hidden": 3.50546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.204028557613492, |
| "step": 4840 |
| }, |
| { |
| "epoch": 0.12125, |
| "grad_norm": 38.0, |
| "grad_norm_var": 22.633072916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6658, |
| "loss/crossentropy": 2.3317618519067764, |
| "loss/hidden": 3.33046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19565313905477524, |
| "step": 4850 |
| }, |
| { |
| "epoch": 0.1215, |
| "grad_norm": 29.875, |
| "grad_norm_var": 4.6041015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5415, |
| "loss/crossentropy": 2.060061091184616, |
| "loss/hidden": 3.491015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19083615019917488, |
| "step": 4860 |
| }, |
| { |
| "epoch": 0.12175, |
| "grad_norm": 33.75, |
| "grad_norm_var": 5.06875, |
| "learning_rate": 0.0001, |
| "loss": 7.651, |
| "loss/crossentropy": 2.158045071363449, |
| "loss/hidden": 3.52421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2148456061258912, |
| "step": 4870 |
| }, |
| { |
| "epoch": 0.122, |
| "grad_norm": 30.625, |
| "grad_norm_var": 16.091666666666665, |
| "learning_rate": 0.0001, |
| "loss": 7.5793, |
| "loss/crossentropy": 2.0583921030163763, |
| "loss/hidden": 3.487109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19111349806189537, |
| "step": 4880 |
| }, |
| { |
| "epoch": 0.12225, |
| "grad_norm": 33.25, |
| "grad_norm_var": 17.422330729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.6588, |
| "loss/crossentropy": 2.1186117827892303, |
| "loss/hidden": 3.500390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19436944983899593, |
| "step": 4890 |
| }, |
| { |
| "epoch": 0.1225, |
| "grad_norm": 36.75, |
| "grad_norm_var": 3.2676432291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4748, |
| "loss/crossentropy": 2.2382855489850044, |
| "loss/hidden": 3.39921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19216692261397839, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.12275, |
| "grad_norm": 29.375, |
| "grad_norm_var": 31.048958333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5018, |
| "loss/crossentropy": 2.1136436641216276, |
| "loss/hidden": 3.482421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19329534620046615, |
| "step": 4910 |
| }, |
| { |
| "epoch": 0.123, |
| "grad_norm": 40.5, |
| "grad_norm_var": 8.3572265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5661, |
| "loss/crossentropy": 2.043731611967087, |
| "loss/hidden": 3.472265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18812808189541103, |
| "step": 4920 |
| }, |
| { |
| "epoch": 0.12325, |
| "grad_norm": 31.25, |
| "grad_norm_var": 16.280989583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5565, |
| "loss/crossentropy": 2.129016649723053, |
| "loss/hidden": 3.345703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1941742904484272, |
| "step": 4930 |
| }, |
| { |
| "epoch": 0.1235, |
| "grad_norm": 30.125, |
| "grad_norm_var": 1.4504557291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5356, |
| "loss/crossentropy": 2.1981059461832047, |
| "loss/hidden": 3.533203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19572316966950892, |
| "step": 4940 |
| }, |
| { |
| "epoch": 0.12375, |
| "grad_norm": 54.5, |
| "grad_norm_var": 36.024739583333336, |
| "learning_rate": 0.0001, |
| "loss": 7.5463, |
| "loss/crossentropy": 2.107844803482294, |
| "loss/hidden": 3.551953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18902508020401002, |
| "step": 4950 |
| }, |
| { |
| "epoch": 0.124, |
| "grad_norm": 33.0, |
| "grad_norm_var": 64.778125, |
| "learning_rate": 0.0001, |
| "loss": 7.6053, |
| "loss/crossentropy": 2.1164773657917975, |
| "loss/hidden": 3.5078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19264463931322098, |
| "step": 4960 |
| }, |
| { |
| "epoch": 0.12425, |
| "grad_norm": 31.5, |
| "grad_norm_var": 42.0666015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5774, |
| "loss/crossentropy": 2.176864555478096, |
| "loss/hidden": 3.491796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20114411041140556, |
| "step": 4970 |
| }, |
| { |
| "epoch": 0.1245, |
| "grad_norm": 32.25, |
| "grad_norm_var": 33.07180989583333, |
| "learning_rate": 0.0001, |
| "loss": 7.5698, |
| "loss/crossentropy": 2.1456878036260605, |
| "loss/hidden": 3.38515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19218399338424205, |
| "step": 4980 |
| }, |
| { |
| "epoch": 0.12475, |
| "grad_norm": 33.25, |
| "grad_norm_var": 9.938997395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5467, |
| "loss/crossentropy": 2.3402266025543215, |
| "loss/hidden": 3.290625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18540082685649395, |
| "step": 4990 |
| }, |
| { |
| "epoch": 0.125, |
| "grad_norm": 32.5, |
| "grad_norm_var": 1.6580729166666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5883, |
| "loss/crossentropy": 2.186999189853668, |
| "loss/hidden": 3.341796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21516974158585073, |
| "step": 5000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 40000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 5000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.4287550160044032e+19, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |