| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.25, |
| "eval_steps": 2000, |
| "global_step": 10000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.00025, |
| "grad_norm": 31.5, |
| "learning_rate": 0.0001, |
| "loss": 7.633, |
| "loss/crossentropy": 2.065455098450184, |
| "loss/hidden": 3.476953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20220321230590343, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0005, |
| "grad_norm": 35.0, |
| "grad_norm_var": 2.6895182291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4618, |
| "loss/crossentropy": 1.9399560801684856, |
| "loss/hidden": 3.394140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19191570337861777, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.00075, |
| "grad_norm": 37.5, |
| "grad_norm_var": 6.579622395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5972, |
| "loss/crossentropy": 2.130601316690445, |
| "loss/hidden": 3.38984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20188977513462306, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.001, |
| "grad_norm": 33.5, |
| "grad_norm_var": 6.253125, |
| "learning_rate": 0.0001, |
| "loss": 7.5917, |
| "loss/crossentropy": 2.2571407079696657, |
| "loss/hidden": 3.422265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19847887996584176, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.00125, |
| "grad_norm": 32.25, |
| "grad_norm_var": 2.1619140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6054, |
| "loss/crossentropy": 2.1717565625905992, |
| "loss/hidden": 3.43359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20264342725276946, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.0015, |
| "grad_norm": 35.5, |
| "grad_norm_var": 15.786393229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5513, |
| "loss/crossentropy": 2.070718301087618, |
| "loss/hidden": 3.409375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19855907820165158, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.00175, |
| "grad_norm": 31.0, |
| "grad_norm_var": 12.4625, |
| "learning_rate": 0.0001, |
| "loss": 7.5447, |
| "loss/crossentropy": 2.118075390160084, |
| "loss/hidden": 3.473828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20283062420785428, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.002, |
| "grad_norm": 32.25, |
| "grad_norm_var": 1.2643229166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.468, |
| "loss/crossentropy": 2.0006178975105287, |
| "loss/hidden": 3.350390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18958428762853147, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.00225, |
| "grad_norm": 30.625, |
| "grad_norm_var": 3.470572916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5061, |
| "loss/crossentropy": 1.9605075903236866, |
| "loss/hidden": 3.54375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20559987109154462, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.0025, |
| "grad_norm": 31.125, |
| "grad_norm_var": 6.763541666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4928, |
| "loss/crossentropy": 2.1205389350652695, |
| "loss/hidden": 3.44140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19496036488562823, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.00275, |
| "grad_norm": 31.0, |
| "grad_norm_var": 6.1509765625, |
| "learning_rate": 0.0001, |
| "loss": 7.595, |
| "loss/crossentropy": 2.1240097641944886, |
| "loss/hidden": 3.43671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19564666803926228, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.003, |
| "grad_norm": 31.25, |
| "grad_norm_var": 3.348893229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5329, |
| "loss/crossentropy": 2.175096944719553, |
| "loss/hidden": 3.41796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21303062327206135, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.00325, |
| "grad_norm": 32.0, |
| "grad_norm_var": 2.8541666666666665, |
| "learning_rate": 0.0001, |
| "loss": 7.5536, |
| "loss/crossentropy": 2.1472502022981645, |
| "loss/hidden": 3.342578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18929538186639547, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.0035, |
| "grad_norm": 29.375, |
| "grad_norm_var": 29.683268229166668, |
| "learning_rate": 0.0001, |
| "loss": 7.5191, |
| "loss/crossentropy": 2.015011890232563, |
| "loss/hidden": 3.44296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20328481420874595, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.00375, |
| "grad_norm": 28.75, |
| "grad_norm_var": 28.74765625, |
| "learning_rate": 0.0001, |
| "loss": 7.4158, |
| "loss/crossentropy": 1.9774167470633983, |
| "loss/hidden": 3.43515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19464388117194176, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.004, |
| "grad_norm": 30.875, |
| "grad_norm_var": 1.3635416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6354, |
| "loss/crossentropy": 2.320629420876503, |
| "loss/hidden": 3.418359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20745602920651435, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.00425, |
| "grad_norm": 31.5, |
| "grad_norm_var": 1.0270182291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4137, |
| "loss/crossentropy": 1.900385806709528, |
| "loss/hidden": 3.345703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16769229620695114, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.0045, |
| "grad_norm": 31.25, |
| "grad_norm_var": 0.9833333333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5763, |
| "loss/crossentropy": 2.129625543951988, |
| "loss/hidden": 3.5171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2102549459785223, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.00475, |
| "grad_norm": 32.25, |
| "grad_norm_var": 3.05390625, |
| "learning_rate": 0.0001, |
| "loss": 7.6166, |
| "loss/crossentropy": 2.1552532628178596, |
| "loss/hidden": 3.469140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2250068686902523, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.005, |
| "grad_norm": 29.625, |
| "grad_norm_var": 3.8375, |
| "learning_rate": 0.0001, |
| "loss": 7.5745, |
| "loss/crossentropy": 1.9441482461988926, |
| "loss/hidden": 3.387890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.195942450594157, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.00525, |
| "grad_norm": 32.5, |
| "grad_norm_var": 18.396875, |
| "learning_rate": 0.0001, |
| "loss": 7.5292, |
| "loss/crossentropy": 1.9941987417638303, |
| "loss/hidden": 3.394140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18264975901693106, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.0055, |
| "grad_norm": 31.75, |
| "grad_norm_var": 20.736393229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.4899, |
| "loss/crossentropy": 2.0191620789468288, |
| "loss/hidden": 3.355078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18100650198757648, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.00575, |
| "grad_norm": 30.375, |
| "grad_norm_var": 2.342643229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5199, |
| "loss/crossentropy": 2.001779730618, |
| "loss/hidden": 3.32109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17959208656102418, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.006, |
| "grad_norm": 30.75, |
| "grad_norm_var": 1.271875, |
| "learning_rate": 0.0001, |
| "loss": 7.6842, |
| "loss/crossentropy": 2.1846971333026888, |
| "loss/hidden": 3.397265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2059234745800495, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.00625, |
| "grad_norm": 29.5, |
| "grad_norm_var": 5.688541666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5196, |
| "loss/crossentropy": 2.174124576151371, |
| "loss/hidden": 3.401953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20000722594559192, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.0065, |
| "grad_norm": 28.75, |
| "grad_norm_var": 1.9572265625, |
| "learning_rate": 0.0001, |
| "loss": 7.3875, |
| "loss/crossentropy": 1.9285166233778, |
| "loss/hidden": 3.396875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18449910767376423, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.00675, |
| "grad_norm": 33.5, |
| "grad_norm_var": 2.0999348958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5877, |
| "loss/crossentropy": 2.0323276594281197, |
| "loss/hidden": 3.37890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19395631980150937, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.007, |
| "grad_norm": 30.5, |
| "grad_norm_var": 2.15390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5791, |
| "loss/crossentropy": 2.126656140387058, |
| "loss/hidden": 3.496875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21661139875650406, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.00725, |
| "grad_norm": 29.5, |
| "grad_norm_var": 3.193489583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5587, |
| "loss/crossentropy": 2.200097793340683, |
| "loss/hidden": 3.529296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21046234332025052, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.0075, |
| "grad_norm": 26.75, |
| "grad_norm_var": 4.27265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5404, |
| "loss/crossentropy": 2.1184144005179406, |
| "loss/hidden": 3.487890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20949590150266886, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.00775, |
| "grad_norm": 33.0, |
| "grad_norm_var": 3.3643229166666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5628, |
| "loss/crossentropy": 1.9984030593186617, |
| "loss/hidden": 3.453515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18789457948878407, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.008, |
| "grad_norm": 32.5, |
| "grad_norm_var": 2.5645182291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5695, |
| "loss/crossentropy": 2.143594169616699, |
| "loss/hidden": 3.42421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19360470157116652, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.00825, |
| "grad_norm": 29.375, |
| "grad_norm_var": 1.8749348958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.3627, |
| "loss/crossentropy": 2.1077703177928924, |
| "loss/hidden": 3.373828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19771252572536469, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.0085, |
| "grad_norm": 29.75, |
| "grad_norm_var": 1.5978515625, |
| "learning_rate": 0.0001, |
| "loss": 7.4192, |
| "loss/crossentropy": 2.0583472289144993, |
| "loss/hidden": 3.3671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20189273860305548, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.00875, |
| "grad_norm": 29.875, |
| "grad_norm_var": 1.2872395833333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5432, |
| "loss/crossentropy": 2.0804511278867723, |
| "loss/hidden": 3.38828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19735569059848784, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.009, |
| "grad_norm": 30.5, |
| "grad_norm_var": 18.731184895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.4948, |
| "loss/crossentropy": 2.0466629534959795, |
| "loss/hidden": 3.315234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18366040643304588, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.00925, |
| "grad_norm": 30.875, |
| "grad_norm_var": 25.9916015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5081, |
| "loss/crossentropy": 1.9005662694573402, |
| "loss/hidden": 3.501171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1900689721107483, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.0095, |
| "grad_norm": 28.75, |
| "grad_norm_var": 2.451041666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4305, |
| "loss/crossentropy": 2.0674299761652946, |
| "loss/hidden": 3.517578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21062961965799332, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.00975, |
| "grad_norm": 31.25, |
| "grad_norm_var": 5.645247395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5168, |
| "loss/crossentropy": 2.0279919117689134, |
| "loss/hidden": 3.503125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20519332773983479, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 31.125, |
| "grad_norm_var": 5.928125, |
| "learning_rate": 0.0001, |
| "loss": 7.4985, |
| "loss/crossentropy": 2.0427632443606853, |
| "loss/hidden": 3.53125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20287631042301654, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.01025, |
| "grad_norm": 38.5, |
| "grad_norm_var": 438.43515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5633, |
| "loss/crossentropy": 2.199043881893158, |
| "loss/hidden": 3.397265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21130343191325665, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.0105, |
| "grad_norm": 30.875, |
| "grad_norm_var": 43.14140625, |
| "learning_rate": 0.0001, |
| "loss": 7.4835, |
| "loss/crossentropy": 1.9102243572473525, |
| "loss/hidden": 3.42578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1895731385797262, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.01075, |
| "grad_norm": 31.75, |
| "grad_norm_var": 5.658268229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.3897, |
| "loss/crossentropy": 2.159160128980875, |
| "loss/hidden": 3.464453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20280379485338926, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.011, |
| "grad_norm": 28.375, |
| "grad_norm_var": 16.3375, |
| "learning_rate": 0.0001, |
| "loss": 7.5463, |
| "loss/crossentropy": 2.1217672407627104, |
| "loss/hidden": 3.545703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.23856931366026402, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.01125, |
| "grad_norm": 30.5, |
| "grad_norm_var": 17.098372395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5225, |
| "loss/crossentropy": 1.969854873791337, |
| "loss/hidden": 3.430078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19548849146813155, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.0115, |
| "grad_norm": 29.875, |
| "grad_norm_var": 2.5677083333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5046, |
| "loss/crossentropy": 2.121321603655815, |
| "loss/hidden": 3.476171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19364523217082025, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.01175, |
| "grad_norm": 32.25, |
| "grad_norm_var": 8.585416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4558, |
| "loss/crossentropy": 1.9360710382461548, |
| "loss/hidden": 3.382421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1893781816586852, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.012, |
| "grad_norm": 29.875, |
| "grad_norm_var": 3.417122395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.531, |
| "loss/crossentropy": 2.082458943128586, |
| "loss/hidden": 3.471875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2220946006476879, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.01225, |
| "grad_norm": 31.0, |
| "grad_norm_var": 48.96640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5651, |
| "loss/crossentropy": 2.1382531195878984, |
| "loss/hidden": 3.480078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20847559962421655, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.0125, |
| "grad_norm": 29.875, |
| "grad_norm_var": 49.2666015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5679, |
| "loss/crossentropy": 2.0875915244221686, |
| "loss/hidden": 3.33125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1850985599681735, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.01275, |
| "grad_norm": 31.875, |
| "grad_norm_var": 1.45, |
| "learning_rate": 0.0001, |
| "loss": 7.5263, |
| "loss/crossentropy": 2.182442346215248, |
| "loss/hidden": 3.446484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19555890336632728, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.013, |
| "grad_norm": 34.0, |
| "grad_norm_var": 1.6931640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5209, |
| "loss/crossentropy": 1.9812136888504028, |
| "loss/hidden": 3.4921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1965757070109248, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.01325, |
| "grad_norm": 31.0, |
| "grad_norm_var": 2.101822916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6059, |
| "loss/crossentropy": 2.0372241511940956, |
| "loss/hidden": 3.564453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.204646560549736, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.0135, |
| "grad_norm": 29.125, |
| "grad_norm_var": 20.071875, |
| "learning_rate": 0.0001, |
| "loss": 7.5725, |
| "loss/crossentropy": 2.155761349201202, |
| "loss/hidden": 3.4125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19602423422038556, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.01375, |
| "grad_norm": 29.125, |
| "grad_norm_var": 20.506705729166665, |
| "learning_rate": 0.0001, |
| "loss": 7.5842, |
| "loss/crossentropy": 1.8869566857814788, |
| "loss/hidden": 3.437890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20957522764801978, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.014, |
| "grad_norm": 30.625, |
| "grad_norm_var": 10.025455729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4975, |
| "loss/crossentropy": 2.0370677679777147, |
| "loss/hidden": 3.361328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19026046600192786, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.01425, |
| "grad_norm": 33.0, |
| "grad_norm_var": 2.2270833333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5688, |
| "loss/crossentropy": 2.1931444257497787, |
| "loss/hidden": 3.415234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2036376902833581, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.0145, |
| "grad_norm": 35.0, |
| "grad_norm_var": 3.5681640625, |
| "learning_rate": 0.0001, |
| "loss": 7.478, |
| "loss/crossentropy": 2.061052493005991, |
| "loss/hidden": 3.478125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2282864760607481, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.01475, |
| "grad_norm": 32.5, |
| "grad_norm_var": 2.8705729166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5957, |
| "loss/crossentropy": 2.0078392371535303, |
| "loss/hidden": 3.45, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19647251404821872, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.015, |
| "grad_norm": 30.25, |
| "grad_norm_var": 31.449934895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5096, |
| "loss/crossentropy": 2.0417068414390087, |
| "loss/hidden": 3.423046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19782953998073935, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.01525, |
| "grad_norm": 30.5, |
| "grad_norm_var": 26.253059895833335, |
| "learning_rate": 0.0001, |
| "loss": 7.5368, |
| "loss/crossentropy": 2.1738049775362014, |
| "loss/hidden": 3.409765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1996332859620452, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.0155, |
| "grad_norm": 30.125, |
| "grad_norm_var": 2.334375, |
| "learning_rate": 0.0001, |
| "loss": 7.4868, |
| "loss/crossentropy": 1.7587297886610032, |
| "loss/hidden": 3.475390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18938990794122218, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.01575, |
| "grad_norm": 29.25, |
| "grad_norm_var": 27.393684895833335, |
| "learning_rate": 0.0001, |
| "loss": 7.4833, |
| "loss/crossentropy": 1.9551145888864994, |
| "loss/hidden": 3.384375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20075901364907622, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.016, |
| "grad_norm": 29.75, |
| "grad_norm_var": 29.6947265625, |
| "learning_rate": 0.0001, |
| "loss": 7.4608, |
| "loss/crossentropy": 2.128718316555023, |
| "loss/hidden": 3.3625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19077460393309592, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.01625, |
| "grad_norm": 29.75, |
| "grad_norm_var": 27.322330729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.6033, |
| "loss/crossentropy": 1.9678708665072917, |
| "loss/hidden": 3.413671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18875791020691396, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.0165, |
| "grad_norm": 30.375, |
| "grad_norm_var": 3.129622395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.3873, |
| "loss/crossentropy": 1.9582339562475681, |
| "loss/hidden": 3.34765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18309127148240806, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.01675, |
| "grad_norm": 32.75, |
| "grad_norm_var": 2.7009765625, |
| "learning_rate": 0.0001, |
| "loss": 7.4913, |
| "loss/crossentropy": 2.0773802563548087, |
| "loss/hidden": 3.505078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20910798981785775, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.017, |
| "grad_norm": 34.0, |
| "grad_norm_var": 3.3854166666666665, |
| "learning_rate": 0.0001, |
| "loss": 7.4847, |
| "loss/crossentropy": 2.12913373708725, |
| "loss/hidden": 3.402734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.201920267008245, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.01725, |
| "grad_norm": 30.75, |
| "grad_norm_var": 1.7176432291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5065, |
| "loss/crossentropy": 1.9141538538038732, |
| "loss/hidden": 3.44921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1841401271522045, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.0175, |
| "grad_norm": 31.0, |
| "grad_norm_var": 1.6374348958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5897, |
| "loss/crossentropy": 2.207232800126076, |
| "loss/hidden": 3.399609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21376523859798907, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.01775, |
| "grad_norm": 32.75, |
| "grad_norm_var": 2.3655598958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5075, |
| "loss/crossentropy": 2.03845998942852, |
| "loss/hidden": 3.41953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1920805646572262, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.018, |
| "grad_norm": 32.5, |
| "grad_norm_var": 1.3893229166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4669, |
| "loss/crossentropy": 2.054341807588935, |
| "loss/hidden": 3.489453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19716067584231495, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.01825, |
| "grad_norm": 31.625, |
| "grad_norm_var": 3.54140625, |
| "learning_rate": 0.0001, |
| "loss": 7.517, |
| "loss/crossentropy": 2.2111608639359472, |
| "loss/hidden": 3.409765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20262118335813284, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.0185, |
| "grad_norm": 29.125, |
| "grad_norm_var": 4.692122395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.4784, |
| "loss/crossentropy": 2.0551758617162705, |
| "loss/hidden": 3.446875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20378697756677866, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.01875, |
| "grad_norm": 33.0, |
| "grad_norm_var": 4.295572916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4016, |
| "loss/crossentropy": 2.128055375814438, |
| "loss/hidden": 3.3953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19904747987166047, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.019, |
| "grad_norm": 6106906624.0, |
| "grad_norm_var": 2.3308942582349476e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4633, |
| "loss/crossentropy": 2.248567137122154, |
| "loss/hidden": 3.37265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19723597317934036, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.01925, |
| "grad_norm": 28.5, |
| "grad_norm_var": 2.330894258158611e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4542, |
| "loss/crossentropy": 2.132212319970131, |
| "loss/hidden": 3.373828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18174959290772677, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.0195, |
| "grad_norm": 36.5, |
| "grad_norm_var": 4.833333333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.465, |
| "loss/crossentropy": 2.046277052164078, |
| "loss/hidden": 3.491015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21161840241402388, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.01975, |
| "grad_norm": 32.75, |
| "grad_norm_var": 5.137434895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.4171, |
| "loss/crossentropy": 2.058088332414627, |
| "loss/hidden": 3.315234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1815673651173711, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 30.125, |
| "grad_norm_var": 12.37265625, |
| "learning_rate": 0.0001, |
| "loss": 7.4153, |
| "loss/crossentropy": 2.064726157486439, |
| "loss/hidden": 3.515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19402222614735365, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.02025, |
| "grad_norm": 32.0, |
| "grad_norm_var": 12.240625, |
| "learning_rate": 0.0001, |
| "loss": 7.3739, |
| "loss/crossentropy": 2.0926051691174505, |
| "loss/hidden": 3.476953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21017331834882497, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.0205, |
| "grad_norm": 31.875, |
| "grad_norm_var": 3.6853515625, |
| "learning_rate": 0.0001, |
| "loss": 7.409, |
| "loss/crossentropy": 2.016859006881714, |
| "loss/hidden": 3.436328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20363395065069198, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.02075, |
| "grad_norm": 34.0, |
| "grad_norm_var": 278.1108723958333, |
| "learning_rate": 0.0001, |
| "loss": 7.6725, |
| "loss/crossentropy": 2.03957434669137, |
| "loss/hidden": 3.4625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19866096526384353, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.021, |
| "grad_norm": 35.75, |
| "grad_norm_var": 281.2239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4058, |
| "loss/crossentropy": 2.1190530106425287, |
| "loss/hidden": 3.41796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19663097113370895, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.02125, |
| "grad_norm": 32.25, |
| "grad_norm_var": 4.044791666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4687, |
| "loss/crossentropy": 2.1552326917648315, |
| "loss/hidden": 3.41796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19604418501257898, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.0215, |
| "grad_norm": 37.25, |
| "grad_norm_var": 2.7587362193217157e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.5552, |
| "loss/crossentropy": 2.1164004117250443, |
| "loss/hidden": 3.375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19724889248609542, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.02175, |
| "grad_norm": 35.25, |
| "grad_norm_var": 2.758736219342478e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.5021, |
| "loss/crossentropy": 2.036998500674963, |
| "loss/hidden": 3.298828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18320635841228067, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.022, |
| "grad_norm": 37.0, |
| "grad_norm_var": 16.9541015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5059, |
| "loss/crossentropy": 1.9707016140222549, |
| "loss/hidden": 3.36328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20436920877546072, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.02225, |
| "grad_norm": 31.375, |
| "grad_norm_var": 30.538541666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4935, |
| "loss/crossentropy": 2.206394499540329, |
| "loss/hidden": 3.366015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20495780408382416, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.0225, |
| "grad_norm": 29.875, |
| "grad_norm_var": 28.020833333333332, |
| "learning_rate": 0.0001, |
| "loss": 7.4823, |
| "loss/crossentropy": 2.091763325035572, |
| "loss/hidden": 3.43828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20592593550682067, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.02275, |
| "grad_norm": 31.875, |
| "grad_norm_var": 3.5645182291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.422, |
| "loss/crossentropy": 1.9740761511027813, |
| "loss/hidden": 3.494921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2015986293554306, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.023, |
| "grad_norm": 32.0, |
| "grad_norm_var": 56.256184895833336, |
| "learning_rate": 0.0001, |
| "loss": 7.4528, |
| "loss/crossentropy": 2.030415116250515, |
| "loss/hidden": 3.205078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1614784031175077, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.02325, |
| "grad_norm": 30.0, |
| "grad_norm_var": 57.1619140625, |
| "learning_rate": 0.0001, |
| "loss": 7.3713, |
| "loss/crossentropy": 2.0250086903572084, |
| "loss/hidden": 3.455859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19023355115205048, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.0235, |
| "grad_norm": 30.625, |
| "grad_norm_var": 1.3830729166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5277, |
| "loss/crossentropy": 2.222324788570404, |
| "loss/hidden": 3.366796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19078677501529456, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.02375, |
| "grad_norm": 31.0, |
| "grad_norm_var": 3.1455729166666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5086, |
| "loss/crossentropy": 2.1299516543745995, |
| "loss/hidden": 3.49921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21310927756130696, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.024, |
| "grad_norm": 29.875, |
| "grad_norm_var": 8.883072916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5579, |
| "loss/crossentropy": 2.0535727672278883, |
| "loss/hidden": 3.43828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18507701791822911, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.02425, |
| "grad_norm": 32.75, |
| "grad_norm_var": 2.5916015625, |
| "learning_rate": 0.0001, |
| "loss": 7.537, |
| "loss/crossentropy": 2.1785535484552385, |
| "loss/hidden": 3.309765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1955953363329172, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.0245, |
| "grad_norm": 36.5, |
| "grad_norm_var": 6.852083333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5091, |
| "loss/crossentropy": 2.0967498391866686, |
| "loss/hidden": 3.43515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2146583067253232, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.02475, |
| "grad_norm": 29.625, |
| "grad_norm_var": 4.325455729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5901, |
| "loss/crossentropy": 2.1134474128484726, |
| "loss/hidden": 3.3953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19056662563234567, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.025, |
| "grad_norm": 42.0, |
| "grad_norm_var": 4.1552039405313587e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.6082, |
| "loss/crossentropy": 2.0916516482830048, |
| "loss/hidden": 3.46640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19376826155930757, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.02525, |
| "grad_norm": 29.625, |
| "grad_norm_var": 4.1552039416015355e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4528, |
| "loss/crossentropy": 2.003750593960285, |
| "loss/hidden": 3.330859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18129821103066207, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.0255, |
| "grad_norm": 35.25, |
| "grad_norm_var": 24.095572916666665, |
| "learning_rate": 0.0001, |
| "loss": 7.5395, |
| "loss/crossentropy": 2.0453194856643675, |
| "loss/hidden": 3.477734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.199107607267797, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.02575, |
| "grad_norm": 32.25, |
| "grad_norm_var": 19.5259765625, |
| "learning_rate": 0.0001, |
| "loss": 7.31, |
| "loss/crossentropy": 2.1016619503498077, |
| "loss/hidden": 3.34453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.184703135676682, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.026, |
| "grad_norm": 30.75, |
| "grad_norm_var": 1.87890625, |
| "learning_rate": 0.0001, |
| "loss": 7.5425, |
| "loss/crossentropy": 2.1467826470732687, |
| "loss/hidden": 3.432421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20074132941663264, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.02625, |
| "grad_norm": 30.625, |
| "grad_norm_var": 0.7452473958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4114, |
| "loss/crossentropy": 2.049474111199379, |
| "loss/hidden": 3.41796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20267941821366547, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.0265, |
| "grad_norm": 31.75, |
| "grad_norm_var": 3.124739583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4845, |
| "loss/crossentropy": 2.036583887040615, |
| "loss/hidden": 3.391796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1893632340244949, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.02675, |
| "grad_norm": 40.75, |
| "grad_norm_var": 3.405847188209664e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.3982, |
| "loss/crossentropy": 2.124411530792713, |
| "loss/hidden": 3.4484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19454579129815103, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.027, |
| "grad_norm": 28.25, |
| "grad_norm_var": 3.4058471885941417e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.3928, |
| "loss/crossentropy": 2.0034691862761975, |
| "loss/hidden": 3.503515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21349683087319135, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.02725, |
| "grad_norm": 29.875, |
| "grad_norm_var": 4.88515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5095, |
| "loss/crossentropy": 1.9183670297265052, |
| "loss/hidden": 3.405859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19249978363513948, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.0275, |
| "grad_norm": 30.5, |
| "grad_norm_var": 3.2728515625, |
| "learning_rate": 0.0001, |
| "loss": 7.37, |
| "loss/crossentropy": 2.145428071916103, |
| "loss/hidden": 3.35703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19729665387421846, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.02775, |
| "grad_norm": 31.25, |
| "grad_norm_var": 2.34765625, |
| "learning_rate": 0.0001, |
| "loss": 7.4772, |
| "loss/crossentropy": 2.10652961358428, |
| "loss/hidden": 3.398046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19585925145074726, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.028, |
| "grad_norm": 31.25, |
| "grad_norm_var": 2.434477049308093e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4016, |
| "loss/crossentropy": 1.9645449101924897, |
| "loss/hidden": 3.44453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19977953620254993, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.02825, |
| "grad_norm": 32.0, |
| "grad_norm_var": 2.4344770492950907e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4453, |
| "loss/crossentropy": 2.131172102689743, |
| "loss/hidden": 3.383984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2083016105927527, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.0285, |
| "grad_norm": 32.75, |
| "grad_norm_var": 3.7080729166666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4009, |
| "loss/crossentropy": 2.003016713261604, |
| "loss/hidden": 3.34453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18665643623098732, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.02875, |
| "grad_norm": 30.875, |
| "grad_norm_var": 1.34765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5648, |
| "loss/crossentropy": 2.0709651306271555, |
| "loss/hidden": 3.45703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18793081305921078, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.029, |
| "grad_norm": 32.25, |
| "grad_norm_var": 2.1582682291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4644, |
| "loss/crossentropy": 2.06434089243412, |
| "loss/hidden": 3.454296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2109043262898922, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.02925, |
| "grad_norm": 31.375, |
| "grad_norm_var": 2.4010416666666665, |
| "learning_rate": 0.0001, |
| "loss": 7.4403, |
| "loss/crossentropy": 2.0107607185840606, |
| "loss/hidden": 3.498046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20349722560495137, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.0295, |
| "grad_norm": 33.25, |
| "grad_norm_var": 1.2260416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4412, |
| "loss/crossentropy": 2.096436749398708, |
| "loss/hidden": 3.474609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20087064132094384, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.02975, |
| "grad_norm": 29.75, |
| "grad_norm_var": 1.8046223958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4458, |
| "loss/crossentropy": 1.972258360683918, |
| "loss/hidden": 3.583984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20998958311975002, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 33.75, |
| "grad_norm_var": 3.7395833333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.3931, |
| "loss/crossentropy": 1.8556599006056786, |
| "loss/hidden": 3.397265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19810242671519518, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.03025, |
| "grad_norm": 29.0, |
| "grad_norm_var": 9.394791666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5849, |
| "loss/crossentropy": 2.0611833460628985, |
| "loss/hidden": 3.3984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19216072149574756, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.0305, |
| "grad_norm": 31.75, |
| "grad_norm_var": 3.26640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4844, |
| "loss/crossentropy": 2.0546294137835504, |
| "loss/hidden": 3.58828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21588555499911308, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.03075, |
| "grad_norm": 31.625, |
| "grad_norm_var": 2.3968098958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.4858, |
| "loss/crossentropy": 2.0615282475948336, |
| "loss/hidden": 3.3671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.206529095210135, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.031, |
| "grad_norm": 32.0, |
| "grad_norm_var": 1.6124348958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4647, |
| "loss/crossentropy": 1.9786661133170127, |
| "loss/hidden": 3.381640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17899234425276517, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.03125, |
| "grad_norm": 5838471168.0, |
| "grad_norm_var": 2.1304840753447437e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4926, |
| "loss/crossentropy": 2.04936410933733, |
| "loss/hidden": 3.714453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1995564555749297, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.0315, |
| "grad_norm": 31.25, |
| "grad_norm_var": 2.1304840747304878e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.5078, |
| "loss/crossentropy": 2.1189576953649523, |
| "loss/hidden": 3.43515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19967459067702292, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.03175, |
| "grad_norm": 30.5, |
| "grad_norm_var": 3.178580729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4255, |
| "loss/crossentropy": 2.163596141338348, |
| "loss/hidden": 3.4546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19321363251656293, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.032, |
| "grad_norm": 33.25, |
| "grad_norm_var": 2.1639973958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4609, |
| "loss/crossentropy": 1.9938266813755035, |
| "loss/hidden": 3.351953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18334759529680014, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.03225, |
| "grad_norm": 29.375, |
| "grad_norm_var": 1.67890625, |
| "learning_rate": 0.0001, |
| "loss": 7.4652, |
| "loss/crossentropy": 2.161333967000246, |
| "loss/hidden": 3.38828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19740422032773494, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.0325, |
| "grad_norm": 32.75, |
| "grad_norm_var": 3.0385416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.3146, |
| "loss/crossentropy": 2.0165325723588468, |
| "loss/hidden": 3.49921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19117104820907116, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.03275, |
| "grad_norm": 28.25, |
| "grad_norm_var": 9.158072916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4955, |
| "loss/crossentropy": 2.124955786764622, |
| "loss/hidden": 3.491015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19802952595055104, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.033, |
| "grad_norm": 30.75, |
| "grad_norm_var": 2.4535807291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4311, |
| "loss/crossentropy": 2.018800371140242, |
| "loss/hidden": 3.542578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2196814114227891, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.03325, |
| "grad_norm": 31.375, |
| "grad_norm_var": 2.39375, |
| "learning_rate": 0.0001, |
| "loss": 7.5164, |
| "loss/crossentropy": 2.0520452961325644, |
| "loss/hidden": 3.454296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2013697015121579, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.0335, |
| "grad_norm": 32.5, |
| "grad_norm_var": 1.0431640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5302, |
| "loss/crossentropy": 2.12932348549366, |
| "loss/hidden": 3.525, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20245677568018436, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.03375, |
| "grad_norm": 30.625, |
| "grad_norm_var": 3.3900390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5292, |
| "loss/crossentropy": 2.031618994474411, |
| "loss/hidden": 3.44140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19062725063413383, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.034, |
| "grad_norm": 32.0, |
| "grad_norm_var": 3.3447265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5755, |
| "loss/crossentropy": 2.2257011234760284, |
| "loss/hidden": 3.447265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1979327043518424, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.03425, |
| "grad_norm": 30.625, |
| "grad_norm_var": 3.3421223958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.4219, |
| "loss/crossentropy": 2.155778780579567, |
| "loss/hidden": 3.31796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19018295016139747, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.0345, |
| "grad_norm": 30.25, |
| "grad_norm_var": 2.5872395833333335, |
| "learning_rate": 0.0001, |
| "loss": 7.4637, |
| "loss/crossentropy": 2.058405503630638, |
| "loss/hidden": 3.39296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2114524593576789, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.03475, |
| "grad_norm": 32.5, |
| "grad_norm_var": 3.2994140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5834, |
| "loss/crossentropy": 2.1654782712459566, |
| "loss/hidden": 3.442578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2024593001231551, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.035, |
| "grad_norm": 31.125, |
| "grad_norm_var": 12.812239583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4442, |
| "loss/crossentropy": 2.0921876966953277, |
| "loss/hidden": 3.286328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19270132519304753, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.03525, |
| "grad_norm": 29.25, |
| "grad_norm_var": 1.5108723958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4779, |
| "loss/crossentropy": 1.9434148371219635, |
| "loss/hidden": 3.366015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17576389852911234, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.0355, |
| "grad_norm": 30.125, |
| "grad_norm_var": 2.154166666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.508, |
| "loss/crossentropy": 2.0766889482736586, |
| "loss/hidden": 3.485546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20394362770020963, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.03575, |
| "grad_norm": 30.125, |
| "grad_norm_var": 17.580208333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.4612, |
| "loss/crossentropy": 2.00380075648427, |
| "loss/hidden": 3.4453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18816210143268108, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.036, |
| "grad_norm": 31.375, |
| "grad_norm_var": 16.758268229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4602, |
| "loss/crossentropy": 2.1938020154833793, |
| "loss/hidden": 3.4234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2016971528530121, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.03625, |
| "grad_norm": 30.875, |
| "grad_norm_var": 1.2556640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4245, |
| "loss/crossentropy": 2.0232372283935547, |
| "loss/hidden": 3.40234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19209201391786337, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.0365, |
| "grad_norm": 31.0, |
| "grad_norm_var": 1.4041015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5518, |
| "loss/crossentropy": 2.2000616788864136, |
| "loss/hidden": 3.473046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22938326951116322, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.03675, |
| "grad_norm": 28.375, |
| "grad_norm_var": 2.0322916666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4397, |
| "loss/crossentropy": 2.0838582158088683, |
| "loss/hidden": 3.451953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20685861641541123, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.037, |
| "grad_norm": 32.0, |
| "grad_norm_var": 1.5020833333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4183, |
| "loss/crossentropy": 2.149951633810997, |
| "loss/hidden": 3.375390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1984950641170144, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.03725, |
| "grad_norm": 33.75, |
| "grad_norm_var": 34.10826822916667, |
| "learning_rate": 0.0001, |
| "loss": 7.453, |
| "loss/crossentropy": 2.128306310623884, |
| "loss/hidden": 3.33203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19783397912979125, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.0375, |
| "grad_norm": 29.5, |
| "grad_norm_var": 5.008072916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.469, |
| "loss/crossentropy": 2.042660539597273, |
| "loss/hidden": 3.365234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19274956732988358, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.03775, |
| "grad_norm": 33.0, |
| "grad_norm_var": 19.1775390625, |
| "learning_rate": 0.0001, |
| "loss": 7.4119, |
| "loss/crossentropy": 2.043857058137655, |
| "loss/hidden": 3.376953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18266947232186795, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.038, |
| "grad_norm": 29.625, |
| "grad_norm_var": 14.303580729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4362, |
| "loss/crossentropy": 1.9492302805185318, |
| "loss/hidden": 3.3515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1754497304558754, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.03825, |
| "grad_norm": 29.75, |
| "grad_norm_var": 23.764518229166665, |
| "learning_rate": 0.0001, |
| "loss": 7.4444, |
| "loss/crossentropy": 2.0668226674199106, |
| "loss/hidden": 3.473828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1921279976144433, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.0385, |
| "grad_norm": 32.75, |
| "grad_norm_var": 3.2226069790467994e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.5077, |
| "loss/crossentropy": 2.1122784771025183, |
| "loss/hidden": 3.46953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22245875597000123, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.03875, |
| "grad_norm": 30.25, |
| "grad_norm_var": 5.382291666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4525, |
| "loss/crossentropy": 2.264697426557541, |
| "loss/hidden": 3.432421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2075907403603196, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.039, |
| "grad_norm": 30.0, |
| "grad_norm_var": 6.353580729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5064, |
| "loss/crossentropy": 2.1150408178567885, |
| "loss/hidden": 3.5203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.23207673486322164, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.03925, |
| "grad_norm": 34.25, |
| "grad_norm_var": 6.72265625, |
| "learning_rate": 0.0001, |
| "loss": 7.4578, |
| "loss/crossentropy": 2.188142140209675, |
| "loss/hidden": 3.445703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20429779235273599, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.0395, |
| "grad_norm": 34.75, |
| "grad_norm_var": 897.6666015625, |
| "learning_rate": 0.0001, |
| "loss": 7.434, |
| "loss/crossentropy": 2.0795677445828913, |
| "loss/hidden": 3.3828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18706642352044583, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.03975, |
| "grad_norm": 28.0, |
| "grad_norm_var": 903.6327473958333, |
| "learning_rate": 0.0001, |
| "loss": 7.5655, |
| "loss/crossentropy": 2.1025844663381577, |
| "loss/hidden": 3.469140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1966788914054632, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 28.625, |
| "grad_norm_var": 11.97890625, |
| "learning_rate": 0.0001, |
| "loss": 7.2578, |
| "loss/crossentropy": 2.050418493151665, |
| "loss/hidden": 3.453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20104087069630622, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.04025, |
| "grad_norm": 28.0, |
| "grad_norm_var": 2.255989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4393, |
| "loss/crossentropy": 2.1767756581306457, |
| "loss/hidden": 3.5140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2213939843699336, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.0405, |
| "grad_norm": 29.75, |
| "grad_norm_var": 3.80390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5026, |
| "loss/crossentropy": 2.126803469657898, |
| "loss/hidden": 3.39375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19106289148330688, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.04075, |
| "grad_norm": 32.0, |
| "grad_norm_var": 3.1249348958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4274, |
| "loss/crossentropy": 2.144256164133549, |
| "loss/hidden": 3.424609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21435861438512802, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.041, |
| "grad_norm": 30.25, |
| "grad_norm_var": 29.265559895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5728, |
| "loss/crossentropy": 2.2575725719332693, |
| "loss/hidden": 3.4421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20658138059079648, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.04125, |
| "grad_norm": 30.5, |
| "grad_norm_var": 48.35390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5776, |
| "loss/crossentropy": 2.096929042041302, |
| "loss/hidden": 3.346875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18803389491513373, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.0415, |
| "grad_norm": 30.5, |
| "grad_norm_var": 1.1010416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.3792, |
| "loss/crossentropy": 2.0290944524109364, |
| "loss/hidden": 3.313671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19023821037262678, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.04175, |
| "grad_norm": 28.125, |
| "grad_norm_var": 33.49270833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5018, |
| "loss/crossentropy": 2.0678361281752586, |
| "loss/hidden": 3.35234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18862500675022603, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.042, |
| "grad_norm": 29.75, |
| "grad_norm_var": 2.2955729166666665, |
| "learning_rate": 0.0001, |
| "loss": 7.4432, |
| "loss/crossentropy": 2.0549797296524046, |
| "loss/hidden": 3.441796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19089050237089394, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.04225, |
| "grad_norm": 29.75, |
| "grad_norm_var": 1.8791666666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.3842, |
| "loss/crossentropy": 2.0077505365014074, |
| "loss/hidden": 3.382421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18722779098898173, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.0425, |
| "grad_norm": 29.375, |
| "grad_norm_var": 0.9434895833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4273, |
| "loss/crossentropy": 2.071325332671404, |
| "loss/hidden": 3.486328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20270166713744403, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.04275, |
| "grad_norm": 38.25, |
| "grad_norm_var": 7.669791666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4176, |
| "loss/crossentropy": 2.1353142291307448, |
| "loss/hidden": 3.453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19663168713450432, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.043, |
| "grad_norm": 28.25, |
| "grad_norm_var": 7.75, |
| "learning_rate": 0.0001, |
| "loss": 7.3818, |
| "loss/crossentropy": 1.9995346069335938, |
| "loss/hidden": 3.41015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18310597026720643, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.04325, |
| "grad_norm": 29.5, |
| "grad_norm_var": 3.7619140625, |
| "learning_rate": 0.0001, |
| "loss": 7.4912, |
| "loss/crossentropy": 2.1415088951587675, |
| "loss/hidden": 3.55078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22313783299177886, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.0435, |
| "grad_norm": 31.625, |
| "grad_norm_var": 3.0416666666666665, |
| "learning_rate": 0.0001, |
| "loss": 7.4999, |
| "loss/crossentropy": 2.1686330527067184, |
| "loss/hidden": 3.384765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20409150077030064, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.04375, |
| "grad_norm": 31.375, |
| "grad_norm_var": 2.724739583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.438, |
| "loss/crossentropy": 1.9411263287067413, |
| "loss/hidden": 3.304296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17631518254056572, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.044, |
| "grad_norm": 32.0, |
| "grad_norm_var": 1.9145833333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.679, |
| "loss/crossentropy": 2.1614590853452684, |
| "loss/hidden": 3.36015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.194198589771986, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.04425, |
| "grad_norm": 28.5, |
| "grad_norm_var": 2.039322916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5095, |
| "loss/crossentropy": 2.282147654891014, |
| "loss/hidden": 3.359765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19978236705064772, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.0445, |
| "grad_norm": 29.625, |
| "grad_norm_var": 2.34140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5296, |
| "loss/crossentropy": 2.2078514605760575, |
| "loss/hidden": 3.403515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19668537452816964, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.04475, |
| "grad_norm": 30.25, |
| "grad_norm_var": 2.70390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5779, |
| "loss/crossentropy": 2.1053253799676894, |
| "loss/hidden": 3.433984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20323336366564035, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.045, |
| "grad_norm": 28.5, |
| "grad_norm_var": 4.8712890625, |
| "learning_rate": 0.0001, |
| "loss": 7.4866, |
| "loss/crossentropy": 2.060333488881588, |
| "loss/hidden": 3.373828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18627767637372017, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.04525, |
| "grad_norm": 28.0, |
| "grad_norm_var": 14.480989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5225, |
| "loss/crossentropy": 1.9755317773669958, |
| "loss/hidden": 3.54375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20334282671101392, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.0455, |
| "grad_norm": 29.875, |
| "grad_norm_var": 12.935872395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.4781, |
| "loss/crossentropy": 2.1289859026670457, |
| "loss/hidden": 3.346484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1973018018528819, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.04575, |
| "grad_norm": 31.75, |
| "grad_norm_var": 2.123893229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.3915, |
| "loss/crossentropy": 1.9609280914068221, |
| "loss/hidden": 3.386328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1916458262130618, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.046, |
| "grad_norm": 32.0, |
| "grad_norm_var": 1.6332682291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5095, |
| "loss/crossentropy": 2.0019985377788543, |
| "loss/hidden": 3.384375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19768325993791222, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.04625, |
| "grad_norm": 29.875, |
| "grad_norm_var": 2.225455729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.623, |
| "loss/crossentropy": 2.0607564479112623, |
| "loss/hidden": 3.507421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20858939345926047, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.0465, |
| "grad_norm": 29.5, |
| "grad_norm_var": 1.9863932291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.3836, |
| "loss/crossentropy": 2.132562433928251, |
| "loss/hidden": 3.40859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1956317812204361, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.04675, |
| "grad_norm": 36.0, |
| "grad_norm_var": 3.2171223958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.4803, |
| "loss/crossentropy": 2.0316790327429772, |
| "loss/hidden": 3.396875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20630075875669718, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.047, |
| "grad_norm": 33.25, |
| "grad_norm_var": 16.304622395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.576, |
| "loss/crossentropy": 2.161964085698128, |
| "loss/hidden": 3.513671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21842746511101724, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.04725, |
| "grad_norm": 29.75, |
| "grad_norm_var": 2.3541666666666665, |
| "learning_rate": 0.0001, |
| "loss": 7.5036, |
| "loss/crossentropy": 1.8695943117141725, |
| "loss/hidden": 3.453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18793469872325658, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.0475, |
| "grad_norm": 34.25, |
| "grad_norm_var": 2.1780598958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5623, |
| "loss/crossentropy": 2.2376974314451217, |
| "loss/hidden": 3.489453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21696731727570295, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.04775, |
| "grad_norm": 30.75, |
| "grad_norm_var": 14.924934895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.388, |
| "loss/crossentropy": 1.9403380863368511, |
| "loss/hidden": 3.34921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18128401823341847, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.048, |
| "grad_norm": 29.25, |
| "grad_norm_var": 25.1916015625, |
| "learning_rate": 0.0001, |
| "loss": 7.4109, |
| "loss/crossentropy": 2.1744547933340073, |
| "loss/hidden": 3.423046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20097011709585785, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.04825, |
| "grad_norm": 29.25, |
| "grad_norm_var": 14.801822916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.2893, |
| "loss/crossentropy": 2.101319019496441, |
| "loss/hidden": 3.44140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1921493023633957, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.0485, |
| "grad_norm": 30.125, |
| "grad_norm_var": 14.517708333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.579, |
| "loss/crossentropy": 2.057158224284649, |
| "loss/hidden": 3.59140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21765361074358225, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.04875, |
| "grad_norm": 29.625, |
| "grad_norm_var": 15.790559895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.3712, |
| "loss/crossentropy": 1.9415803879499436, |
| "loss/hidden": 3.3359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18346730088815094, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.049, |
| "grad_norm": 27.625, |
| "grad_norm_var": 9.794791666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4902, |
| "loss/crossentropy": 2.035348242521286, |
| "loss/hidden": 3.439453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20268035624176264, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.04925, |
| "grad_norm": 35.25, |
| "grad_norm_var": 12.768684895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4627, |
| "loss/crossentropy": 2.054542076587677, |
| "loss/hidden": 3.426171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2003987120464444, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.0495, |
| "grad_norm": 36.0, |
| "grad_norm_var": 12.572916666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.353, |
| "loss/crossentropy": 1.9634785205125809, |
| "loss/hidden": 3.301171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17985089337453247, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.04975, |
| "grad_norm": 36.25, |
| "grad_norm_var": 9.2353515625, |
| "learning_rate": 0.0001, |
| "loss": 7.4473, |
| "loss/crossentropy": 2.059533824026585, |
| "loss/hidden": 3.3578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19096513148397207, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 29.125, |
| "grad_norm_var": 13.320572916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.3914, |
| "loss/crossentropy": 2.011685383319855, |
| "loss/hidden": 3.4421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19188414234668016, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.05025, |
| "grad_norm": 36.25, |
| "grad_norm_var": 14.026822916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4213, |
| "loss/crossentropy": 2.309766414761543, |
| "loss/hidden": 3.39453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20372038893401623, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.0505, |
| "grad_norm": 29.0, |
| "grad_norm_var": 9.237239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4145, |
| "loss/crossentropy": 2.1240487143397333, |
| "loss/hidden": 3.447265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20137840434908866, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.05075, |
| "grad_norm": 38.5, |
| "grad_norm_var": 89.21432291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.3696, |
| "loss/crossentropy": 2.112667274475098, |
| "loss/hidden": 3.487109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19770587887614965, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.051, |
| "grad_norm": 27.75, |
| "grad_norm_var": 94.06015625, |
| "learning_rate": 0.0001, |
| "loss": 7.2471, |
| "loss/crossentropy": 1.9955052442848682, |
| "loss/hidden": 3.30546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1880181163549423, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.05125, |
| "grad_norm": 35.25, |
| "grad_norm_var": 3.67265625, |
| "learning_rate": 0.0001, |
| "loss": 7.458, |
| "loss/crossentropy": 2.1320972844958304, |
| "loss/hidden": 3.384765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18908526431769132, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.0515, |
| "grad_norm": 38.75, |
| "grad_norm_var": 10.776822916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.3769, |
| "loss/crossentropy": 2.171598494052887, |
| "loss/hidden": 3.29765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18929236195981503, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.05175, |
| "grad_norm": 32.75, |
| "grad_norm_var": 10.53515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5279, |
| "loss/crossentropy": 2.0172302186489106, |
| "loss/hidden": 3.4203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2013201082125306, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.052, |
| "grad_norm": 32.0, |
| "grad_norm_var": 7.678125, |
| "learning_rate": 0.0001, |
| "loss": 7.3619, |
| "loss/crossentropy": 1.982726515084505, |
| "loss/hidden": 3.394921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17850281894207, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.05225, |
| "grad_norm": 29.75, |
| "grad_norm_var": 63.6681640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5109, |
| "loss/crossentropy": 2.121504098176956, |
| "loss/hidden": 3.50703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.240205854550004, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.0525, |
| "grad_norm": 34.5, |
| "grad_norm_var": 7.506184895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4658, |
| "loss/crossentropy": 2.110687591135502, |
| "loss/hidden": 3.530078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2039638390764594, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.05275, |
| "grad_norm": 32.5, |
| "grad_norm_var": 19.075455729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5668, |
| "loss/crossentropy": 1.9557841390371322, |
| "loss/hidden": 3.462109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18774209143593906, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.053, |
| "grad_norm": 31.125, |
| "grad_norm_var": 3.85390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5735, |
| "loss/crossentropy": 2.0219520531594752, |
| "loss/hidden": 3.3796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18533632289618254, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.05325, |
| "grad_norm": 32.25, |
| "grad_norm_var": 3.8910807291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4083, |
| "loss/crossentropy": 2.1359280541539194, |
| "loss/hidden": 3.412890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1897095028311014, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.0535, |
| "grad_norm": 31.25, |
| "grad_norm_var": 2.5957682291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.446, |
| "loss/crossentropy": 2.170258317142725, |
| "loss/hidden": 3.32109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1826348526403308, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.05375, |
| "grad_norm": 31.25, |
| "grad_norm_var": 3.785416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4014, |
| "loss/crossentropy": 2.131239393353462, |
| "loss/hidden": 3.303515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18656531646847724, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.054, |
| "grad_norm": 31.0, |
| "grad_norm_var": 4.8666015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5478, |
| "loss/crossentropy": 2.223896725475788, |
| "loss/hidden": 3.383203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1951376979239285, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.05425, |
| "grad_norm": 30.375, |
| "grad_norm_var": 8.437955729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5562, |
| "loss/crossentropy": 2.1203987300395966, |
| "loss/hidden": 3.351171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1970507999882102, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.0545, |
| "grad_norm": 32.0, |
| "grad_norm_var": 2.9488932291666665, |
| "learning_rate": 0.0001, |
| "loss": 7.5532, |
| "loss/crossentropy": 2.080265050381422, |
| "loss/hidden": 3.544140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2216239819303155, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.05475, |
| "grad_norm": 31.125, |
| "grad_norm_var": 8.1728515625, |
| "learning_rate": 0.0001, |
| "loss": 7.382, |
| "loss/crossentropy": 2.2114535331726075, |
| "loss/hidden": 3.37734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20577374435961246, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.055, |
| "grad_norm": 28.875, |
| "grad_norm_var": 14.520833333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5766, |
| "loss/crossentropy": 2.1003271512687207, |
| "loss/hidden": 3.358984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18811229150742292, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.05525, |
| "grad_norm": 33.5, |
| "grad_norm_var": 16.099739583333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5553, |
| "loss/crossentropy": 2.1326127350330353, |
| "loss/hidden": 3.436328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22006579730659723, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.0555, |
| "grad_norm": 32.25, |
| "grad_norm_var": 9.305143229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.3766, |
| "loss/crossentropy": 2.1496046826243402, |
| "loss/hidden": 3.476171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1952402491122484, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.05575, |
| "grad_norm": 29.125, |
| "grad_norm_var": 6.805143229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.3648, |
| "loss/crossentropy": 2.13938904479146, |
| "loss/hidden": 3.36640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19394674636423587, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.056, |
| "grad_norm": 27.625, |
| "grad_norm_var": 15.0712890625, |
| "learning_rate": 0.0001, |
| "loss": 7.4292, |
| "loss/crossentropy": 2.0648645758628845, |
| "loss/hidden": 3.436328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18520106598734856, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.05625, |
| "grad_norm": 29.25, |
| "grad_norm_var": 12.034309895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4469, |
| "loss/crossentropy": 2.080448921024799, |
| "loss/hidden": 3.3109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18507405128329993, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.0565, |
| "grad_norm": 31.375, |
| "grad_norm_var": 2.014518229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4325, |
| "loss/crossentropy": 2.0871294140815735, |
| "loss/hidden": 3.409375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20059894528239966, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.05675, |
| "grad_norm": 28.75, |
| "grad_norm_var": 1.8103515625, |
| "learning_rate": 0.0001, |
| "loss": 7.4268, |
| "loss/crossentropy": 2.010594163835049, |
| "loss/hidden": 3.39453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19413960948586464, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.057, |
| "grad_norm": 32.5, |
| "grad_norm_var": 4.0369140625, |
| "learning_rate": 0.0001, |
| "loss": 7.4346, |
| "loss/crossentropy": 2.1129174560308455, |
| "loss/hidden": 3.416015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1961110396310687, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.05725, |
| "grad_norm": 39.0, |
| "grad_norm_var": 30.42265625, |
| "learning_rate": 0.0001, |
| "loss": 7.4422, |
| "loss/crossentropy": 2.002947611361742, |
| "loss/hidden": 3.432421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2081361676566303, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.0575, |
| "grad_norm": 37.25, |
| "grad_norm_var": 25.699934895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.4312, |
| "loss/crossentropy": 2.06134437918663, |
| "loss/hidden": 3.376171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18918452728539706, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.05775, |
| "grad_norm": 28.875, |
| "grad_norm_var": 9.115559895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.4209, |
| "loss/crossentropy": 2.041922479122877, |
| "loss/hidden": 3.403515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20907302405685185, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.058, |
| "grad_norm": 30.125, |
| "grad_norm_var": 22.248372395833332, |
| "learning_rate": 0.0001, |
| "loss": 7.6844, |
| "loss/crossentropy": 2.0152460247278214, |
| "loss/hidden": 3.426171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1905667196959257, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.05825, |
| "grad_norm": 38.25, |
| "grad_norm_var": 31.398893229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4713, |
| "loss/crossentropy": 2.105386929959059, |
| "loss/hidden": 3.452734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1982942834496498, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.0585, |
| "grad_norm": 28.375, |
| "grad_norm_var": 54.94264322916667, |
| "learning_rate": 0.0001, |
| "loss": 7.4575, |
| "loss/crossentropy": 2.2358868844807147, |
| "loss/hidden": 3.419921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19232469592243434, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.05875, |
| "grad_norm": 33.5, |
| "grad_norm_var": 165.74583333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.2987, |
| "loss/crossentropy": 1.9657546751201154, |
| "loss/hidden": 3.3921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18062973748892547, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.059, |
| "grad_norm": 41.0, |
| "grad_norm_var": 15.376822916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4431, |
| "loss/crossentropy": 2.191007924079895, |
| "loss/hidden": 3.3609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2068317520432174, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.05925, |
| "grad_norm": 30.625, |
| "grad_norm_var": 12.109375, |
| "learning_rate": 0.0001, |
| "loss": 7.3325, |
| "loss/crossentropy": 2.0140789330005644, |
| "loss/hidden": 3.4109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18166892379522323, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.0595, |
| "grad_norm": 31.875, |
| "grad_norm_var": 6.941666666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4039, |
| "loss/crossentropy": 2.0221361994743345, |
| "loss/hidden": 3.401953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1934544663876295, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.05975, |
| "grad_norm": 30.125, |
| "grad_norm_var": 10.472330729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5862, |
| "loss/crossentropy": 1.9840030640363693, |
| "loss/hidden": 3.46640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19178631734102963, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 29.875, |
| "grad_norm_var": 14.10625, |
| "learning_rate": 0.0001, |
| "loss": 7.4826, |
| "loss/crossentropy": 2.1700179904699324, |
| "loss/hidden": 3.408984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1915024297311902, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.06025, |
| "grad_norm": 32.75, |
| "grad_norm_var": 7.370768229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.3889, |
| "loss/crossentropy": 2.091843403875828, |
| "loss/hidden": 3.358203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18695627991110086, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.0605, |
| "grad_norm": 29.0, |
| "grad_norm_var": 9.922330729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4655, |
| "loss/crossentropy": 2.172381104528904, |
| "loss/hidden": 3.380078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20078962799161673, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.06075, |
| "grad_norm": 34.25, |
| "grad_norm_var": 8.637239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.519, |
| "loss/crossentropy": 1.995463601499796, |
| "loss/hidden": 3.411328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1993358489125967, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.061, |
| "grad_norm": 31.25, |
| "grad_norm_var": 11.9431640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5169, |
| "loss/crossentropy": 2.296917426586151, |
| "loss/hidden": 3.513671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.23228074796497822, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.06125, |
| "grad_norm": 30.25, |
| "grad_norm_var": 3.4368798046573737e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.5038, |
| "loss/crossentropy": 2.1944432735443113, |
| "loss/hidden": 3.3921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21073084995150565, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.0615, |
| "grad_norm": 33.5, |
| "grad_norm_var": 3.436879805205814e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4423, |
| "loss/crossentropy": 2.152103579044342, |
| "loss/hidden": 3.512109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20929353777319193, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.06175, |
| "grad_norm": 39.0, |
| "grad_norm_var": 2.2045823633093297e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4382, |
| "loss/crossentropy": 2.017627691477537, |
| "loss/hidden": 3.355078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19590776292607187, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.062, |
| "grad_norm": 29.375, |
| "grad_norm_var": 2.2045823636681523e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4072, |
| "loss/crossentropy": 2.1076912328600885, |
| "loss/hidden": 3.433203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1988623272627592, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.06225, |
| "grad_norm": 30.125, |
| "grad_norm_var": 3.2494140625, |
| "learning_rate": 0.0001, |
| "loss": 7.3192, |
| "loss/crossentropy": 1.9777067750692368, |
| "loss/hidden": 3.429296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20539684109389783, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.0625, |
| "grad_norm": 29.125, |
| "grad_norm_var": 5.580208333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.3283, |
| "loss/crossentropy": 2.061080713570118, |
| "loss/hidden": 3.4953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20077812522649766, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.06275, |
| "grad_norm": 28.375, |
| "grad_norm_var": 5.618489583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4401, |
| "loss/crossentropy": 2.2099071338772776, |
| "loss/hidden": 3.411328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2055276283994317, |
| "step": 2510 |
| }, |
| { |
| "epoch": 0.063, |
| "grad_norm": 28.125, |
| "grad_norm_var": 7.118684895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.3509, |
| "loss/crossentropy": 1.962952435016632, |
| "loss/hidden": 3.421484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19731322024017572, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.06325, |
| "grad_norm": 31.375, |
| "grad_norm_var": 1.9681640625, |
| "learning_rate": 0.0001, |
| "loss": 7.3695, |
| "loss/crossentropy": 1.9843583509325982, |
| "loss/hidden": 3.4921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2062232268974185, |
| "step": 2530 |
| }, |
| { |
| "epoch": 0.0635, |
| "grad_norm": 31.5, |
| "grad_norm_var": 3.7988932291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4485, |
| "loss/crossentropy": 2.1427679538726805, |
| "loss/hidden": 3.38125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2011977185495198, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.06375, |
| "grad_norm": 30.0, |
| "grad_norm_var": 2.5885416666666665, |
| "learning_rate": 0.0001, |
| "loss": 7.4157, |
| "loss/crossentropy": 1.9085583783686162, |
| "loss/hidden": 3.325, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17416954301297666, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.064, |
| "grad_norm": 31.25, |
| "grad_norm_var": 1.21015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5141, |
| "loss/crossentropy": 1.9622327491641045, |
| "loss/hidden": 3.361328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18756412472575903, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.06425, |
| "grad_norm": 30.0, |
| "grad_norm_var": 1.7143229166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4624, |
| "loss/crossentropy": 2.192887546122074, |
| "loss/hidden": 3.4296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1984951412305236, |
| "step": 2570 |
| }, |
| { |
| "epoch": 0.0645, |
| "grad_norm": 30.125, |
| "grad_norm_var": 1.9143229166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.3947, |
| "loss/crossentropy": 2.102549520134926, |
| "loss/hidden": 3.463671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1989850653335452, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.06475, |
| "grad_norm": 32.25, |
| "grad_norm_var": 9.5322265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5147, |
| "loss/crossentropy": 2.213281115144491, |
| "loss/hidden": 3.405859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2027151037938893, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.065, |
| "grad_norm": 30.625, |
| "grad_norm_var": 2.3427083333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4691, |
| "loss/crossentropy": 2.1138279482722284, |
| "loss/hidden": 3.379296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20825629755854608, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.06525, |
| "grad_norm": 36.0, |
| "grad_norm_var": 3.3395182291666665, |
| "learning_rate": 0.0001, |
| "loss": 7.4775, |
| "loss/crossentropy": 2.107349547743797, |
| "loss/hidden": 3.404296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19337845854461194, |
| "step": 2610 |
| }, |
| { |
| "epoch": 0.0655, |
| "grad_norm": 29.25, |
| "grad_norm_var": 12.757291666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5438, |
| "loss/crossentropy": 2.0628502368927, |
| "loss/hidden": 3.4984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20967572089284658, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.06575, |
| "grad_norm": 28.625, |
| "grad_norm_var": 11.805208333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.3354, |
| "loss/crossentropy": 2.1009589530527593, |
| "loss/hidden": 3.33828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18132725274190306, |
| "step": 2630 |
| }, |
| { |
| "epoch": 0.066, |
| "grad_norm": 32.5, |
| "grad_norm_var": 2.730208333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4257, |
| "loss/crossentropy": 1.983342681080103, |
| "loss/hidden": 3.480078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19340286049991845, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.06625, |
| "grad_norm": 30.25, |
| "grad_norm_var": 3.7549465282226944e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.309, |
| "loss/crossentropy": 2.0057250812649725, |
| "loss/hidden": 3.418359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18936716187745334, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.0665, |
| "grad_norm": 36.25, |
| "grad_norm_var": 8.832747395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5442, |
| "loss/crossentropy": 2.054753464460373, |
| "loss/hidden": 3.410546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2035602940246463, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.06675, |
| "grad_norm": 32.5, |
| "grad_norm_var": 4.8900390625, |
| "learning_rate": 0.0001, |
| "loss": 7.4106, |
| "loss/crossentropy": 2.0181221179664135, |
| "loss/hidden": 3.3859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1878144398331642, |
| "step": 2670 |
| }, |
| { |
| "epoch": 0.067, |
| "grad_norm": 30.125, |
| "grad_norm_var": 4.280989583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4597, |
| "loss/crossentropy": 2.200540581345558, |
| "loss/hidden": 3.4046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20286752395331858, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.06725, |
| "grad_norm": 31.75, |
| "grad_norm_var": 3.8559895833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4643, |
| "loss/crossentropy": 2.0630861818790436, |
| "loss/hidden": 3.419921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20401672925800085, |
| "step": 2690 |
| }, |
| { |
| "epoch": 0.0675, |
| "grad_norm": 33.0, |
| "grad_norm_var": 7.073958333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4001, |
| "loss/crossentropy": 1.927167509496212, |
| "loss/hidden": 3.31328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17901942003518342, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.06775, |
| "grad_norm": 30.25, |
| "grad_norm_var": 8.9009765625, |
| "learning_rate": 0.0001, |
| "loss": 7.3461, |
| "loss/crossentropy": 2.0538916781544687, |
| "loss/hidden": 3.35234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1864149821922183, |
| "step": 2710 |
| }, |
| { |
| "epoch": 0.068, |
| "grad_norm": 29.5, |
| "grad_norm_var": 2.218489583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.526, |
| "loss/crossentropy": 2.211588367819786, |
| "loss/hidden": 3.487890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20801848396658898, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.06825, |
| "grad_norm": 31.375, |
| "grad_norm_var": 1.0768229166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5535, |
| "loss/crossentropy": 2.268890543282032, |
| "loss/hidden": 3.39921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21352684032171965, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.0685, |
| "grad_norm": 33.25, |
| "grad_norm_var": 5.663997395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.411, |
| "loss/crossentropy": 1.902898482978344, |
| "loss/hidden": 3.423046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1794701736420393, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.06875, |
| "grad_norm": 32.25, |
| "grad_norm_var": 6.167708333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.3718, |
| "loss/crossentropy": 1.9450767874717712, |
| "loss/hidden": 3.453515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18759301900863648, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.069, |
| "grad_norm": 31.125, |
| "grad_norm_var": 31.185872395833332, |
| "learning_rate": 0.0001, |
| "loss": 7.4359, |
| "loss/crossentropy": 2.0783849939703942, |
| "loss/hidden": 3.334375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18503105416893958, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.06925, |
| "grad_norm": 36.5, |
| "grad_norm_var": 35.412434895833336, |
| "learning_rate": 0.0001, |
| "loss": 7.5806, |
| "loss/crossentropy": 2.2374701410532, |
| "loss/hidden": 3.378125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19615829903632404, |
| "step": 2770 |
| }, |
| { |
| "epoch": 0.0695, |
| "grad_norm": 30.25, |
| "grad_norm_var": 19.787239583333335, |
| "learning_rate": 0.0001, |
| "loss": 7.3197, |
| "loss/crossentropy": 1.8297001466155052, |
| "loss/hidden": 3.3171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16481583826243879, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.06975, |
| "grad_norm": 428.0, |
| "grad_norm_var": 9873.31640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5313, |
| "loss/crossentropy": 2.249661484360695, |
| "loss/hidden": 3.392578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2018596636131406, |
| "step": 2790 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 31.0, |
| "grad_norm_var": 9755.6625, |
| "learning_rate": 0.0001, |
| "loss": 7.3957, |
| "loss/crossentropy": 1.9368772380053998, |
| "loss/hidden": 3.48203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18386599626392125, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.07025, |
| "grad_norm": 30.75, |
| "grad_norm_var": 1.8317057291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4372, |
| "loss/crossentropy": 1.98307463824749, |
| "loss/hidden": 3.464453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19818334747105837, |
| "step": 2810 |
| }, |
| { |
| "epoch": 0.0705, |
| "grad_norm": 29.375, |
| "grad_norm_var": 2.589583333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5014, |
| "loss/crossentropy": 2.1463105253875256, |
| "loss/hidden": 3.5046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20105676222592592, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.07075, |
| "grad_norm": 60.5, |
| "grad_norm_var": 178.2556640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4527, |
| "loss/crossentropy": 2.0776613369584083, |
| "loss/hidden": 3.420703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19452448841184378, |
| "step": 2830 |
| }, |
| { |
| "epoch": 0.071, |
| "grad_norm": 29.25, |
| "grad_norm_var": 172.31451822916668, |
| "learning_rate": 0.0001, |
| "loss": 7.4802, |
| "loss/crossentropy": 2.1200039610266685, |
| "loss/hidden": 3.417578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19831879772245883, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.07125, |
| "grad_norm": 69.0, |
| "grad_norm_var": 117.23098958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.434, |
| "loss/crossentropy": 2.024143140017986, |
| "loss/hidden": 3.348828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1836528332903981, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.0715, |
| "grad_norm": 31.375, |
| "grad_norm_var": 92.53723958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4934, |
| "loss/crossentropy": 2.2765417456626893, |
| "loss/hidden": 3.446484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20736196860671044, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.07175, |
| "grad_norm": 31.625, |
| "grad_norm_var": 7.986393229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4826, |
| "loss/crossentropy": 2.269197002053261, |
| "loss/hidden": 3.404296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19869209118187428, |
| "step": 2870 |
| }, |
| { |
| "epoch": 0.072, |
| "grad_norm": 31.25, |
| "grad_norm_var": 3.1806640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4018, |
| "loss/crossentropy": 2.2985214799642564, |
| "loss/hidden": 3.390234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20524807646870613, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.07225, |
| "grad_norm": 30.875, |
| "grad_norm_var": 4.801822916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5148, |
| "loss/crossentropy": 2.2387808740139006, |
| "loss/hidden": 3.46015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19951685946434736, |
| "step": 2890 |
| }, |
| { |
| "epoch": 0.0725, |
| "grad_norm": 28.875, |
| "grad_norm_var": 13.836458333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5232, |
| "loss/crossentropy": 2.049694790691137, |
| "loss/hidden": 3.410546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19052465092390775, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.07275, |
| "grad_norm": 29.625, |
| "grad_norm_var": 17.91640625, |
| "learning_rate": 0.0001, |
| "loss": 7.3227, |
| "loss/crossentropy": 2.0360258772969244, |
| "loss/hidden": 3.40546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18495636582374572, |
| "step": 2910 |
| }, |
| { |
| "epoch": 0.073, |
| "grad_norm": 32.0, |
| "grad_norm_var": 1.8926377214767268e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4512, |
| "loss/crossentropy": 2.13848315179348, |
| "loss/hidden": 3.3859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18625867497175932, |
| "step": 2920 |
| }, |
| { |
| "epoch": 0.07325, |
| "grad_norm": 29.875, |
| "grad_norm_var": 1.8926377199175642e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.5038, |
| "loss/crossentropy": 2.166595605015755, |
| "loss/hidden": 3.49375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20948194600641729, |
| "step": 2930 |
| }, |
| { |
| "epoch": 0.0735, |
| "grad_norm": 28.5, |
| "grad_norm_var": 73.08020833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.374, |
| "loss/crossentropy": 1.9849643550813199, |
| "loss/hidden": 3.301171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18302082028239966, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.07375, |
| "grad_norm": 29.125, |
| "grad_norm_var": 24.825, |
| "learning_rate": 0.0001, |
| "loss": 7.3651, |
| "loss/crossentropy": 2.057874396443367, |
| "loss/hidden": 3.3609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1866615541279316, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.074, |
| "grad_norm": 30.625, |
| "grad_norm_var": 883.6354166666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5415, |
| "loss/crossentropy": 2.1631729155778885, |
| "loss/hidden": 3.388671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20762786027044058, |
| "step": 2960 |
| }, |
| { |
| "epoch": 0.07425, |
| "grad_norm": 32.75, |
| "grad_norm_var": 887.2705729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4471, |
| "loss/crossentropy": 1.9493468508124352, |
| "loss/hidden": 3.3578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1884406829252839, |
| "step": 2970 |
| }, |
| { |
| "epoch": 0.0745, |
| "grad_norm": 28.875, |
| "grad_norm_var": 5.070768229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.605, |
| "loss/crossentropy": 2.122344336658716, |
| "loss/hidden": 3.460546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21057356838136912, |
| "step": 2980 |
| }, |
| { |
| "epoch": 0.07475, |
| "grad_norm": 37.0, |
| "grad_norm_var": 21.535416666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.469, |
| "loss/crossentropy": 2.008989527821541, |
| "loss/hidden": 3.54140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2172183733433485, |
| "step": 2990 |
| }, |
| { |
| "epoch": 0.075, |
| "grad_norm": 29.375, |
| "grad_norm_var": 18.198958333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.3932, |
| "loss/crossentropy": 2.1922819674015046, |
| "loss/hidden": 3.453515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20425879992544652, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.07525, |
| "grad_norm": 29.5, |
| "grad_norm_var": 2.668684895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.3505, |
| "loss/crossentropy": 2.189265179634094, |
| "loss/hidden": 3.34609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20808048862963915, |
| "step": 3010 |
| }, |
| { |
| "epoch": 0.0755, |
| "grad_norm": 30.75, |
| "grad_norm_var": 14.20625, |
| "learning_rate": 0.0001, |
| "loss": 7.5013, |
| "loss/crossentropy": 2.0573098927736284, |
| "loss/hidden": 3.3515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18116160985082388, |
| "step": 3020 |
| }, |
| { |
| "epoch": 0.07575, |
| "grad_norm": 31.375, |
| "grad_norm_var": 16.983333333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4455, |
| "loss/crossentropy": 1.9735823571681976, |
| "loss/hidden": 3.459765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19495000168681145, |
| "step": 3030 |
| }, |
| { |
| "epoch": 0.076, |
| "grad_norm": 7247757312.0, |
| "grad_norm_var": 3.2831240991582193e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4881, |
| "loss/crossentropy": 1.971890377253294, |
| "loss/hidden": 3.40625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18015608433634042, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.07625, |
| "grad_norm": 28.25, |
| "grad_norm_var": 3.283124098780732e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.3664, |
| "loss/crossentropy": 1.8378953270614147, |
| "loss/hidden": 3.3609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1741427879780531, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.0765, |
| "grad_norm": 31.75, |
| "grad_norm_var": 1.89140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5137, |
| "loss/crossentropy": 2.141886255145073, |
| "loss/hidden": 3.443359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19584037065505983, |
| "step": 3060 |
| }, |
| { |
| "epoch": 0.07675, |
| "grad_norm": 27.25, |
| "grad_norm_var": 2.4244140625, |
| "learning_rate": 0.0001, |
| "loss": 7.4296, |
| "loss/crossentropy": 2.0373554110527037, |
| "loss/hidden": 3.5640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.216986732929945, |
| "step": 3070 |
| }, |
| { |
| "epoch": 0.077, |
| "grad_norm": 35.25, |
| "grad_norm_var": 3.7322265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5269, |
| "loss/crossentropy": 1.975497831404209, |
| "loss/hidden": 3.333984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1780722170136869, |
| "step": 3080 |
| }, |
| { |
| "epoch": 0.07725, |
| "grad_norm": 32.75, |
| "grad_norm_var": 3.6895182291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4938, |
| "loss/crossentropy": 2.151789793372154, |
| "loss/hidden": 3.502734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21854450944811105, |
| "step": 3090 |
| }, |
| { |
| "epoch": 0.0775, |
| "grad_norm": 29.5, |
| "grad_norm_var": 6.82265625, |
| "learning_rate": 0.0001, |
| "loss": 7.4321, |
| "loss/crossentropy": 1.9484706297516823, |
| "loss/hidden": 3.506640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19896488767117262, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.07775, |
| "grad_norm": 29.75, |
| "grad_norm_var": 3.0780598958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5471, |
| "loss/crossentropy": 2.165594828128815, |
| "loss/hidden": 3.36796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20095103643834591, |
| "step": 3110 |
| }, |
| { |
| "epoch": 0.078, |
| "grad_norm": 29.0, |
| "grad_norm_var": 2.2197916666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.6334, |
| "loss/crossentropy": 2.1854751259088516, |
| "loss/hidden": 3.4640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21246263310313224, |
| "step": 3120 |
| }, |
| { |
| "epoch": 0.07825, |
| "grad_norm": 29.0, |
| "grad_norm_var": 3.71640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4278, |
| "loss/crossentropy": 1.914103902876377, |
| "loss/hidden": 3.451953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18373754434287548, |
| "step": 3130 |
| }, |
| { |
| "epoch": 0.0785, |
| "grad_norm": 29.0, |
| "grad_norm_var": 1.2952473958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4487, |
| "loss/crossentropy": 1.9421842776238918, |
| "loss/hidden": 3.5296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19919300880283117, |
| "step": 3140 |
| }, |
| { |
| "epoch": 0.07875, |
| "grad_norm": 29.375, |
| "grad_norm_var": 1.8268229166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5818, |
| "loss/crossentropy": 2.0765694811940194, |
| "loss/hidden": 3.5171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19946561977267266, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.079, |
| "grad_norm": 28.125, |
| "grad_norm_var": 11.483268229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4372, |
| "loss/crossentropy": 2.013955050334334, |
| "loss/hidden": 3.4078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20109358858317136, |
| "step": 3160 |
| }, |
| { |
| "epoch": 0.07925, |
| "grad_norm": 28.875, |
| "grad_norm_var": 12.871809895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.4606, |
| "loss/crossentropy": 2.2802242666482924, |
| "loss/hidden": 3.423046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21229397617280482, |
| "step": 3170 |
| }, |
| { |
| "epoch": 0.0795, |
| "grad_norm": 28.375, |
| "grad_norm_var": 1.6301432291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4691, |
| "loss/crossentropy": 2.134338477253914, |
| "loss/hidden": 3.404296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18632632456719875, |
| "step": 3180 |
| }, |
| { |
| "epoch": 0.07975, |
| "grad_norm": 30.625, |
| "grad_norm_var": 2.6113932291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4903, |
| "loss/crossentropy": 2.192245528101921, |
| "loss/hidden": 3.381640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19276445377618073, |
| "step": 3190 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 27.875, |
| "grad_norm_var": 2.6830729166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4715, |
| "loss/crossentropy": 2.1333388604223726, |
| "loss/hidden": 3.376953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1902673264965415, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.08025, |
| "grad_norm": 29.625, |
| "grad_norm_var": 2.7072265625, |
| "learning_rate": 0.0001, |
| "loss": 7.4646, |
| "loss/crossentropy": 2.1069626569747926, |
| "loss/hidden": 3.374609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18933899328112602, |
| "step": 3210 |
| }, |
| { |
| "epoch": 0.0805, |
| "grad_norm": 33.0, |
| "grad_norm_var": 1.6457682291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.3771, |
| "loss/crossentropy": 2.143903985619545, |
| "loss/hidden": 3.34921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19841080270707606, |
| "step": 3220 |
| }, |
| { |
| "epoch": 0.08075, |
| "grad_norm": 29.5, |
| "grad_norm_var": 2.405143229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4629, |
| "loss/crossentropy": 1.9501185864210129, |
| "loss/hidden": 3.474609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2003694986924529, |
| "step": 3230 |
| }, |
| { |
| "epoch": 0.081, |
| "grad_norm": 35.0, |
| "grad_norm_var": 3.4619140625, |
| "learning_rate": 0.0001, |
| "loss": 7.6085, |
| "loss/crossentropy": 2.1099744185805323, |
| "loss/hidden": 3.33515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1865939747542143, |
| "step": 3240 |
| }, |
| { |
| "epoch": 0.08125, |
| "grad_norm": 38.0, |
| "grad_norm_var": 15.54140625, |
| "learning_rate": 0.0001, |
| "loss": 7.4858, |
| "loss/crossentropy": 1.8915734700858593, |
| "loss/hidden": 3.550390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20414282865822314, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.0815, |
| "grad_norm": 31.875, |
| "grad_norm_var": 15.074934895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.4995, |
| "loss/crossentropy": 2.0746393710374833, |
| "loss/hidden": 3.448046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19025763403624296, |
| "step": 3260 |
| }, |
| { |
| "epoch": 0.08175, |
| "grad_norm": 29.625, |
| "grad_norm_var": 4.532291666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4517, |
| "loss/crossentropy": 2.201898355782032, |
| "loss/hidden": 3.38515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1851862959563732, |
| "step": 3270 |
| }, |
| { |
| "epoch": 0.082, |
| "grad_norm": 32.25, |
| "grad_norm_var": 9.199739583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4085, |
| "loss/crossentropy": 1.9774614453315735, |
| "loss/hidden": 3.5078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1853517958894372, |
| "step": 3280 |
| }, |
| { |
| "epoch": 0.08225, |
| "grad_norm": 31.0, |
| "grad_norm_var": 13.801497395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.4065, |
| "loss/crossentropy": 2.1263367265462874, |
| "loss/hidden": 3.412109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18529028967022895, |
| "step": 3290 |
| }, |
| { |
| "epoch": 0.0825, |
| "grad_norm": 29.5, |
| "grad_norm_var": 2.967643229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4165, |
| "loss/crossentropy": 2.193544697761536, |
| "loss/hidden": 3.350390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19897244460880756, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.08275, |
| "grad_norm": 33.75, |
| "grad_norm_var": 9.687239583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5716, |
| "loss/crossentropy": 2.0868531957268717, |
| "loss/hidden": 3.616796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21278488002717494, |
| "step": 3310 |
| }, |
| { |
| "epoch": 0.083, |
| "grad_norm": 31.0, |
| "grad_norm_var": 7.9478515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5543, |
| "loss/crossentropy": 2.1392074063420297, |
| "loss/hidden": 3.395703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21113577168434858, |
| "step": 3320 |
| }, |
| { |
| "epoch": 0.08325, |
| "grad_norm": 30.0, |
| "grad_norm_var": 2.0268229166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4454, |
| "loss/crossentropy": 2.0691144198179243, |
| "loss/hidden": 3.341796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20186964478343725, |
| "step": 3330 |
| }, |
| { |
| "epoch": 0.0835, |
| "grad_norm": 31.5, |
| "grad_norm_var": 2.6211653265769103e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4481, |
| "loss/crossentropy": 2.0832756504416468, |
| "loss/hidden": 3.423046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19915037509053946, |
| "step": 3340 |
| }, |
| { |
| "epoch": 0.08375, |
| "grad_norm": 32.5, |
| "grad_norm_var": 2.621165324337292e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.3606, |
| "loss/crossentropy": 2.102260760962963, |
| "loss/hidden": 3.372265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19333885367959738, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.084, |
| "grad_norm": 29.25, |
| "grad_norm_var": 85.575, |
| "learning_rate": 0.0001, |
| "loss": 7.4073, |
| "loss/crossentropy": 2.149528594315052, |
| "loss/hidden": 3.491796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2177526842802763, |
| "step": 3360 |
| }, |
| { |
| "epoch": 0.08425, |
| "grad_norm": 30.25, |
| "grad_norm_var": 2.8645833333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.4642, |
| "loss/crossentropy": 2.085590344667435, |
| "loss/hidden": 3.330859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17804578468203544, |
| "step": 3370 |
| }, |
| { |
| "epoch": 0.0845, |
| "grad_norm": 33.25, |
| "grad_norm_var": 2.996875, |
| "learning_rate": 0.0001, |
| "loss": 7.3953, |
| "loss/crossentropy": 2.0975965946912765, |
| "loss/hidden": 3.3, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1847201505675912, |
| "step": 3380 |
| }, |
| { |
| "epoch": 0.08475, |
| "grad_norm": 32.0, |
| "grad_norm_var": 2.470572916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4553, |
| "loss/crossentropy": 2.1018140748143197, |
| "loss/hidden": 3.36796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18250287007540464, |
| "step": 3390 |
| }, |
| { |
| "epoch": 0.085, |
| "grad_norm": 30.25, |
| "grad_norm_var": 2.887955729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5238, |
| "loss/crossentropy": 2.1050665065646172, |
| "loss/hidden": 3.501953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2124734738841653, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.08525, |
| "grad_norm": 30.0, |
| "grad_norm_var": 1.7143229166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.2754, |
| "loss/crossentropy": 2.0948296964168547, |
| "loss/hidden": 3.37421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.178215317055583, |
| "step": 3410 |
| }, |
| { |
| "epoch": 0.0855, |
| "grad_norm": 30.875, |
| "grad_norm_var": 5.354622395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4191, |
| "loss/crossentropy": 2.0418393671512605, |
| "loss/hidden": 3.4734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18740264605730772, |
| "step": 3420 |
| }, |
| { |
| "epoch": 0.08575, |
| "grad_norm": 32.25, |
| "grad_norm_var": 6.430989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5642, |
| "loss/crossentropy": 2.0279636546969413, |
| "loss/hidden": 3.55859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20154636316001415, |
| "step": 3430 |
| }, |
| { |
| "epoch": 0.086, |
| "grad_norm": 29.125, |
| "grad_norm_var": 53.64791666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.485, |
| "loss/crossentropy": 2.0705729112029077, |
| "loss/hidden": 3.456640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22035282999277114, |
| "step": 3440 |
| }, |
| { |
| "epoch": 0.08625, |
| "grad_norm": 30.375, |
| "grad_norm_var": 5.54765625, |
| "learning_rate": 0.0001, |
| "loss": 7.428, |
| "loss/crossentropy": 1.9830067940056324, |
| "loss/hidden": 3.376171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19354272997006775, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.0865, |
| "grad_norm": 28.375, |
| "grad_norm_var": 2.758736220726598e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4342, |
| "loss/crossentropy": 2.1590976014733316, |
| "loss/hidden": 3.489453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20231232214719058, |
| "step": 3460 |
| }, |
| { |
| "epoch": 0.08675, |
| "grad_norm": 29.125, |
| "grad_norm_var": 2.470572916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.3376, |
| "loss/crossentropy": 2.108407254517078, |
| "loss/hidden": 3.416796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18425025548785925, |
| "step": 3470 |
| }, |
| { |
| "epoch": 0.087, |
| "grad_norm": 32.5, |
| "grad_norm_var": 19.315559895833335, |
| "learning_rate": 0.0001, |
| "loss": 7.391, |
| "loss/crossentropy": 2.086346108466387, |
| "loss/hidden": 3.380859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19492445401847364, |
| "step": 3480 |
| }, |
| { |
| "epoch": 0.08725, |
| "grad_norm": 30.75, |
| "grad_norm_var": 3.9009765625, |
| "learning_rate": 0.0001, |
| "loss": 7.454, |
| "loss/crossentropy": 2.0728737086057665, |
| "loss/hidden": 3.474609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21246139723807572, |
| "step": 3490 |
| }, |
| { |
| "epoch": 0.0875, |
| "grad_norm": 53.25, |
| "grad_norm_var": 34.962955729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.4001, |
| "loss/crossentropy": 1.9173476293683052, |
| "loss/hidden": 3.330859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18263984741643072, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.08775, |
| "grad_norm": 29.875, |
| "grad_norm_var": 36.22389322916667, |
| "learning_rate": 0.0001, |
| "loss": 7.5855, |
| "loss/crossentropy": 1.9761252515017986, |
| "loss/hidden": 3.391015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20959299746900797, |
| "step": 3510 |
| }, |
| { |
| "epoch": 0.088, |
| "grad_norm": 32.25, |
| "grad_norm_var": 17.7337890625, |
| "learning_rate": 0.0001, |
| "loss": 7.4728, |
| "loss/crossentropy": 2.0416554152965545, |
| "loss/hidden": 3.4703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19014756735414268, |
| "step": 3520 |
| }, |
| { |
| "epoch": 0.08825, |
| "grad_norm": 29.375, |
| "grad_norm_var": 14.664322916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5608, |
| "loss/crossentropy": 2.2834356099367143, |
| "loss/hidden": 3.3953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19908843878656626, |
| "step": 3530 |
| }, |
| { |
| "epoch": 0.0885, |
| "grad_norm": 31.875, |
| "grad_norm_var": 2.6702473958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.4804, |
| "loss/crossentropy": 2.0417330890893934, |
| "loss/hidden": 3.46875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20852382443845272, |
| "step": 3540 |
| }, |
| { |
| "epoch": 0.08875, |
| "grad_norm": 31.625, |
| "grad_norm_var": 2.460724589971584e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.5559, |
| "loss/crossentropy": 2.1676768481731417, |
| "loss/hidden": 3.394921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1989177169278264, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.089, |
| "grad_norm": 30.0, |
| "grad_norm_var": 6.881705729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4678, |
| "loss/crossentropy": 2.221273897588253, |
| "loss/hidden": 3.4, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19402988757938147, |
| "step": 3560 |
| }, |
| { |
| "epoch": 0.08925, |
| "grad_norm": 31.375, |
| "grad_norm_var": 7.732747395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4508, |
| "loss/crossentropy": 2.1802149415016174, |
| "loss/hidden": 3.43828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20121808685362338, |
| "step": 3570 |
| }, |
| { |
| "epoch": 0.0895, |
| "grad_norm": 52.5, |
| "grad_norm_var": 30.9775390625, |
| "learning_rate": 0.0001, |
| "loss": 7.3982, |
| "loss/crossentropy": 2.085124118626118, |
| "loss/hidden": 3.40390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18448642026633025, |
| "step": 3580 |
| }, |
| { |
| "epoch": 0.08975, |
| "grad_norm": 30.875, |
| "grad_norm_var": 32.91295572916667, |
| "learning_rate": 0.0001, |
| "loss": 7.4381, |
| "loss/crossentropy": 2.1467449337244036, |
| "loss/hidden": 3.3734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19393185302615165, |
| "step": 3590 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 29.25, |
| "grad_norm_var": 1.4708333333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.415, |
| "loss/crossentropy": 2.0135369554162024, |
| "loss/hidden": 3.37109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18443848174065353, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.09025, |
| "grad_norm": 31.375, |
| "grad_norm_var": 6.1962890625, |
| "learning_rate": 0.0001, |
| "loss": 7.4028, |
| "loss/crossentropy": 2.1443901300430297, |
| "loss/hidden": 3.440234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2054579086601734, |
| "step": 3610 |
| }, |
| { |
| "epoch": 0.0905, |
| "grad_norm": 26.5, |
| "grad_norm_var": 3.562239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.3255, |
| "loss/crossentropy": 1.799356396496296, |
| "loss/hidden": 3.36953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17441922090947629, |
| "step": 3620 |
| }, |
| { |
| "epoch": 0.09075, |
| "grad_norm": 31.375, |
| "grad_norm_var": 2.2083333333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.4272, |
| "loss/crossentropy": 1.9925116747617722, |
| "loss/hidden": 3.52578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21653544921427964, |
| "step": 3630 |
| }, |
| { |
| "epoch": 0.091, |
| "grad_norm": 30.125, |
| "grad_norm_var": 0.6125, |
| "learning_rate": 0.0001, |
| "loss": 7.3649, |
| "loss/crossentropy": 2.135761073231697, |
| "loss/hidden": 3.4140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18989351522177458, |
| "step": 3640 |
| }, |
| { |
| "epoch": 0.09125, |
| "grad_norm": 31.375, |
| "grad_norm_var": 1.4330729166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4505, |
| "loss/crossentropy": 2.0986070543527604, |
| "loss/hidden": 3.334375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18352905213832854, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.0915, |
| "grad_norm": 29.625, |
| "grad_norm_var": 2.5869140625, |
| "learning_rate": 0.0001, |
| "loss": 7.4199, |
| "loss/crossentropy": 2.1555575743317603, |
| "loss/hidden": 3.402734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19261632524430752, |
| "step": 3660 |
| }, |
| { |
| "epoch": 0.09175, |
| "grad_norm": 31.5, |
| "grad_norm_var": 2.371875, |
| "learning_rate": 0.0001, |
| "loss": 7.5463, |
| "loss/crossentropy": 2.1411691516637803, |
| "loss/hidden": 3.446875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2046652188524604, |
| "step": 3670 |
| }, |
| { |
| "epoch": 0.092, |
| "grad_norm": 30.625, |
| "grad_norm_var": 4.703580729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.404, |
| "loss/crossentropy": 2.142404294013977, |
| "loss/hidden": 3.445703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20414466112852098, |
| "step": 3680 |
| }, |
| { |
| "epoch": 0.09225, |
| "grad_norm": 30.375, |
| "grad_norm_var": 3.25625, |
| "learning_rate": 0.0001, |
| "loss": 7.4774, |
| "loss/crossentropy": 2.187901920080185, |
| "loss/hidden": 3.480859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21911972090601922, |
| "step": 3690 |
| }, |
| { |
| "epoch": 0.0925, |
| "grad_norm": 31.875, |
| "grad_norm_var": 1.2166666666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5965, |
| "loss/crossentropy": 2.086391404271126, |
| "loss/hidden": 3.438671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2020766455680132, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.09275, |
| "grad_norm": 30.625, |
| "grad_norm_var": 2.147330729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4579, |
| "loss/crossentropy": 2.09081457182765, |
| "loss/hidden": 3.369140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1868050311692059, |
| "step": 3710 |
| }, |
| { |
| "epoch": 0.093, |
| "grad_norm": 34.25, |
| "grad_norm_var": 2.467643229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.522, |
| "loss/crossentropy": 2.12264247238636, |
| "loss/hidden": 3.453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18927707765251398, |
| "step": 3720 |
| }, |
| { |
| "epoch": 0.09325, |
| "grad_norm": 32.25, |
| "grad_norm_var": 3.981184895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4155, |
| "loss/crossentropy": 2.1118928104639054, |
| "loss/hidden": 3.44765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19489197488874196, |
| "step": 3730 |
| }, |
| { |
| "epoch": 0.0935, |
| "grad_norm": 34.0, |
| "grad_norm_var": 5.312434895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5053, |
| "loss/crossentropy": 2.1360882744193077, |
| "loss/hidden": 3.426171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19313989579677582, |
| "step": 3740 |
| }, |
| { |
| "epoch": 0.09375, |
| "grad_norm": 29.125, |
| "grad_norm_var": 4.549739583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.3275, |
| "loss/crossentropy": 2.010613538324833, |
| "loss/hidden": 3.355859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18421147018671036, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.094, |
| "grad_norm": 31.625, |
| "grad_norm_var": 1.5541666666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4784, |
| "loss/crossentropy": 2.1465295113623144, |
| "loss/hidden": 3.323828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18987073097378016, |
| "step": 3760 |
| }, |
| { |
| "epoch": 0.09425, |
| "grad_norm": 32.75, |
| "grad_norm_var": 1.9018229166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.3495, |
| "loss/crossentropy": 2.17747982442379, |
| "loss/hidden": 3.48046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2016214355826378, |
| "step": 3770 |
| }, |
| { |
| "epoch": 0.0945, |
| "grad_norm": 30.875, |
| "grad_norm_var": 3.088997395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5384, |
| "loss/crossentropy": 2.179350584745407, |
| "loss/hidden": 3.350390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19142594784498215, |
| "step": 3780 |
| }, |
| { |
| "epoch": 0.09475, |
| "grad_norm": 29.625, |
| "grad_norm_var": 1.1559895833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4035, |
| "loss/crossentropy": 2.155378046631813, |
| "loss/hidden": 3.32109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19720839541405438, |
| "step": 3790 |
| }, |
| { |
| "epoch": 0.095, |
| "grad_norm": 30.625, |
| "grad_norm_var": 1.1999348958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4441, |
| "loss/crossentropy": 2.0597486779093743, |
| "loss/hidden": 3.413671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19279775265604257, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.09525, |
| "grad_norm": 33.5, |
| "grad_norm_var": 2.1666666666666665, |
| "learning_rate": 0.0001, |
| "loss": 7.5146, |
| "loss/crossentropy": 2.1966816753149034, |
| "loss/hidden": 3.44375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20174810625612735, |
| "step": 3810 |
| }, |
| { |
| "epoch": 0.0955, |
| "grad_norm": 31.5, |
| "grad_norm_var": 1.9593098958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.539, |
| "loss/crossentropy": 2.165803623199463, |
| "loss/hidden": 3.34375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1953417781740427, |
| "step": 3820 |
| }, |
| { |
| "epoch": 0.09575, |
| "grad_norm": 32.0, |
| "grad_norm_var": 6.690625, |
| "learning_rate": 0.0001, |
| "loss": 7.514, |
| "loss/crossentropy": 2.0817860513925552, |
| "loss/hidden": 3.453515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20838446952402592, |
| "step": 3830 |
| }, |
| { |
| "epoch": 0.096, |
| "grad_norm": 32.75, |
| "grad_norm_var": 7.6431640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5472, |
| "loss/crossentropy": 2.231910442560911, |
| "loss/hidden": 3.442578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21717903479002415, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.09625, |
| "grad_norm": 32.0, |
| "grad_norm_var": 16.134375, |
| "learning_rate": 0.0001, |
| "loss": 7.5807, |
| "loss/crossentropy": 2.0746277555823327, |
| "loss/hidden": 3.47578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20851925816386938, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.0965, |
| "grad_norm": 30.625, |
| "grad_norm_var": 16.132747395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.3749, |
| "loss/crossentropy": 2.1463438466191294, |
| "loss/hidden": 3.356640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19305863380432128, |
| "step": 3860 |
| }, |
| { |
| "epoch": 0.09675, |
| "grad_norm": 32.5, |
| "grad_norm_var": 1.0895182291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5499, |
| "loss/crossentropy": 2.2108413323760034, |
| "loss/hidden": 3.42734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20310868676751853, |
| "step": 3870 |
| }, |
| { |
| "epoch": 0.097, |
| "grad_norm": 30.75, |
| "grad_norm_var": 1.4559895833333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4788, |
| "loss/crossentropy": 2.0900154620409013, |
| "loss/hidden": 3.42734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18780422061681748, |
| "step": 3880 |
| }, |
| { |
| "epoch": 0.09725, |
| "grad_norm": 30.625, |
| "grad_norm_var": 13.917643229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.4391, |
| "loss/crossentropy": 2.0574848279356956, |
| "loss/hidden": 3.3875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19390027467161416, |
| "step": 3890 |
| }, |
| { |
| "epoch": 0.0975, |
| "grad_norm": 27.375, |
| "grad_norm_var": 13.55, |
| "learning_rate": 0.0001, |
| "loss": 7.4327, |
| "loss/crossentropy": 2.2832688719034193, |
| "loss/hidden": 3.3796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20608801003545524, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.09775, |
| "grad_norm": 29.0, |
| "grad_norm_var": 3.296875, |
| "learning_rate": 0.0001, |
| "loss": 7.3691, |
| "loss/crossentropy": 1.9183307077735663, |
| "loss/hidden": 3.37890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1917601386550814, |
| "step": 3910 |
| }, |
| { |
| "epoch": 0.098, |
| "grad_norm": 34.0, |
| "grad_norm_var": 3.24765625, |
| "learning_rate": 0.0001, |
| "loss": 7.4628, |
| "loss/crossentropy": 2.0630046002566815, |
| "loss/hidden": 3.338671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1871832549571991, |
| "step": 3920 |
| }, |
| { |
| "epoch": 0.09825, |
| "grad_norm": 31.75, |
| "grad_norm_var": 1.5384765625, |
| "learning_rate": 0.0001, |
| "loss": 7.4868, |
| "loss/crossentropy": 2.061261148750782, |
| "loss/hidden": 3.415234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18525551967322826, |
| "step": 3930 |
| }, |
| { |
| "epoch": 0.0985, |
| "grad_norm": 29.75, |
| "grad_norm_var": 1.584375, |
| "learning_rate": 0.0001, |
| "loss": 7.5498, |
| "loss/crossentropy": 2.0895790114998816, |
| "loss/hidden": 3.409375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1932330032810569, |
| "step": 3940 |
| }, |
| { |
| "epoch": 0.09875, |
| "grad_norm": 30.625, |
| "grad_norm_var": 25.79765625, |
| "learning_rate": 0.0001, |
| "loss": 7.6502, |
| "loss/crossentropy": 2.1616804771125318, |
| "loss/hidden": 3.365625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18905209768563508, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.099, |
| "grad_norm": 30.5, |
| "grad_norm_var": 28.547916666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.3334, |
| "loss/crossentropy": 2.1435488507151605, |
| "loss/hidden": 3.397265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1910943292081356, |
| "step": 3960 |
| }, |
| { |
| "epoch": 0.09925, |
| "grad_norm": 32.75, |
| "grad_norm_var": 6.3650390625, |
| "learning_rate": 0.0001, |
| "loss": 7.542, |
| "loss/crossentropy": 2.176460310816765, |
| "loss/hidden": 3.41328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18821860365569593, |
| "step": 3970 |
| }, |
| { |
| "epoch": 0.0995, |
| "grad_norm": 31.625, |
| "grad_norm_var": 3.9905598958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5231, |
| "loss/crossentropy": 2.2077176332473756, |
| "loss/hidden": 3.4515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21911400128155947, |
| "step": 3980 |
| }, |
| { |
| "epoch": 0.09975, |
| "grad_norm": 31.125, |
| "grad_norm_var": 1.75625, |
| "learning_rate": 0.0001, |
| "loss": 7.4868, |
| "loss/crossentropy": 2.105836200714111, |
| "loss/hidden": 3.36953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1997914554551244, |
| "step": 3990 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 38.0, |
| "grad_norm_var": 4.710416666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5675, |
| "loss/crossentropy": 2.233233967423439, |
| "loss/hidden": 3.401953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20876242108643056, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.10025, |
| "grad_norm": 28.625, |
| "grad_norm_var": 7.56640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4736, |
| "loss/crossentropy": 2.103509198874235, |
| "loss/hidden": 3.413671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1953927006572485, |
| "step": 4010 |
| }, |
| { |
| "epoch": 0.1005, |
| "grad_norm": 28.875, |
| "grad_norm_var": 4.119791666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4509, |
| "loss/crossentropy": 1.9697775058448315, |
| "loss/hidden": 3.308984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17186311883851885, |
| "step": 4020 |
| }, |
| { |
| "epoch": 0.10075, |
| "grad_norm": 29.5, |
| "grad_norm_var": 1.3177083333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.333, |
| "loss/crossentropy": 2.0519870311021804, |
| "loss/hidden": 3.42421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1872571600601077, |
| "step": 4030 |
| }, |
| { |
| "epoch": 0.101, |
| "grad_norm": 29.5, |
| "grad_norm_var": 1.2785807291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.3466, |
| "loss/crossentropy": 2.0663713179528713, |
| "loss/hidden": 3.39921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18582073990255593, |
| "step": 4040 |
| }, |
| { |
| "epoch": 0.10125, |
| "grad_norm": 30.375, |
| "grad_norm_var": 1.9577473958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.3812, |
| "loss/crossentropy": 2.1256399258971213, |
| "loss/hidden": 3.30390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19628962082788348, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.1015, |
| "grad_norm": 30.625, |
| "grad_norm_var": 0.53125, |
| "learning_rate": 0.0001, |
| "loss": 7.3726, |
| "loss/crossentropy": 2.1235328309237955, |
| "loss/hidden": 3.3703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18646292947232723, |
| "step": 4060 |
| }, |
| { |
| "epoch": 0.10175, |
| "grad_norm": 29.0, |
| "grad_norm_var": 3.19255952647709e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4564, |
| "loss/crossentropy": 2.0213126331567763, |
| "loss/hidden": 3.496875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19607899691909553, |
| "step": 4070 |
| }, |
| { |
| "epoch": 0.102, |
| "grad_norm": 28.75, |
| "grad_norm_var": 3.48515625, |
| "learning_rate": 0.0001, |
| "loss": 7.3886, |
| "loss/crossentropy": 2.0899658009409903, |
| "loss/hidden": 3.340625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1851665174588561, |
| "step": 4080 |
| }, |
| { |
| "epoch": 0.10225, |
| "grad_norm": 29.5, |
| "grad_norm_var": 1.8692057291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4838, |
| "loss/crossentropy": 2.027493818849325, |
| "loss/hidden": 3.49765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19640162959694862, |
| "step": 4090 |
| }, |
| { |
| "epoch": 0.1025, |
| "grad_norm": 29.125, |
| "grad_norm_var": 11.762434895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5099, |
| "loss/crossentropy": 2.056584618985653, |
| "loss/hidden": 3.32265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17638762388378382, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.10275, |
| "grad_norm": 30.125, |
| "grad_norm_var": 12.459375, |
| "learning_rate": 0.0001, |
| "loss": 7.5255, |
| "loss/crossentropy": 2.0713445380330087, |
| "loss/hidden": 3.36953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18587317056953906, |
| "step": 4110 |
| }, |
| { |
| "epoch": 0.103, |
| "grad_norm": 33.25, |
| "grad_norm_var": 1.9958333333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4437, |
| "loss/crossentropy": 2.2338072419166566, |
| "loss/hidden": 3.36171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18814200926572083, |
| "step": 4120 |
| }, |
| { |
| "epoch": 0.10325, |
| "grad_norm": 32.75, |
| "grad_norm_var": 3.1259765625, |
| "learning_rate": 0.0001, |
| "loss": 7.3184, |
| "loss/crossentropy": 2.0210259817540646, |
| "loss/hidden": 3.3671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18816483654081823, |
| "step": 4130 |
| }, |
| { |
| "epoch": 0.1035, |
| "grad_norm": 29.5, |
| "grad_norm_var": 2.870247395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5124, |
| "loss/crossentropy": 2.0151045128703116, |
| "loss/hidden": 3.371484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19255878478288652, |
| "step": 4140 |
| }, |
| { |
| "epoch": 0.10375, |
| "grad_norm": 30.625, |
| "grad_norm_var": 1.3926432291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5096, |
| "loss/crossentropy": 1.9808883003890514, |
| "loss/hidden": 3.449609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19115560222417116, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.104, |
| "grad_norm": 30.75, |
| "grad_norm_var": 1.6979166666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.549, |
| "loss/crossentropy": 2.1932784736156465, |
| "loss/hidden": 3.39765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20479805655777455, |
| "step": 4160 |
| }, |
| { |
| "epoch": 0.10425, |
| "grad_norm": 30.125, |
| "grad_norm_var": 2.3333333333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.3875, |
| "loss/crossentropy": 1.8820222720503808, |
| "loss/hidden": 3.337109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17310038600116967, |
| "step": 4170 |
| }, |
| { |
| "epoch": 0.1045, |
| "grad_norm": 33.0, |
| "grad_norm_var": 3.7728515625, |
| "learning_rate": 0.0001, |
| "loss": 7.4212, |
| "loss/crossentropy": 2.082476270198822, |
| "loss/hidden": 3.334765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19099258184432982, |
| "step": 4180 |
| }, |
| { |
| "epoch": 0.10475, |
| "grad_norm": 30.875, |
| "grad_norm_var": 11.408268229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4991, |
| "loss/crossentropy": 2.287242355942726, |
| "loss/hidden": 3.375390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1982285875827074, |
| "step": 4190 |
| }, |
| { |
| "epoch": 0.105, |
| "grad_norm": 28.75, |
| "grad_norm_var": 2.999739583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5959, |
| "loss/crossentropy": 2.1783332407474516, |
| "loss/hidden": 3.422265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2117959801107645, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.10525, |
| "grad_norm": 30.0, |
| "grad_norm_var": 4.708268229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.3363, |
| "loss/crossentropy": 1.955865352600813, |
| "loss/hidden": 3.409375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18177355360239744, |
| "step": 4210 |
| }, |
| { |
| "epoch": 0.1055, |
| "grad_norm": 30.625, |
| "grad_norm_var": 3.0254557291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4673, |
| "loss/crossentropy": 1.833389012515545, |
| "loss/hidden": 3.394921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1878132861107588, |
| "step": 4220 |
| }, |
| { |
| "epoch": 0.10575, |
| "grad_norm": 32.0, |
| "grad_norm_var": 3.05, |
| "learning_rate": 0.0001, |
| "loss": 7.3969, |
| "loss/crossentropy": 1.9096243590116502, |
| "loss/hidden": 3.35625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17025592969730496, |
| "step": 4230 |
| }, |
| { |
| "epoch": 0.106, |
| "grad_norm": 30.875, |
| "grad_norm_var": 1.82265625, |
| "learning_rate": 0.0001, |
| "loss": 7.4638, |
| "loss/crossentropy": 2.0454175233840943, |
| "loss/hidden": 3.436328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20515710916370153, |
| "step": 4240 |
| }, |
| { |
| "epoch": 0.10625, |
| "grad_norm": 30.125, |
| "grad_norm_var": 3.1333333333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5126, |
| "loss/crossentropy": 2.089062933623791, |
| "loss/hidden": 3.4328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19156677946448325, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.1065, |
| "grad_norm": 29.0, |
| "grad_norm_var": 4.311393229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4468, |
| "loss/crossentropy": 2.0564094200730323, |
| "loss/hidden": 3.433984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19553639348596336, |
| "step": 4260 |
| }, |
| { |
| "epoch": 0.10675, |
| "grad_norm": 32.0, |
| "grad_norm_var": 3.2587890625, |
| "learning_rate": 0.0001, |
| "loss": 7.4186, |
| "loss/crossentropy": 2.13806764036417, |
| "loss/hidden": 3.3859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19822277761995793, |
| "step": 4270 |
| }, |
| { |
| "epoch": 0.107, |
| "grad_norm": 28.0, |
| "grad_norm_var": 1.6926432291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4595, |
| "loss/crossentropy": 2.0767486467957497, |
| "loss/hidden": 3.41953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1884168043732643, |
| "step": 4280 |
| }, |
| { |
| "epoch": 0.10725, |
| "grad_norm": 33.0, |
| "grad_norm_var": 2.3059895833333335, |
| "learning_rate": 0.0001, |
| "loss": 7.4481, |
| "loss/crossentropy": 2.033916361629963, |
| "loss/hidden": 3.45, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20558829829096795, |
| "step": 4290 |
| }, |
| { |
| "epoch": 0.1075, |
| "grad_norm": 31.0, |
| "grad_norm_var": 2.9375, |
| "learning_rate": 0.0001, |
| "loss": 7.4871, |
| "loss/crossentropy": 2.078028707951307, |
| "loss/hidden": 3.37578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.188079852424562, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.10775, |
| "grad_norm": 33.25, |
| "grad_norm_var": 2.1020833333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5379, |
| "loss/crossentropy": 2.003500834107399, |
| "loss/hidden": 3.544921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20521650360897184, |
| "step": 4310 |
| }, |
| { |
| "epoch": 0.108, |
| "grad_norm": 29.625, |
| "grad_norm_var": 2.8447916666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.3536, |
| "loss/crossentropy": 2.043112625181675, |
| "loss/hidden": 3.380859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19910661596804857, |
| "step": 4320 |
| }, |
| { |
| "epoch": 0.10825, |
| "grad_norm": 28.5, |
| "grad_norm_var": 4.000455729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.3717, |
| "loss/crossentropy": 2.1422011658549307, |
| "loss/hidden": 3.38671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19375871792435645, |
| "step": 4330 |
| }, |
| { |
| "epoch": 0.1085, |
| "grad_norm": 29.0, |
| "grad_norm_var": 3.6259765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5021, |
| "loss/crossentropy": 2.131446525454521, |
| "loss/hidden": 3.480078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2063008865341544, |
| "step": 4340 |
| }, |
| { |
| "epoch": 0.10875, |
| "grad_norm": 32.0, |
| "grad_norm_var": 5.9525390625, |
| "learning_rate": 0.0001, |
| "loss": 7.4749, |
| "loss/crossentropy": 2.085691845417023, |
| "loss/hidden": 3.359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1889802658930421, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.109, |
| "grad_norm": 30.75, |
| "grad_norm_var": 3.154166666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.3816, |
| "loss/crossentropy": 1.8972876839339734, |
| "loss/hidden": 3.319140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17174729090183974, |
| "step": 4360 |
| }, |
| { |
| "epoch": 0.10925, |
| "grad_norm": 29.875, |
| "grad_norm_var": 1.7509765625, |
| "learning_rate": 0.0001, |
| "loss": 7.4444, |
| "loss/crossentropy": 2.127763804793358, |
| "loss/hidden": 3.401953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18679574280977249, |
| "step": 4370 |
| }, |
| { |
| "epoch": 0.1095, |
| "grad_norm": 29.875, |
| "grad_norm_var": 2.16015625, |
| "learning_rate": 0.0001, |
| "loss": 7.4682, |
| "loss/crossentropy": 2.1872297644615175, |
| "loss/hidden": 3.31328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.183891461789608, |
| "step": 4380 |
| }, |
| { |
| "epoch": 0.10975, |
| "grad_norm": 28.875, |
| "grad_norm_var": 3.3692057291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.429, |
| "loss/crossentropy": 2.19267495572567, |
| "loss/hidden": 3.394140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20111876968294382, |
| "step": 4390 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 29.375, |
| "grad_norm_var": 1.6858723958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.556, |
| "loss/crossentropy": 2.1324411287903784, |
| "loss/hidden": 3.4359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2090261412784457, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.11025, |
| "grad_norm": 33.5, |
| "grad_norm_var": 3.374739583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4081, |
| "loss/crossentropy": 1.9800483137369156, |
| "loss/hidden": 3.584375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19881114605814218, |
| "step": 4410 |
| }, |
| { |
| "epoch": 0.1105, |
| "grad_norm": 31.75, |
| "grad_norm_var": 4.13515625, |
| "learning_rate": 0.0001, |
| "loss": 7.4904, |
| "loss/crossentropy": 2.053773292154074, |
| "loss/hidden": 3.323828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18270381446927786, |
| "step": 4420 |
| }, |
| { |
| "epoch": 0.11075, |
| "grad_norm": 30.75, |
| "grad_norm_var": 2.3499348958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.4509, |
| "loss/crossentropy": 2.0641689248383046, |
| "loss/hidden": 3.40390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19245190378278493, |
| "step": 4430 |
| }, |
| { |
| "epoch": 0.111, |
| "grad_norm": 33.25, |
| "grad_norm_var": 3.158333333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.3576, |
| "loss/crossentropy": 2.073286408931017, |
| "loss/hidden": 3.37265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1892416624352336, |
| "step": 4440 |
| }, |
| { |
| "epoch": 0.11125, |
| "grad_norm": 35.75, |
| "grad_norm_var": 6.167122395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.456, |
| "loss/crossentropy": 2.191167525947094, |
| "loss/hidden": 3.3140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19327596500515937, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.1115, |
| "grad_norm": 28.0, |
| "grad_norm_var": 6.762239583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4254, |
| "loss/crossentropy": 1.9917161837220192, |
| "loss/hidden": 3.48828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18673346154391765, |
| "step": 4460 |
| }, |
| { |
| "epoch": 0.11175, |
| "grad_norm": 31.0, |
| "grad_norm_var": 2.763541666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4458, |
| "loss/crossentropy": 2.0167058646678924, |
| "loss/hidden": 3.477734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20151916183531285, |
| "step": 4470 |
| }, |
| { |
| "epoch": 0.112, |
| "grad_norm": 30.5, |
| "grad_norm_var": 7.175455729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4057, |
| "loss/crossentropy": 2.013149876892567, |
| "loss/hidden": 3.405859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1819242848083377, |
| "step": 4480 |
| }, |
| { |
| "epoch": 0.11225, |
| "grad_norm": 43.25, |
| "grad_norm_var": 13.478580729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4416, |
| "loss/crossentropy": 2.111778366565704, |
| "loss/hidden": 3.4, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20088088884949684, |
| "step": 4490 |
| }, |
| { |
| "epoch": 0.1125, |
| "grad_norm": 30.125, |
| "grad_norm_var": 11.905143229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4435, |
| "loss/crossentropy": 2.0223396182060243, |
| "loss/hidden": 3.408984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1967620700597763, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.11275, |
| "grad_norm": 28.375, |
| "grad_norm_var": 2.2978515625, |
| "learning_rate": 0.0001, |
| "loss": 7.3969, |
| "loss/crossentropy": 1.9966137878596784, |
| "loss/hidden": 3.416796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19062119219452142, |
| "step": 4510 |
| }, |
| { |
| "epoch": 0.113, |
| "grad_norm": 29.75, |
| "grad_norm_var": 3.1759765625, |
| "learning_rate": 0.0001, |
| "loss": 7.2845, |
| "loss/crossentropy": 1.8878834903240205, |
| "loss/hidden": 3.346484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.16922880560159684, |
| "step": 4520 |
| }, |
| { |
| "epoch": 0.11325, |
| "grad_norm": 33.5, |
| "grad_norm_var": 3.78515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5223, |
| "loss/crossentropy": 2.0424712359905244, |
| "loss/hidden": 3.42109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18261839263141155, |
| "step": 4530 |
| }, |
| { |
| "epoch": 0.1135, |
| "grad_norm": 41.75, |
| "grad_norm_var": 13.172330729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4917, |
| "loss/crossentropy": 2.1800880253314974, |
| "loss/hidden": 3.419140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1875661849975586, |
| "step": 4540 |
| }, |
| { |
| "epoch": 0.11375, |
| "grad_norm": 29.5, |
| "grad_norm_var": 13.737239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4929, |
| "loss/crossentropy": 2.1130245834589005, |
| "loss/hidden": 3.514453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20742647554725407, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.114, |
| "grad_norm": 31.875, |
| "grad_norm_var": 3.1447265625, |
| "learning_rate": 0.0001, |
| "loss": 7.4885, |
| "loss/crossentropy": 2.0878429099917413, |
| "loss/hidden": 3.4625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19807947240769863, |
| "step": 4560 |
| }, |
| { |
| "epoch": 0.11425, |
| "grad_norm": 32.0, |
| "grad_norm_var": 1.9080729166666666, |
| "learning_rate": 0.0001, |
| "loss": 7.412, |
| "loss/crossentropy": 2.045598204433918, |
| "loss/hidden": 3.43984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19935160782188177, |
| "step": 4570 |
| }, |
| { |
| "epoch": 0.1145, |
| "grad_norm": 31.25, |
| "grad_norm_var": 2.703285650940459e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4112, |
| "loss/crossentropy": 1.9612677067518234, |
| "loss/hidden": 3.484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1939171139150858, |
| "step": 4580 |
| }, |
| { |
| "epoch": 0.11475, |
| "grad_norm": 30.125, |
| "grad_norm_var": 9.067708333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4109, |
| "loss/crossentropy": 2.066862888634205, |
| "loss/hidden": 3.440625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20057452656328678, |
| "step": 4590 |
| }, |
| { |
| "epoch": 0.115, |
| "grad_norm": 29.25, |
| "grad_norm_var": 6.670833333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.3857, |
| "loss/crossentropy": 2.0378803849220275, |
| "loss/hidden": 3.495703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19217969439923763, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.11525, |
| "grad_norm": 32.0, |
| "grad_norm_var": 8.108268229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4449, |
| "loss/crossentropy": 1.9883966132998467, |
| "loss/hidden": 3.378515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1796421378850937, |
| "step": 4610 |
| }, |
| { |
| "epoch": 0.1155, |
| "grad_norm": 28.5, |
| "grad_norm_var": 2.8853515625, |
| "learning_rate": 0.0001, |
| "loss": 7.43, |
| "loss/crossentropy": 2.2122382700443266, |
| "loss/hidden": 3.434765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20737907551229, |
| "step": 4620 |
| }, |
| { |
| "epoch": 0.11575, |
| "grad_norm": 30.375, |
| "grad_norm_var": 3.7968098958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.3858, |
| "loss/crossentropy": 2.0896764233708383, |
| "loss/hidden": 3.540234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20905990786850454, |
| "step": 4630 |
| }, |
| { |
| "epoch": 0.116, |
| "grad_norm": 27.5, |
| "grad_norm_var": 3.6879557291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5145, |
| "loss/crossentropy": 2.104724445939064, |
| "loss/hidden": 3.3796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19548750538378953, |
| "step": 4640 |
| }, |
| { |
| "epoch": 0.11625, |
| "grad_norm": 29.875, |
| "grad_norm_var": 8.7056640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4009, |
| "loss/crossentropy": 2.155320603400469, |
| "loss/hidden": 3.47578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2002986514940858, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.1165, |
| "grad_norm": 27.0, |
| "grad_norm_var": 5.1541015625, |
| "learning_rate": 0.0001, |
| "loss": 7.3193, |
| "loss/crossentropy": 2.085461828112602, |
| "loss/hidden": 3.38671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1905359473079443, |
| "step": 4660 |
| }, |
| { |
| "epoch": 0.11675, |
| "grad_norm": 30.5, |
| "grad_norm_var": 1.5926432291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.3125, |
| "loss/crossentropy": 1.9927285239100456, |
| "loss/hidden": 3.411328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17640038076788186, |
| "step": 4670 |
| }, |
| { |
| "epoch": 0.117, |
| "grad_norm": 33.75, |
| "grad_norm_var": 4.747330729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.469, |
| "loss/crossentropy": 2.1633560836315153, |
| "loss/hidden": 3.324609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1862495567649603, |
| "step": 4680 |
| }, |
| { |
| "epoch": 0.11725, |
| "grad_norm": 28.25, |
| "grad_norm_var": 7.198372395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4318, |
| "loss/crossentropy": 2.2390024289488792, |
| "loss/hidden": 3.430078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2097862558439374, |
| "step": 4690 |
| }, |
| { |
| "epoch": 0.1175, |
| "grad_norm": 31.375, |
| "grad_norm_var": 5.760872395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4669, |
| "loss/crossentropy": 2.0608770951628683, |
| "loss/hidden": 3.4390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19615320730954408, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.11775, |
| "grad_norm": 34.25, |
| "grad_norm_var": 4.1894735190686346e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4596, |
| "loss/crossentropy": 2.0900899082422257, |
| "loss/hidden": 3.360546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17933723451569678, |
| "step": 4710 |
| }, |
| { |
| "epoch": 0.118, |
| "grad_norm": 29.625, |
| "grad_norm_var": 58.10729166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.3979, |
| "loss/crossentropy": 2.094898019731045, |
| "loss/hidden": 3.46875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20720194689929486, |
| "step": 4720 |
| }, |
| { |
| "epoch": 0.11825, |
| "grad_norm": 30.25, |
| "grad_norm_var": 1.98515625, |
| "learning_rate": 0.0001, |
| "loss": 7.4519, |
| "loss/crossentropy": 2.083225329220295, |
| "loss/hidden": 3.426171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20777787994593383, |
| "step": 4730 |
| }, |
| { |
| "epoch": 0.1185, |
| "grad_norm": 30.375, |
| "grad_norm_var": 4.818684895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4795, |
| "loss/crossentropy": 2.1974314540624618, |
| "loss/hidden": 3.38046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19978385213762523, |
| "step": 4740 |
| }, |
| { |
| "epoch": 0.11875, |
| "grad_norm": 32.5, |
| "grad_norm_var": 3.439322916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.3843, |
| "loss/crossentropy": 1.9562335655093193, |
| "loss/hidden": 3.39140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18924889974296094, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.119, |
| "grad_norm": 30.625, |
| "grad_norm_var": 1.3015402743274143e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.5729, |
| "loss/crossentropy": 2.0693807609379293, |
| "loss/hidden": 3.339453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18801879994571208, |
| "step": 4760 |
| }, |
| { |
| "epoch": 0.11925, |
| "grad_norm": 35.25, |
| "grad_norm_var": 258.8791015625, |
| "learning_rate": 0.0001, |
| "loss": 7.3013, |
| "loss/crossentropy": 2.0631250627338886, |
| "loss/hidden": 3.3703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18974527437239885, |
| "step": 4770 |
| }, |
| { |
| "epoch": 0.1195, |
| "grad_norm": 28.625, |
| "grad_norm_var": 301.52233072916664, |
| "learning_rate": 0.0001, |
| "loss": 7.4639, |
| "loss/crossentropy": 2.1473939388990404, |
| "loss/hidden": 3.3984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19722200892865657, |
| "step": 4780 |
| }, |
| { |
| "epoch": 0.11975, |
| "grad_norm": 31.125, |
| "grad_norm_var": 25.472330729166668, |
| "learning_rate": 0.0001, |
| "loss": 7.3161, |
| "loss/crossentropy": 2.1767601929605007, |
| "loss/hidden": 3.380078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20041130091995002, |
| "step": 4790 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 29.5, |
| "grad_norm_var": 2.8580729166666665, |
| "learning_rate": 0.0001, |
| "loss": 7.3077, |
| "loss/crossentropy": 2.0214909121394156, |
| "loss/hidden": 3.38828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19553480856120586, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.12025, |
| "grad_norm": 34.25, |
| "grad_norm_var": 2.3666015625, |
| "learning_rate": 0.0001, |
| "loss": 7.4537, |
| "loss/crossentropy": 2.092876334488392, |
| "loss/hidden": 3.276171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19079044535756112, |
| "step": 4810 |
| }, |
| { |
| "epoch": 0.1205, |
| "grad_norm": 28.75, |
| "grad_norm_var": 2.1494140625, |
| "learning_rate": 0.0001, |
| "loss": 7.3579, |
| "loss/crossentropy": 2.159788618981838, |
| "loss/hidden": 3.447265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20938555523753166, |
| "step": 4820 |
| }, |
| { |
| "epoch": 0.12075, |
| "grad_norm": 31.625, |
| "grad_norm_var": 1.2635411529466906e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.3822, |
| "loss/crossentropy": 2.221826246380806, |
| "loss/hidden": 3.3140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18899439387023448, |
| "step": 4830 |
| }, |
| { |
| "epoch": 0.121, |
| "grad_norm": 29.375, |
| "grad_norm_var": 7.171875, |
| "learning_rate": 0.0001, |
| "loss": 7.3649, |
| "loss/crossentropy": 2.2076950490474703, |
| "loss/hidden": 3.321875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1911212421953678, |
| "step": 4840 |
| }, |
| { |
| "epoch": 0.12125, |
| "grad_norm": 28.875, |
| "grad_norm_var": 5.397916666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.2934, |
| "loss/crossentropy": 2.1398009806871414, |
| "loss/hidden": 3.276953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18104367554187775, |
| "step": 4850 |
| }, |
| { |
| "epoch": 0.1215, |
| "grad_norm": 33.25, |
| "grad_norm_var": 2.292122395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.3944, |
| "loss/crossentropy": 2.0568679124116898, |
| "loss/hidden": 3.31953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19066975675523282, |
| "step": 4860 |
| }, |
| { |
| "epoch": 0.12175, |
| "grad_norm": 31.75, |
| "grad_norm_var": 1.5145182291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5365, |
| "loss/crossentropy": 2.2600763499736787, |
| "loss/hidden": 3.419921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20988074019551278, |
| "step": 4870 |
| }, |
| { |
| "epoch": 0.122, |
| "grad_norm": 30.125, |
| "grad_norm_var": 0.8442057291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4425, |
| "loss/crossentropy": 2.087808459997177, |
| "loss/hidden": 3.397265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20126468148082494, |
| "step": 4880 |
| }, |
| { |
| "epoch": 0.12225, |
| "grad_norm": 29.25, |
| "grad_norm_var": 1.9455729166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4649, |
| "loss/crossentropy": 2.089573635160923, |
| "loss/hidden": 3.3890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18984669484198094, |
| "step": 4890 |
| }, |
| { |
| "epoch": 0.1225, |
| "grad_norm": 29.125, |
| "grad_norm_var": 2.7552083333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.4894, |
| "loss/crossentropy": 2.1424145482480528, |
| "loss/hidden": 3.47890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20886036530137062, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.12275, |
| "grad_norm": 31.0, |
| "grad_norm_var": 4.751497395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5033, |
| "loss/crossentropy": 2.104494086652994, |
| "loss/hidden": 3.41875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1945918256416917, |
| "step": 4910 |
| }, |
| { |
| "epoch": 0.123, |
| "grad_norm": 28.125, |
| "grad_norm_var": 5.330989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4954, |
| "loss/crossentropy": 2.0843611776828768, |
| "loss/hidden": 3.358203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1925347488373518, |
| "step": 4920 |
| }, |
| { |
| "epoch": 0.12325, |
| "grad_norm": 28.625, |
| "grad_norm_var": 3.8166015625, |
| "learning_rate": 0.0001, |
| "loss": 7.4404, |
| "loss/crossentropy": 2.205425333976746, |
| "loss/hidden": 3.3359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18580489940941333, |
| "step": 4930 |
| }, |
| { |
| "epoch": 0.1235, |
| "grad_norm": 29.375, |
| "grad_norm_var": 14.980208333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.3481, |
| "loss/crossentropy": 1.9896500617265702, |
| "loss/hidden": 3.39609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1904701752588153, |
| "step": 4940 |
| }, |
| { |
| "epoch": 0.12375, |
| "grad_norm": 32.75, |
| "grad_norm_var": 19.178580729166665, |
| "learning_rate": 0.0001, |
| "loss": 7.5252, |
| "loss/crossentropy": 2.1207278318703175, |
| "loss/hidden": 3.484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19760717861354352, |
| "step": 4950 |
| }, |
| { |
| "epoch": 0.124, |
| "grad_norm": 32.5, |
| "grad_norm_var": 17.264583333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.2678, |
| "loss/crossentropy": 1.9271991185843944, |
| "loss/hidden": 3.419921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19860625620931388, |
| "step": 4960 |
| }, |
| { |
| "epoch": 0.12425, |
| "grad_norm": 28.625, |
| "grad_norm_var": 11.196809895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.3703, |
| "loss/crossentropy": 2.0659097760915754, |
| "loss/hidden": 3.287109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18224728610366583, |
| "step": 4970 |
| }, |
| { |
| "epoch": 0.1245, |
| "grad_norm": 37.75, |
| "grad_norm_var": 10.03515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5041, |
| "loss/crossentropy": 1.9809176340699195, |
| "loss/hidden": 3.41796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19965030066668987, |
| "step": 4980 |
| }, |
| { |
| "epoch": 0.12475, |
| "grad_norm": 27.125, |
| "grad_norm_var": 11.567708333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.327, |
| "loss/crossentropy": 2.0197409205138683, |
| "loss/hidden": 3.368359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18525638189166785, |
| "step": 4990 |
| }, |
| { |
| "epoch": 0.125, |
| "grad_norm": 34.75, |
| "grad_norm_var": 8.558268229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.393, |
| "loss/crossentropy": 2.100055608153343, |
| "loss/hidden": 3.391015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19607669236138464, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.12525, |
| "grad_norm": 29.75, |
| "grad_norm_var": 5.602083333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4165, |
| "loss/crossentropy": 1.9898378394544125, |
| "loss/hidden": 3.350390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18131834492087365, |
| "step": 5010 |
| }, |
| { |
| "epoch": 0.1255, |
| "grad_norm": 34.75, |
| "grad_norm_var": 5.866666666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.555, |
| "loss/crossentropy": 2.086017055809498, |
| "loss/hidden": 3.446484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1889553153887391, |
| "step": 5020 |
| }, |
| { |
| "epoch": 0.12575, |
| "grad_norm": 33.25, |
| "grad_norm_var": 9.083268229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.4098, |
| "loss/crossentropy": 2.1133529357612133, |
| "loss/hidden": 3.365234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1881294794380665, |
| "step": 5030 |
| }, |
| { |
| "epoch": 0.126, |
| "grad_norm": 37.25, |
| "grad_norm_var": 14.268489583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4117, |
| "loss/crossentropy": 2.1818468660116195, |
| "loss/hidden": 3.3390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19351670220494271, |
| "step": 5040 |
| }, |
| { |
| "epoch": 0.12625, |
| "grad_norm": 27.75, |
| "grad_norm_var": 15.270572916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.448, |
| "loss/crossentropy": 2.133790023624897, |
| "loss/hidden": 3.267578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17900315206497908, |
| "step": 5050 |
| }, |
| { |
| "epoch": 0.1265, |
| "grad_norm": 27.75, |
| "grad_norm_var": 12.469791666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4514, |
| "loss/crossentropy": 2.013399636745453, |
| "loss/hidden": 3.502734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19984339475631713, |
| "step": 5060 |
| }, |
| { |
| "epoch": 0.12675, |
| "grad_norm": 28.625, |
| "grad_norm_var": 6.479622395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4411, |
| "loss/crossentropy": 2.1552533119916917, |
| "loss/hidden": 3.4265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20999168753623962, |
| "step": 5070 |
| }, |
| { |
| "epoch": 0.127, |
| "grad_norm": 30.125, |
| "grad_norm_var": 4.280989583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4238, |
| "loss/crossentropy": 2.1047082796692846, |
| "loss/hidden": 3.44375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19757428932935, |
| "step": 5080 |
| }, |
| { |
| "epoch": 0.12725, |
| "grad_norm": 29.25, |
| "grad_norm_var": 3.971875, |
| "learning_rate": 0.0001, |
| "loss": 7.547, |
| "loss/crossentropy": 2.2064288735389708, |
| "loss/hidden": 3.472265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2037733059376478, |
| "step": 5090 |
| }, |
| { |
| "epoch": 0.1275, |
| "grad_norm": 30.125, |
| "grad_norm_var": 3.809309895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5359, |
| "loss/crossentropy": 2.307460626959801, |
| "loss/hidden": 3.405078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2208320491015911, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.12775, |
| "grad_norm": 30.0, |
| "grad_norm_var": 6.887434895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.3695, |
| "loss/crossentropy": 2.1241589702665804, |
| "loss/hidden": 3.36640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18603499811142682, |
| "step": 5110 |
| }, |
| { |
| "epoch": 0.128, |
| "grad_norm": 30.125, |
| "grad_norm_var": 1.8353515625, |
| "learning_rate": 0.0001, |
| "loss": 7.4045, |
| "loss/crossentropy": 2.1248120576143266, |
| "loss/hidden": 3.364453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19218573588877916, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.12825, |
| "grad_norm": 30.875, |
| "grad_norm_var": 18.843489583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.3683, |
| "loss/crossentropy": 2.0221078641712666, |
| "loss/hidden": 3.396484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19441262539476156, |
| "step": 5130 |
| }, |
| { |
| "epoch": 0.1285, |
| "grad_norm": 30.25, |
| "grad_norm_var": 19.755989583333335, |
| "learning_rate": 0.0001, |
| "loss": 7.4467, |
| "loss/crossentropy": 2.0746863678097727, |
| "loss/hidden": 3.41015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20940047055482863, |
| "step": 5140 |
| }, |
| { |
| "epoch": 0.12875, |
| "grad_norm": 29.75, |
| "grad_norm_var": 7.226497395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4125, |
| "loss/crossentropy": 2.127023458480835, |
| "loss/hidden": 3.3984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19320496991276742, |
| "step": 5150 |
| }, |
| { |
| "epoch": 0.129, |
| "grad_norm": 30.875, |
| "grad_norm_var": 8.332747395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.3609, |
| "loss/crossentropy": 2.0404578357934953, |
| "loss/hidden": 3.271484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18524497244507074, |
| "step": 5160 |
| }, |
| { |
| "epoch": 0.12925, |
| "grad_norm": 30.375, |
| "grad_norm_var": 5.566080729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.3806, |
| "loss/crossentropy": 2.05174797475338, |
| "loss/hidden": 3.4125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20412184661254287, |
| "step": 5170 |
| }, |
| { |
| "epoch": 0.1295, |
| "grad_norm": 30.875, |
| "grad_norm_var": 72.65201822916667, |
| "learning_rate": 0.0001, |
| "loss": 7.4596, |
| "loss/crossentropy": 2.0945761643350123, |
| "loss/hidden": 3.388671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1920377543196082, |
| "step": 5180 |
| }, |
| { |
| "epoch": 0.12975, |
| "grad_norm": 34.25, |
| "grad_norm_var": 1.8330729166666666, |
| "learning_rate": 0.0001, |
| "loss": 7.3953, |
| "loss/crossentropy": 2.0848742216825484, |
| "loss/hidden": 3.548046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22283064387738705, |
| "step": 5190 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 31.25, |
| "grad_norm_var": 2.4244140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5061, |
| "loss/crossentropy": 1.997230054438114, |
| "loss/hidden": 3.488671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.198976163379848, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.13025, |
| "grad_norm": 42.0, |
| "grad_norm_var": 9.762239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.523, |
| "loss/crossentropy": 2.169138702750206, |
| "loss/hidden": 3.458984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2180183682590723, |
| "step": 5210 |
| }, |
| { |
| "epoch": 0.1305, |
| "grad_norm": 27.75, |
| "grad_norm_var": 13.330989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4174, |
| "loss/crossentropy": 2.0436717979609966, |
| "loss/hidden": 3.4265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18874377477914095, |
| "step": 5220 |
| }, |
| { |
| "epoch": 0.13075, |
| "grad_norm": 33.5, |
| "grad_norm_var": 5.186393229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4735, |
| "loss/crossentropy": 2.1061771392822264, |
| "loss/hidden": 3.3125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1841479053720832, |
| "step": 5230 |
| }, |
| { |
| "epoch": 0.131, |
| "grad_norm": 28.25, |
| "grad_norm_var": 3.480208333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4379, |
| "loss/crossentropy": 1.9957973182201385, |
| "loss/hidden": 3.46171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19976749327033758, |
| "step": 5240 |
| }, |
| { |
| "epoch": 0.13125, |
| "grad_norm": 31.0, |
| "grad_norm_var": 2.2249348958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.445, |
| "loss/crossentropy": 2.089694794267416, |
| "loss/hidden": 3.344921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18123079631477595, |
| "step": 5250 |
| }, |
| { |
| "epoch": 0.1315, |
| "grad_norm": 30.625, |
| "grad_norm_var": 2.062239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.3525, |
| "loss/crossentropy": 2.096596322953701, |
| "loss/hidden": 3.3875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17838086038827897, |
| "step": 5260 |
| }, |
| { |
| "epoch": 0.13175, |
| "grad_norm": 27.75, |
| "grad_norm_var": 2.6619140625, |
| "learning_rate": 0.0001, |
| "loss": 7.4042, |
| "loss/crossentropy": 2.086874121427536, |
| "loss/hidden": 3.419921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18623477015644313, |
| "step": 5270 |
| }, |
| { |
| "epoch": 0.132, |
| "grad_norm": 31.125, |
| "grad_norm_var": 1.8416015625, |
| "learning_rate": 0.0001, |
| "loss": 7.3634, |
| "loss/crossentropy": 2.131754931807518, |
| "loss/hidden": 3.416796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20106223467737436, |
| "step": 5280 |
| }, |
| { |
| "epoch": 0.13225, |
| "grad_norm": 32.5, |
| "grad_norm_var": 6.598372395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.4969, |
| "loss/crossentropy": 2.1548122704029082, |
| "loss/hidden": 3.419921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20439809635281564, |
| "step": 5290 |
| }, |
| { |
| "epoch": 0.1325, |
| "grad_norm": 28.125, |
| "grad_norm_var": 5.252083333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4373, |
| "loss/crossentropy": 2.1770398393273354, |
| "loss/hidden": 3.40078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19656166546046733, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.13275, |
| "grad_norm": 29.875, |
| "grad_norm_var": 1.2473307291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4308, |
| "loss/crossentropy": 2.152033807337284, |
| "loss/hidden": 3.3046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18985964702442287, |
| "step": 5310 |
| }, |
| { |
| "epoch": 0.133, |
| "grad_norm": 31.625, |
| "grad_norm_var": 1.0041666666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4327, |
| "loss/crossentropy": 2.087932828068733, |
| "loss/hidden": 3.3390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.184254783205688, |
| "step": 5320 |
| }, |
| { |
| "epoch": 0.13325, |
| "grad_norm": 31.0, |
| "grad_norm_var": 2.4344770491390623e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.3366, |
| "loss/crossentropy": 2.034941144287586, |
| "loss/hidden": 3.38359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1904723599553108, |
| "step": 5330 |
| }, |
| { |
| "epoch": 0.1335, |
| "grad_norm": 31.5, |
| "grad_norm_var": 6.255989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4492, |
| "loss/crossentropy": 2.152811796963215, |
| "loss/hidden": 3.385546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18841406889259815, |
| "step": 5340 |
| }, |
| { |
| "epoch": 0.13375, |
| "grad_norm": 31.625, |
| "grad_norm_var": 1.6822916666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.465, |
| "loss/crossentropy": 2.1877569228410723, |
| "loss/hidden": 3.36796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19606791157275438, |
| "step": 5350 |
| }, |
| { |
| "epoch": 0.134, |
| "grad_norm": 30.25, |
| "grad_norm_var": 2.17890625, |
| "learning_rate": 0.0001, |
| "loss": 7.3975, |
| "loss/crossentropy": 2.0353255167603495, |
| "loss/hidden": 3.340234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1815076546743512, |
| "step": 5360 |
| }, |
| { |
| "epoch": 0.13425, |
| "grad_norm": 31.0, |
| "grad_norm_var": 12.4525390625, |
| "learning_rate": 0.0001, |
| "loss": 7.3462, |
| "loss/crossentropy": 1.919140312820673, |
| "loss/hidden": 3.370703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18229803508147596, |
| "step": 5370 |
| }, |
| { |
| "epoch": 0.1345, |
| "grad_norm": 29.625, |
| "grad_norm_var": 7.26640625, |
| "learning_rate": 0.0001, |
| "loss": 7.506, |
| "loss/crossentropy": 2.11019846200943, |
| "loss/hidden": 3.43046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19277678560465575, |
| "step": 5380 |
| }, |
| { |
| "epoch": 0.13475, |
| "grad_norm": 32.0, |
| "grad_norm_var": 0.9275390625, |
| "learning_rate": 0.0001, |
| "loss": 7.3668, |
| "loss/crossentropy": 2.1108837127685547, |
| "loss/hidden": 3.493359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20013203900307416, |
| "step": 5390 |
| }, |
| { |
| "epoch": 0.135, |
| "grad_norm": 28.125, |
| "grad_norm_var": 1.7330729166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5524, |
| "loss/crossentropy": 2.16382010653615, |
| "loss/hidden": 3.371484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1924815428443253, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.13525, |
| "grad_norm": 30.25, |
| "grad_norm_var": 2.5893229166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5393, |
| "loss/crossentropy": 2.0622613176703455, |
| "loss/hidden": 3.498828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20035731326788664, |
| "step": 5410 |
| }, |
| { |
| "epoch": 0.1355, |
| "grad_norm": 29.625, |
| "grad_norm_var": 2.8499348958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.3466, |
| "loss/crossentropy": 2.169532992690802, |
| "loss/hidden": 3.381640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2085475005209446, |
| "step": 5420 |
| }, |
| { |
| "epoch": 0.13575, |
| "grad_norm": 31.0, |
| "grad_norm_var": 2.8212890625, |
| "learning_rate": 0.0001, |
| "loss": 7.4299, |
| "loss/crossentropy": 2.056758251786232, |
| "loss/hidden": 3.460546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19217969793826342, |
| "step": 5430 |
| }, |
| { |
| "epoch": 0.136, |
| "grad_norm": 32.25, |
| "grad_norm_var": 3.9749348958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.4334, |
| "loss/crossentropy": 2.1805212616920473, |
| "loss/hidden": 3.437109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21980819348245859, |
| "step": 5440 |
| }, |
| { |
| "epoch": 0.13625, |
| "grad_norm": 29.5, |
| "grad_norm_var": 2.0218098958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5382, |
| "loss/crossentropy": 2.1516773015260697, |
| "loss/hidden": 3.50703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.208776849322021, |
| "step": 5450 |
| }, |
| { |
| "epoch": 0.1365, |
| "grad_norm": 31.375, |
| "grad_norm_var": 2.082747395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4684, |
| "loss/crossentropy": 2.1602507561445234, |
| "loss/hidden": 3.366796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1842126866802573, |
| "step": 5460 |
| }, |
| { |
| "epoch": 0.13675, |
| "grad_norm": 31.25, |
| "grad_norm_var": 2.6197265625, |
| "learning_rate": 0.0001, |
| "loss": 7.3823, |
| "loss/crossentropy": 2.081377077102661, |
| "loss/hidden": 3.38359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1906685210764408, |
| "step": 5470 |
| }, |
| { |
| "epoch": 0.137, |
| "grad_norm": 30.875, |
| "grad_norm_var": 2.21875, |
| "learning_rate": 0.0001, |
| "loss": 7.4128, |
| "loss/crossentropy": 2.138934540748596, |
| "loss/hidden": 3.436328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18631890565156936, |
| "step": 5480 |
| }, |
| { |
| "epoch": 0.13725, |
| "grad_norm": 31.0, |
| "grad_norm_var": 3.958268229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.568, |
| "loss/crossentropy": 2.02208868265152, |
| "loss/hidden": 3.43828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19863407909870148, |
| "step": 5490 |
| }, |
| { |
| "epoch": 0.1375, |
| "grad_norm": 33.0, |
| "grad_norm_var": 6.1775390625, |
| "learning_rate": 0.0001, |
| "loss": 7.3806, |
| "loss/crossentropy": 2.0247954726219177, |
| "loss/hidden": 3.408984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18538292730227113, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.13775, |
| "grad_norm": 30.625, |
| "grad_norm_var": 6.076041666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.6241, |
| "loss/crossentropy": 2.2269895624369385, |
| "loss/hidden": 3.445703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.197488261340186, |
| "step": 5510 |
| }, |
| { |
| "epoch": 0.138, |
| "grad_norm": 30.25, |
| "grad_norm_var": 4.1947265625, |
| "learning_rate": 0.0001, |
| "loss": 7.4968, |
| "loss/crossentropy": 2.077942840754986, |
| "loss/hidden": 3.420703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19135653134435415, |
| "step": 5520 |
| }, |
| { |
| "epoch": 0.13825, |
| "grad_norm": 27.75, |
| "grad_norm_var": 2.701041666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.3842, |
| "loss/crossentropy": 2.104434663057327, |
| "loss/hidden": 3.37890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18928063409402968, |
| "step": 5530 |
| }, |
| { |
| "epoch": 0.1385, |
| "grad_norm": 31.5, |
| "grad_norm_var": 6.4087890625, |
| "learning_rate": 0.0001, |
| "loss": 7.443, |
| "loss/crossentropy": 2.0420807294547556, |
| "loss/hidden": 3.362890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18584198467433452, |
| "step": 5540 |
| }, |
| { |
| "epoch": 0.13875, |
| "grad_norm": 32.5, |
| "grad_norm_var": 4.7556640625, |
| "learning_rate": 0.0001, |
| "loss": 7.418, |
| "loss/crossentropy": 2.0941856279969215, |
| "loss/hidden": 3.311328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18290557386353612, |
| "step": 5550 |
| }, |
| { |
| "epoch": 0.139, |
| "grad_norm": 34.0, |
| "grad_norm_var": 3.0947916666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.3594, |
| "loss/crossentropy": 2.1482032746076585, |
| "loss/hidden": 3.402734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19914243686944247, |
| "step": 5560 |
| }, |
| { |
| "epoch": 0.13925, |
| "grad_norm": 31.875, |
| "grad_norm_var": 4.449739583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5652, |
| "loss/crossentropy": 2.150991679728031, |
| "loss/hidden": 3.4984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20609580241143705, |
| "step": 5570 |
| }, |
| { |
| "epoch": 0.1395, |
| "grad_norm": 30.125, |
| "grad_norm_var": 8.527083333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.3112, |
| "loss/crossentropy": 2.1712302803993224, |
| "loss/hidden": 3.38203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18877983894199132, |
| "step": 5580 |
| }, |
| { |
| "epoch": 0.13975, |
| "grad_norm": 28.5, |
| "grad_norm_var": 2.1197265625, |
| "learning_rate": 0.0001, |
| "loss": 7.4033, |
| "loss/crossentropy": 2.1502134561538697, |
| "loss/hidden": 3.4453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21474836114794016, |
| "step": 5590 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 30.25, |
| "grad_norm_var": 3.6809895833333335, |
| "learning_rate": 0.0001, |
| "loss": 7.5089, |
| "loss/crossentropy": 2.230164831876755, |
| "loss/hidden": 3.409375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2127472611144185, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.14025, |
| "grad_norm": 30.625, |
| "grad_norm_var": 52.13333333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5328, |
| "loss/crossentropy": 2.1207681491971018, |
| "loss/hidden": 3.459765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20428987089544534, |
| "step": 5610 |
| }, |
| { |
| "epoch": 0.1405, |
| "grad_norm": 30.625, |
| "grad_norm_var": 2.037239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.3972, |
| "loss/crossentropy": 2.065328547358513, |
| "loss/hidden": 3.43671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19245364069938659, |
| "step": 5620 |
| }, |
| { |
| "epoch": 0.14075, |
| "grad_norm": 29.375, |
| "grad_norm_var": 2.40625, |
| "learning_rate": 0.0001, |
| "loss": 7.2803, |
| "loss/crossentropy": 2.0791175961494446, |
| "loss/hidden": 3.380078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1857742078602314, |
| "step": 5630 |
| }, |
| { |
| "epoch": 0.141, |
| "grad_norm": 29.0, |
| "grad_norm_var": 2.687239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.3324, |
| "loss/crossentropy": 2.054654690623283, |
| "loss/hidden": 3.31328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18879605047404766, |
| "step": 5640 |
| }, |
| { |
| "epoch": 0.14125, |
| "grad_norm": 27.625, |
| "grad_norm_var": 3.06015625, |
| "learning_rate": 0.0001, |
| "loss": 7.4804, |
| "loss/crossentropy": 2.163857588917017, |
| "loss/hidden": 3.434375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2018537001684308, |
| "step": 5650 |
| }, |
| { |
| "epoch": 0.1415, |
| "grad_norm": 31.625, |
| "grad_norm_var": 3.54140625, |
| "learning_rate": 0.0001, |
| "loss": 7.4337, |
| "loss/crossentropy": 2.1230690620839594, |
| "loss/hidden": 3.33046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18610329292714595, |
| "step": 5660 |
| }, |
| { |
| "epoch": 0.14175, |
| "grad_norm": 30.0, |
| "grad_norm_var": 2.0171223958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.484, |
| "loss/crossentropy": 2.1232656478881835, |
| "loss/hidden": 3.306640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18209880087524652, |
| "step": 5670 |
| }, |
| { |
| "epoch": 0.142, |
| "grad_norm": 30.625, |
| "grad_norm_var": 3.6244140625, |
| "learning_rate": 0.0001, |
| "loss": 7.3601, |
| "loss/crossentropy": 1.9925632011145353, |
| "loss/hidden": 3.487109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18384792990982532, |
| "step": 5680 |
| }, |
| { |
| "epoch": 0.14225, |
| "grad_norm": 28.875, |
| "grad_norm_var": 4.595247395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.3492, |
| "loss/crossentropy": 1.9747695334255695, |
| "loss/hidden": 3.4203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18710496351122857, |
| "step": 5690 |
| }, |
| { |
| "epoch": 0.1425, |
| "grad_norm": 30.0, |
| "grad_norm_var": 5.160872395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.1869, |
| "loss/crossentropy": 1.9229816131293773, |
| "loss/hidden": 3.390234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1764959843829274, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.14275, |
| "grad_norm": 31.375, |
| "grad_norm_var": 1.1177083333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.5353, |
| "loss/crossentropy": 2.1759460479021073, |
| "loss/hidden": 3.442578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2011772884055972, |
| "step": 5710 |
| }, |
| { |
| "epoch": 0.143, |
| "grad_norm": 30.875, |
| "grad_norm_var": 19.32265625, |
| "learning_rate": 0.0001, |
| "loss": 7.4364, |
| "loss/crossentropy": 1.9983633741736413, |
| "loss/hidden": 3.44453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2036839971318841, |
| "step": 5720 |
| }, |
| { |
| "epoch": 0.14325, |
| "grad_norm": 31.75, |
| "grad_norm_var": 6.330989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.3473, |
| "loss/crossentropy": 2.2608693316578865, |
| "loss/hidden": 3.395703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20173839703202248, |
| "step": 5730 |
| }, |
| { |
| "epoch": 0.1435, |
| "grad_norm": 29.0, |
| "grad_norm_var": 30.8306640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4861, |
| "loss/crossentropy": 2.191919285058975, |
| "loss/hidden": 3.378125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1848000530153513, |
| "step": 5740 |
| }, |
| { |
| "epoch": 0.14375, |
| "grad_norm": 32.5, |
| "grad_norm_var": 8.167643229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5179, |
| "loss/crossentropy": 2.098912109434605, |
| "loss/hidden": 3.544921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22364525627344847, |
| "step": 5750 |
| }, |
| { |
| "epoch": 0.144, |
| "grad_norm": 30.5, |
| "grad_norm_var": 2.5479166666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4241, |
| "loss/crossentropy": 2.089163874089718, |
| "loss/hidden": 3.39296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19125983892008663, |
| "step": 5760 |
| }, |
| { |
| "epoch": 0.14425, |
| "grad_norm": 29.875, |
| "grad_norm_var": 8.223958333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.45, |
| "loss/crossentropy": 2.2600366115570067, |
| "loss/hidden": 3.444140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1984367400407791, |
| "step": 5770 |
| }, |
| { |
| "epoch": 0.1445, |
| "grad_norm": 32.0, |
| "grad_norm_var": 14.806705729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.3632, |
| "loss/crossentropy": 1.9510320864617825, |
| "loss/hidden": 3.347265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18547183061018585, |
| "step": 5780 |
| }, |
| { |
| "epoch": 0.14475, |
| "grad_norm": 30.0, |
| "grad_norm_var": 9.655989583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5503, |
| "loss/crossentropy": 2.143619356304407, |
| "loss/hidden": 3.4, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20010631643235682, |
| "step": 5790 |
| }, |
| { |
| "epoch": 0.145, |
| "grad_norm": 28.375, |
| "grad_norm_var": 9.556184895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5076, |
| "loss/crossentropy": 1.9316529139876366, |
| "loss/hidden": 3.40546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18306834027171134, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.14525, |
| "grad_norm": 37.25, |
| "grad_norm_var": 10.718489583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5004, |
| "loss/crossentropy": 2.136544609069824, |
| "loss/hidden": 3.3453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20682645812630654, |
| "step": 5810 |
| }, |
| { |
| "epoch": 0.1455, |
| "grad_norm": 34.75, |
| "grad_norm_var": 9.395572916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4109, |
| "loss/crossentropy": 2.0811544865369798, |
| "loss/hidden": 3.346484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.187607554346323, |
| "step": 5820 |
| }, |
| { |
| "epoch": 0.14575, |
| "grad_norm": 30.125, |
| "grad_norm_var": 11.476041666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4615, |
| "loss/crossentropy": 2.2464685887098312, |
| "loss/hidden": 3.419140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21834317222237587, |
| "step": 5830 |
| }, |
| { |
| "epoch": 0.146, |
| "grad_norm": 31.875, |
| "grad_norm_var": 8.106705729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.379, |
| "loss/crossentropy": 2.1494223892688753, |
| "loss/hidden": 3.4875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20599585752934219, |
| "step": 5840 |
| }, |
| { |
| "epoch": 0.14625, |
| "grad_norm": 32.25, |
| "grad_norm_var": 119.0541015625, |
| "learning_rate": 0.0001, |
| "loss": 7.4223, |
| "loss/crossentropy": 2.013238602876663, |
| "loss/hidden": 3.355859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17934355642646552, |
| "step": 5850 |
| }, |
| { |
| "epoch": 0.1465, |
| "grad_norm": 55.5, |
| "grad_norm_var": 40.619205729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.4766, |
| "loss/crossentropy": 2.1309464499354362, |
| "loss/hidden": 3.40625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1953899236395955, |
| "step": 5860 |
| }, |
| { |
| "epoch": 0.14675, |
| "grad_norm": 31.125, |
| "grad_norm_var": 51.764322916666664, |
| "learning_rate": 0.0001, |
| "loss": 7.5385, |
| "loss/crossentropy": 2.203585295379162, |
| "loss/hidden": 3.40625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20035494081676006, |
| "step": 5870 |
| }, |
| { |
| "epoch": 0.147, |
| "grad_norm": 30.125, |
| "grad_norm_var": 8.2541015625, |
| "learning_rate": 0.0001, |
| "loss": 7.3861, |
| "loss/crossentropy": 2.057890709489584, |
| "loss/hidden": 3.3515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18680873457342387, |
| "step": 5880 |
| }, |
| { |
| "epoch": 0.14725, |
| "grad_norm": 30.25, |
| "grad_norm_var": 4.555989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4283, |
| "loss/crossentropy": 2.049139867722988, |
| "loss/hidden": 3.369921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1856512013822794, |
| "step": 5890 |
| }, |
| { |
| "epoch": 0.1475, |
| "grad_norm": 30.375, |
| "grad_norm_var": 10.570768229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4651, |
| "loss/crossentropy": 2.0553019613027574, |
| "loss/hidden": 3.404296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18676785845309496, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.14775, |
| "grad_norm": 30.125, |
| "grad_norm_var": 14.59140625, |
| "learning_rate": 0.0001, |
| "loss": 7.4727, |
| "loss/crossentropy": 2.0098409935832025, |
| "loss/hidden": 3.48125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20080684809945523, |
| "step": 5910 |
| }, |
| { |
| "epoch": 0.148, |
| "grad_norm": 28.625, |
| "grad_norm_var": 8.489322916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4203, |
| "loss/crossentropy": 2.2615666806697847, |
| "loss/hidden": 3.3, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18850413355976342, |
| "step": 5920 |
| }, |
| { |
| "epoch": 0.14825, |
| "grad_norm": 30.0, |
| "grad_norm_var": 3.6233723958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.4441, |
| "loss/crossentropy": 2.178256964683533, |
| "loss/hidden": 3.29609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19248567353934048, |
| "step": 5930 |
| }, |
| { |
| "epoch": 0.1485, |
| "grad_norm": 31.0, |
| "grad_norm_var": 3.455208333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4573, |
| "loss/crossentropy": 2.246034747362137, |
| "loss/hidden": 3.45859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2096735591068864, |
| "step": 5940 |
| }, |
| { |
| "epoch": 0.14875, |
| "grad_norm": 30.625, |
| "grad_norm_var": 3.8494140625, |
| "learning_rate": 0.0001, |
| "loss": 7.4811, |
| "loss/crossentropy": 2.180899788439274, |
| "loss/hidden": 3.341796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19460927378386259, |
| "step": 5950 |
| }, |
| { |
| "epoch": 0.149, |
| "grad_norm": 30.5, |
| "grad_norm_var": 3.4385416666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.3829, |
| "loss/crossentropy": 2.258976912498474, |
| "loss/hidden": 3.332421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19133044108748437, |
| "step": 5960 |
| }, |
| { |
| "epoch": 0.14925, |
| "grad_norm": 29.0, |
| "grad_norm_var": 13.80390625, |
| "learning_rate": 0.0001, |
| "loss": 7.5384, |
| "loss/crossentropy": 2.012222741544247, |
| "loss/hidden": 3.4640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19505210760980846, |
| "step": 5970 |
| }, |
| { |
| "epoch": 0.1495, |
| "grad_norm": 27.875, |
| "grad_norm_var": 12.9369140625, |
| "learning_rate": 0.0001, |
| "loss": 7.4033, |
| "loss/crossentropy": 2.0392286255955696, |
| "loss/hidden": 3.397265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1930427584797144, |
| "step": 5980 |
| }, |
| { |
| "epoch": 0.14975, |
| "grad_norm": 28.25, |
| "grad_norm_var": 18.174739583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.399, |
| "loss/crossentropy": 1.9029529005289079, |
| "loss/hidden": 3.462109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.198425155505538, |
| "step": 5990 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 31.875, |
| "grad_norm_var": 3.0737770860662226e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4994, |
| "loss/crossentropy": 1.8985859856009484, |
| "loss/hidden": 3.4609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1951824951916933, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.15025, |
| "grad_norm": 36.0, |
| "grad_norm_var": 3.073777086665239e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4659, |
| "loss/crossentropy": 2.097201645374298, |
| "loss/hidden": 3.351171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18253911342471837, |
| "step": 6010 |
| }, |
| { |
| "epoch": 0.1505, |
| "grad_norm": 27.875, |
| "grad_norm_var": 6.801041666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.2415, |
| "loss/crossentropy": 2.0210610911250115, |
| "loss/hidden": 3.484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.197306059114635, |
| "step": 6020 |
| }, |
| { |
| "epoch": 0.15075, |
| "grad_norm": 31.0, |
| "grad_norm_var": 14.46640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4519, |
| "loss/crossentropy": 2.1985476523637772, |
| "loss/hidden": 3.475, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20262509360909461, |
| "step": 6030 |
| }, |
| { |
| "epoch": 0.151, |
| "grad_norm": 30.625, |
| "grad_norm_var": 6.254622395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.3353, |
| "loss/crossentropy": 2.0093181416392327, |
| "loss/hidden": 3.316015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17620250331237913, |
| "step": 6040 |
| }, |
| { |
| "epoch": 0.15125, |
| "grad_norm": 31.625, |
| "grad_norm_var": 56.4291015625, |
| "learning_rate": 0.0001, |
| "loss": 7.4034, |
| "loss/crossentropy": 2.177773226797581, |
| "loss/hidden": 3.41875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20441538300365208, |
| "step": 6050 |
| }, |
| { |
| "epoch": 0.1515, |
| "grad_norm": 26.75, |
| "grad_norm_var": 55.889322916666664, |
| "learning_rate": 0.0001, |
| "loss": 7.3245, |
| "loss/crossentropy": 2.1666259437799456, |
| "loss/hidden": 3.392578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19311951845884323, |
| "step": 6060 |
| }, |
| { |
| "epoch": 0.15175, |
| "grad_norm": 30.25, |
| "grad_norm_var": 91.0103515625, |
| "learning_rate": 0.0001, |
| "loss": 7.368, |
| "loss/crossentropy": 2.063462796807289, |
| "loss/hidden": 3.373046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1834208857268095, |
| "step": 6070 |
| }, |
| { |
| "epoch": 0.152, |
| "grad_norm": 30.5, |
| "grad_norm_var": 18.1212890625, |
| "learning_rate": 0.0001, |
| "loss": 7.4335, |
| "loss/crossentropy": 1.9907098844647408, |
| "loss/hidden": 3.53671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20707368329167367, |
| "step": 6080 |
| }, |
| { |
| "epoch": 0.15225, |
| "grad_norm": 38.25, |
| "grad_norm_var": 11.470247395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.3789, |
| "loss/crossentropy": 2.083692157268524, |
| "loss/hidden": 3.465625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1846569798886776, |
| "step": 6090 |
| }, |
| { |
| "epoch": 0.1525, |
| "grad_norm": 28.25, |
| "grad_norm_var": 21.829622395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.3767, |
| "loss/crossentropy": 2.1113929279148578, |
| "loss/hidden": 3.375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1990992769598961, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.15275, |
| "grad_norm": 33.75, |
| "grad_norm_var": 20.676497395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.339, |
| "loss/crossentropy": 2.1296695113182067, |
| "loss/hidden": 3.385546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1936045665293932, |
| "step": 6110 |
| }, |
| { |
| "epoch": 0.153, |
| "grad_norm": 36.25, |
| "grad_norm_var": 7.8166015625, |
| "learning_rate": 0.0001, |
| "loss": 7.5027, |
| "loss/crossentropy": 2.1011226207017897, |
| "loss/hidden": 3.471875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20695240292698144, |
| "step": 6120 |
| }, |
| { |
| "epoch": 0.15325, |
| "grad_norm": 30.625, |
| "grad_norm_var": 2.1304840750224113e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.506, |
| "loss/crossentropy": 2.2427969723939896, |
| "loss/hidden": 3.40625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1966634625568986, |
| "step": 6130 |
| }, |
| { |
| "epoch": 0.1535, |
| "grad_norm": 34.25, |
| "grad_norm_var": 36.542643229166664, |
| "learning_rate": 0.0001, |
| "loss": 7.4413, |
| "loss/crossentropy": 2.0855264641344546, |
| "loss/hidden": 3.384765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19436217453330756, |
| "step": 6140 |
| }, |
| { |
| "epoch": 0.15375, |
| "grad_norm": 32.75, |
| "grad_norm_var": 10.153580729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.3096, |
| "loss/crossentropy": 2.0322439685463904, |
| "loss/hidden": 3.3453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1753252800554037, |
| "step": 6150 |
| }, |
| { |
| "epoch": 0.154, |
| "grad_norm": 31.0, |
| "grad_norm_var": 10.216080729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.2481, |
| "loss/crossentropy": 2.074477408081293, |
| "loss/hidden": 3.346875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17578690703958272, |
| "step": 6160 |
| }, |
| { |
| "epoch": 0.15425, |
| "grad_norm": 33.25, |
| "grad_norm_var": 28.79765625, |
| "learning_rate": 0.0001, |
| "loss": 7.4403, |
| "loss/crossentropy": 2.0863804474473, |
| "loss/hidden": 3.431640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20699662044644357, |
| "step": 6170 |
| }, |
| { |
| "epoch": 0.1545, |
| "grad_norm": 33.5, |
| "grad_norm_var": 24.84375, |
| "learning_rate": 0.0001, |
| "loss": 7.4609, |
| "loss/crossentropy": 2.0696858704090118, |
| "loss/hidden": 3.458984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2128069180995226, |
| "step": 6180 |
| }, |
| { |
| "epoch": 0.15475, |
| "grad_norm": 31.375, |
| "grad_norm_var": 5.099739583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.2575, |
| "loss/crossentropy": 2.182169410586357, |
| "loss/hidden": 3.31796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.181897877715528, |
| "step": 6190 |
| }, |
| { |
| "epoch": 0.155, |
| "grad_norm": 34.25, |
| "grad_norm_var": 4.699934895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.4975, |
| "loss/crossentropy": 2.165008749067783, |
| "loss/hidden": 3.492578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19800901636481286, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.15525, |
| "grad_norm": 33.75, |
| "grad_norm_var": 2.7738932291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.3992, |
| "loss/crossentropy": 2.0653695166110992, |
| "loss/hidden": 3.406640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19180236533284187, |
| "step": 6210 |
| }, |
| { |
| "epoch": 0.1555, |
| "grad_norm": 35.25, |
| "grad_norm_var": 5.31015625, |
| "learning_rate": 0.0001, |
| "loss": 7.3638, |
| "loss/crossentropy": 2.1244568385183813, |
| "loss/hidden": 3.33125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18238217020407319, |
| "step": 6220 |
| }, |
| { |
| "epoch": 0.15575, |
| "grad_norm": 32.25, |
| "grad_norm_var": 3.2817057291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5073, |
| "loss/crossentropy": 2.1881898671388624, |
| "loss/hidden": 3.498828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2045454490929842, |
| "step": 6230 |
| }, |
| { |
| "epoch": 0.156, |
| "grad_norm": 28.25, |
| "grad_norm_var": 2.6572265625, |
| "learning_rate": 0.0001, |
| "loss": 7.4609, |
| "loss/crossentropy": 2.14600064009428, |
| "loss/hidden": 3.4265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18945380430668593, |
| "step": 6240 |
| }, |
| { |
| "epoch": 0.15625, |
| "grad_norm": 30.75, |
| "grad_norm_var": 35.73118489583333, |
| "learning_rate": 0.0001, |
| "loss": 7.3786, |
| "loss/crossentropy": 2.168429624289274, |
| "loss/hidden": 3.296484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18439108245074748, |
| "step": 6250 |
| }, |
| { |
| "epoch": 0.1565, |
| "grad_norm": 52.5, |
| "grad_norm_var": 64.9962890625, |
| "learning_rate": 0.0001, |
| "loss": 7.3511, |
| "loss/crossentropy": 2.1293379329144955, |
| "loss/hidden": 3.414453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1828605517745018, |
| "step": 6260 |
| }, |
| { |
| "epoch": 0.15675, |
| "grad_norm": 29.25, |
| "grad_norm_var": 59.703125, |
| "learning_rate": 0.0001, |
| "loss": 7.3978, |
| "loss/crossentropy": 1.8641120925545693, |
| "loss/hidden": 3.430078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18577109538018705, |
| "step": 6270 |
| }, |
| { |
| "epoch": 0.157, |
| "grad_norm": 28.75, |
| "grad_norm_var": 32.1994140625, |
| "learning_rate": 0.0001, |
| "loss": 7.4066, |
| "loss/crossentropy": 2.0997040398418902, |
| "loss/hidden": 3.344921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18688563201576472, |
| "step": 6280 |
| }, |
| { |
| "epoch": 0.15725, |
| "grad_norm": 31.375, |
| "grad_norm_var": 17.302018229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.4043, |
| "loss/crossentropy": 1.9712626039981842, |
| "loss/hidden": 3.45078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20054549565538765, |
| "step": 6290 |
| }, |
| { |
| "epoch": 0.1575, |
| "grad_norm": 31.5, |
| "grad_norm_var": 17.8431640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4502, |
| "loss/crossentropy": 2.0252815186977386, |
| "loss/hidden": 3.3890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18488127905875446, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.15775, |
| "grad_norm": 30.75, |
| "grad_norm_var": 7.995572916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.3829, |
| "loss/crossentropy": 2.030302118510008, |
| "loss/hidden": 3.400390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19101340658962726, |
| "step": 6310 |
| }, |
| { |
| "epoch": 0.158, |
| "grad_norm": 30.125, |
| "grad_norm_var": 5.805143229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.3852, |
| "loss/crossentropy": 1.9795936658978461, |
| "loss/hidden": 3.45703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19110800279304385, |
| "step": 6320 |
| }, |
| { |
| "epoch": 0.15825, |
| "grad_norm": 34.0, |
| "grad_norm_var": 6.91640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4417, |
| "loss/crossentropy": 2.0620448149740698, |
| "loss/hidden": 3.454296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.194018579646945, |
| "step": 6330 |
| }, |
| { |
| "epoch": 0.1585, |
| "grad_norm": 28.125, |
| "grad_norm_var": 31.058268229166668, |
| "learning_rate": 0.0001, |
| "loss": 7.4142, |
| "loss/crossentropy": 2.012200343608856, |
| "loss/hidden": 3.4921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21260247621685266, |
| "step": 6340 |
| }, |
| { |
| "epoch": 0.15875, |
| "grad_norm": 36.75, |
| "grad_norm_var": 35.18118489583333, |
| "learning_rate": 0.0001, |
| "loss": 7.3776, |
| "loss/crossentropy": 1.9757203698158263, |
| "loss/hidden": 3.382421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.200297892652452, |
| "step": 6350 |
| }, |
| { |
| "epoch": 0.159, |
| "grad_norm": 31.125, |
| "grad_norm_var": 17.764583333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4762, |
| "loss/crossentropy": 2.195678301155567, |
| "loss/hidden": 3.434375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1989523505792022, |
| "step": 6360 |
| }, |
| { |
| "epoch": 0.15925, |
| "grad_norm": 29.625, |
| "grad_norm_var": 12.851041666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4661, |
| "loss/crossentropy": 2.0537394002079963, |
| "loss/hidden": 3.421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20311654023826123, |
| "step": 6370 |
| }, |
| { |
| "epoch": 0.1595, |
| "grad_norm": 30.0, |
| "grad_norm_var": 10.0994140625, |
| "learning_rate": 0.0001, |
| "loss": 7.2759, |
| "loss/crossentropy": 2.02838040292263, |
| "loss/hidden": 3.45859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19148585237562657, |
| "step": 6380 |
| }, |
| { |
| "epoch": 0.15975, |
| "grad_norm": 39.0, |
| "grad_norm_var": 2324.6707682291667, |
| "learning_rate": 0.0001, |
| "loss": 7.3973, |
| "loss/crossentropy": 2.0951177358627318, |
| "loss/hidden": 3.430859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2158473737537861, |
| "step": 6390 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 40.0, |
| "grad_norm_var": 21.121809895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.2877, |
| "loss/crossentropy": 1.878954614698887, |
| "loss/hidden": 3.390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18514612764120103, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.16025, |
| "grad_norm": 33.5, |
| "grad_norm_var": 23.1666015625, |
| "learning_rate": 0.0001, |
| "loss": 7.3598, |
| "loss/crossentropy": 2.123918867111206, |
| "loss/hidden": 3.3140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18728599287569522, |
| "step": 6410 |
| }, |
| { |
| "epoch": 0.1605, |
| "grad_norm": 29.25, |
| "grad_norm_var": 11.230143229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.476, |
| "loss/crossentropy": 2.168968527019024, |
| "loss/hidden": 3.342578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20153266489505767, |
| "step": 6420 |
| }, |
| { |
| "epoch": 0.16075, |
| "grad_norm": 30.0, |
| "grad_norm_var": 90.8056640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4289, |
| "loss/crossentropy": 2.0426762118935584, |
| "loss/hidden": 3.367578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19033107869327068, |
| "step": 6430 |
| }, |
| { |
| "epoch": 0.161, |
| "grad_norm": 38.25, |
| "grad_norm_var": 15.570247395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.3992, |
| "loss/crossentropy": 2.0535445332527162, |
| "loss/hidden": 3.453515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19420330366119742, |
| "step": 6440 |
| }, |
| { |
| "epoch": 0.16125, |
| "grad_norm": 45.5, |
| "grad_norm_var": 30.326822916666668, |
| "learning_rate": 0.0001, |
| "loss": 7.3881, |
| "loss/crossentropy": 1.949498599767685, |
| "loss/hidden": 3.337890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18119702748954297, |
| "step": 6450 |
| }, |
| { |
| "epoch": 0.1615, |
| "grad_norm": 30.625, |
| "grad_norm_var": 96.64837239583333, |
| "learning_rate": 0.0001, |
| "loss": 7.432, |
| "loss/crossentropy": 2.2534308552742006, |
| "loss/hidden": 3.334375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20010371711105107, |
| "step": 6460 |
| }, |
| { |
| "epoch": 0.16175, |
| "grad_norm": 34.0, |
| "grad_norm_var": 82.66015625, |
| "learning_rate": 0.0001, |
| "loss": 7.3555, |
| "loss/crossentropy": 2.114310759305954, |
| "loss/hidden": 3.387109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19822147954255342, |
| "step": 6470 |
| }, |
| { |
| "epoch": 0.162, |
| "grad_norm": 29.875, |
| "grad_norm_var": 8.4541015625, |
| "learning_rate": 0.0001, |
| "loss": 7.1893, |
| "loss/crossentropy": 2.062894639372826, |
| "loss/hidden": 3.348828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17771479729562997, |
| "step": 6480 |
| }, |
| { |
| "epoch": 0.16225, |
| "grad_norm": 30.375, |
| "grad_norm_var": 15.72265625, |
| "learning_rate": 0.0001, |
| "loss": 7.3665, |
| "loss/crossentropy": 2.0109994761645793, |
| "loss/hidden": 3.45625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19815738410688938, |
| "step": 6490 |
| }, |
| { |
| "epoch": 0.1625, |
| "grad_norm": 33.75, |
| "grad_norm_var": 162.74680989583334, |
| "learning_rate": 0.0001, |
| "loss": 7.4105, |
| "loss/crossentropy": 2.100720961391926, |
| "loss/hidden": 3.26484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18080853056162596, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.16275, |
| "grad_norm": 36.0, |
| "grad_norm_var": 10.530143229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4993, |
| "loss/crossentropy": 2.208073277771473, |
| "loss/hidden": 3.3328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18481182418763636, |
| "step": 6510 |
| }, |
| { |
| "epoch": 0.163, |
| "grad_norm": 37.0, |
| "grad_norm_var": 8.981705729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.5196, |
| "loss/crossentropy": 2.2666310742497444, |
| "loss/hidden": 3.40703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20527655016630889, |
| "step": 6520 |
| }, |
| { |
| "epoch": 0.16325, |
| "grad_norm": 29.625, |
| "grad_norm_var": 13.959830729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4239, |
| "loss/crossentropy": 2.2184250116348267, |
| "loss/hidden": 3.384765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19118925426155328, |
| "step": 6530 |
| }, |
| { |
| "epoch": 0.1635, |
| "grad_norm": 29.375, |
| "grad_norm_var": 8.820833333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.3298, |
| "loss/crossentropy": 2.119840921461582, |
| "loss/hidden": 3.39296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20046985391527414, |
| "step": 6540 |
| }, |
| { |
| "epoch": 0.16375, |
| "grad_norm": 31.375, |
| "grad_norm_var": 3.1910807291666665, |
| "learning_rate": 0.0001, |
| "loss": 7.4507, |
| "loss/crossentropy": 2.109931045770645, |
| "loss/hidden": 3.387890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18326662238687277, |
| "step": 6550 |
| }, |
| { |
| "epoch": 0.164, |
| "grad_norm": 35.25, |
| "grad_norm_var": 8.6625, |
| "learning_rate": 0.0001, |
| "loss": 7.43, |
| "loss/crossentropy": 2.090756069123745, |
| "loss/hidden": 3.348046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1922204466536641, |
| "step": 6560 |
| }, |
| { |
| "epoch": 0.16425, |
| "grad_norm": 30.625, |
| "grad_norm_var": 14.567643229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.1796, |
| "loss/crossentropy": 1.9266313910484314, |
| "loss/hidden": 3.465234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20681370329111814, |
| "step": 6570 |
| }, |
| { |
| "epoch": 0.1645, |
| "grad_norm": 37.5, |
| "grad_norm_var": 12.9447265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5097, |
| "loss/crossentropy": 1.875116103887558, |
| "loss/hidden": 3.49921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2045787101611495, |
| "step": 6580 |
| }, |
| { |
| "epoch": 0.16475, |
| "grad_norm": 29.875, |
| "grad_norm_var": 5.706184895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.329, |
| "loss/crossentropy": 2.116366655379534, |
| "loss/hidden": 3.4546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18577212654054165, |
| "step": 6590 |
| }, |
| { |
| "epoch": 0.165, |
| "grad_norm": 33.75, |
| "grad_norm_var": 2.3650390625, |
| "learning_rate": 0.0001, |
| "loss": 7.3765, |
| "loss/crossentropy": 2.0037689693272114, |
| "loss/hidden": 3.38125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1971780034713447, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.16525, |
| "grad_norm": 29.625, |
| "grad_norm_var": 4.601497395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4001, |
| "loss/crossentropy": 2.1523181863129137, |
| "loss/hidden": 3.405859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.192273567058146, |
| "step": 6610 |
| }, |
| { |
| "epoch": 0.1655, |
| "grad_norm": 29.875, |
| "grad_norm_var": 7.6525390625, |
| "learning_rate": 0.0001, |
| "loss": 7.3804, |
| "loss/crossentropy": 2.0919234342873096, |
| "loss/hidden": 3.39609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20757663380354643, |
| "step": 6620 |
| }, |
| { |
| "epoch": 0.16575, |
| "grad_norm": 31.875, |
| "grad_norm_var": 6.917708333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.3889, |
| "loss/crossentropy": 2.035097151994705, |
| "loss/hidden": 3.31484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19357213731855155, |
| "step": 6630 |
| }, |
| { |
| "epoch": 0.166, |
| "grad_norm": 28.125, |
| "grad_norm_var": 2.4607245906905574e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.5149, |
| "loss/crossentropy": 2.114539227634668, |
| "loss/hidden": 3.358984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19631449952721597, |
| "step": 6640 |
| }, |
| { |
| "epoch": 0.16625, |
| "grad_norm": 28.375, |
| "grad_norm_var": 2.4607245908931773e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.3157, |
| "loss/crossentropy": 2.0170676171779633, |
| "loss/hidden": 3.376171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1893145913258195, |
| "step": 6650 |
| }, |
| { |
| "epoch": 0.1665, |
| "grad_norm": 28.0, |
| "grad_norm_var": 32.25807291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.2947, |
| "loss/crossentropy": 1.9412188947200775, |
| "loss/hidden": 3.41953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18349691890180111, |
| "step": 6660 |
| }, |
| { |
| "epoch": 0.16675, |
| "grad_norm": 30.25, |
| "grad_norm_var": 48.2375, |
| "learning_rate": 0.0001, |
| "loss": 7.4086, |
| "loss/crossentropy": 2.1157006829977036, |
| "loss/hidden": 3.410546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1936411712318659, |
| "step": 6670 |
| }, |
| { |
| "epoch": 0.167, |
| "grad_norm": 29.75, |
| "grad_norm_var": 36.18333333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4096, |
| "loss/crossentropy": 1.9384170174598694, |
| "loss/hidden": 3.46015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1831628430634737, |
| "step": 6680 |
| }, |
| { |
| "epoch": 0.16725, |
| "grad_norm": 30.0, |
| "grad_norm_var": 62.6150390625, |
| "learning_rate": 0.0001, |
| "loss": 7.4604, |
| "loss/crossentropy": 2.152873657643795, |
| "loss/hidden": 3.421484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20049556214362382, |
| "step": 6690 |
| }, |
| { |
| "epoch": 0.1675, |
| "grad_norm": 29.75, |
| "grad_norm_var": 28.671875, |
| "learning_rate": 0.0001, |
| "loss": 7.5739, |
| "loss/crossentropy": 2.1935679107904433, |
| "loss/hidden": 3.359765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20188184324651956, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.16775, |
| "grad_norm": 28.25, |
| "grad_norm_var": 2.278580729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4709, |
| "loss/crossentropy": 2.062030902504921, |
| "loss/hidden": 3.4328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18874304387718438, |
| "step": 6710 |
| }, |
| { |
| "epoch": 0.168, |
| "grad_norm": 29.125, |
| "grad_norm_var": 3.3150390625, |
| "learning_rate": 0.0001, |
| "loss": 7.3655, |
| "loss/crossentropy": 1.999978879839182, |
| "loss/hidden": 3.3875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18526637642644345, |
| "step": 6720 |
| }, |
| { |
| "epoch": 0.16825, |
| "grad_norm": 35.25, |
| "grad_norm_var": 6.237239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4037, |
| "loss/crossentropy": 2.0561595499515533, |
| "loss/hidden": 3.41015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20046296287328005, |
| "step": 6730 |
| }, |
| { |
| "epoch": 0.1685, |
| "grad_norm": 28.75, |
| "grad_norm_var": 5.5619140625, |
| "learning_rate": 0.0001, |
| "loss": 7.368, |
| "loss/crossentropy": 2.0664093092083933, |
| "loss/hidden": 3.375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1885912848636508, |
| "step": 6740 |
| }, |
| { |
| "epoch": 0.16875, |
| "grad_norm": 30.875, |
| "grad_norm_var": 5.843489583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4899, |
| "loss/crossentropy": 2.1205774366855623, |
| "loss/hidden": 3.491796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21412673257291318, |
| "step": 6750 |
| }, |
| { |
| "epoch": 0.169, |
| "grad_norm": 32.0, |
| "grad_norm_var": 6.3775390625, |
| "learning_rate": 0.0001, |
| "loss": 7.3708, |
| "loss/crossentropy": 2.0314668610692026, |
| "loss/hidden": 3.33203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18550706487149, |
| "step": 6760 |
| }, |
| { |
| "epoch": 0.16925, |
| "grad_norm": 32.5, |
| "grad_norm_var": 7.378059895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4055, |
| "loss/crossentropy": 2.1428691864013674, |
| "loss/hidden": 3.506640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21024896781891583, |
| "step": 6770 |
| }, |
| { |
| "epoch": 0.1695, |
| "grad_norm": 31.375, |
| "grad_norm_var": 3.9791666666666665, |
| "learning_rate": 0.0001, |
| "loss": 7.4444, |
| "loss/crossentropy": 1.9500044576823712, |
| "loss/hidden": 3.434375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19028451843187213, |
| "step": 6780 |
| }, |
| { |
| "epoch": 0.16975, |
| "grad_norm": 29.75, |
| "grad_norm_var": 3.2853515625, |
| "learning_rate": 0.0001, |
| "loss": 7.4369, |
| "loss/crossentropy": 2.1563921123743057, |
| "loss/hidden": 3.3328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18290520180016756, |
| "step": 6790 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 38.0, |
| "grad_norm_var": 8.0947265625, |
| "learning_rate": 0.0001, |
| "loss": 7.3756, |
| "loss/crossentropy": 2.133736363053322, |
| "loss/hidden": 3.346875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1851572971791029, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.17025, |
| "grad_norm": 34.75, |
| "grad_norm_var": 14.757747395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.2181, |
| "loss/crossentropy": 2.054655596613884, |
| "loss/hidden": 3.349609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18349520340561867, |
| "step": 6810 |
| }, |
| { |
| "epoch": 0.1705, |
| "grad_norm": 30.375, |
| "grad_norm_var": 5.17890625, |
| "learning_rate": 0.0001, |
| "loss": 7.303, |
| "loss/crossentropy": 2.024763736128807, |
| "loss/hidden": 3.429296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19044207576662303, |
| "step": 6820 |
| }, |
| { |
| "epoch": 0.17075, |
| "grad_norm": 32.25, |
| "grad_norm_var": 3.7955729166666665, |
| "learning_rate": 0.0001, |
| "loss": 7.4884, |
| "loss/crossentropy": 1.9924081854522229, |
| "loss/hidden": 3.49609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20656490996479987, |
| "step": 6830 |
| }, |
| { |
| "epoch": 0.171, |
| "grad_norm": 30.875, |
| "grad_norm_var": 3.215559895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4845, |
| "loss/crossentropy": 2.1104799427092074, |
| "loss/hidden": 3.334375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18316805781796575, |
| "step": 6840 |
| }, |
| { |
| "epoch": 0.17125, |
| "grad_norm": 31.625, |
| "grad_norm_var": 25.610872395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.3771, |
| "loss/crossentropy": 2.02793128117919, |
| "loss/hidden": 3.342578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17605492258444427, |
| "step": 6850 |
| }, |
| { |
| "epoch": 0.1715, |
| "grad_norm": 29.375, |
| "grad_norm_var": 34.992122395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.5157, |
| "loss/crossentropy": 2.1305345237255096, |
| "loss/hidden": 3.436328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19113806802779437, |
| "step": 6860 |
| }, |
| { |
| "epoch": 0.17175, |
| "grad_norm": 28.375, |
| "grad_norm_var": 12.389322916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.2752, |
| "loss/crossentropy": 2.0788576349616052, |
| "loss/hidden": 3.388671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19617441901937127, |
| "step": 6870 |
| }, |
| { |
| "epoch": 0.172, |
| "grad_norm": 27.625, |
| "grad_norm_var": 17.758072916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4366, |
| "loss/crossentropy": 2.068412736058235, |
| "loss/hidden": 3.355078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18466003462672234, |
| "step": 6880 |
| }, |
| { |
| "epoch": 0.17225, |
| "grad_norm": 30.375, |
| "grad_norm_var": 19.836393229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5047, |
| "loss/crossentropy": 2.07881121635437, |
| "loss/hidden": 3.380078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19641269743442535, |
| "step": 6890 |
| }, |
| { |
| "epoch": 0.1725, |
| "grad_norm": 29.625, |
| "grad_norm_var": 14.70625, |
| "learning_rate": 0.0001, |
| "loss": 7.4086, |
| "loss/crossentropy": 1.84702168405056, |
| "loss/hidden": 3.469140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1932983512058854, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.17275, |
| "grad_norm": 31.875, |
| "grad_norm_var": 31.006705729166665, |
| "learning_rate": 0.0001, |
| "loss": 7.3715, |
| "loss/crossentropy": 2.099086304008961, |
| "loss/hidden": 3.406640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20250021573156118, |
| "step": 6910 |
| }, |
| { |
| "epoch": 0.173, |
| "grad_norm": 29.0, |
| "grad_norm_var": 26.07265625, |
| "learning_rate": 0.0001, |
| "loss": 7.3343, |
| "loss/crossentropy": 2.0957569405436516, |
| "loss/hidden": 3.346875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2010388659313321, |
| "step": 6920 |
| }, |
| { |
| "epoch": 0.17325, |
| "grad_norm": 33.0, |
| "grad_norm_var": 3.753059895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4937, |
| "loss/crossentropy": 2.071177572757006, |
| "loss/hidden": 3.445703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19133195597678423, |
| "step": 6930 |
| }, |
| { |
| "epoch": 0.1735, |
| "grad_norm": 31.125, |
| "grad_norm_var": 5.88515625, |
| "learning_rate": 0.0001, |
| "loss": 7.4757, |
| "loss/crossentropy": 2.0803056344389916, |
| "loss/hidden": 3.51875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20995833892375232, |
| "step": 6940 |
| }, |
| { |
| "epoch": 0.17375, |
| "grad_norm": 28.5, |
| "grad_norm_var": 7.328059895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.2293, |
| "loss/crossentropy": 1.9285863403230905, |
| "loss/hidden": 3.283203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1665677004493773, |
| "step": 6950 |
| }, |
| { |
| "epoch": 0.174, |
| "grad_norm": 27.375, |
| "grad_norm_var": 11.585872395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.313, |
| "loss/crossentropy": 2.0258478805422784, |
| "loss/hidden": 3.342578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1821833540685475, |
| "step": 6960 |
| }, |
| { |
| "epoch": 0.17425, |
| "grad_norm": 30.5, |
| "grad_norm_var": 10.760416666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5413, |
| "loss/crossentropy": 2.1308654129505156, |
| "loss/hidden": 3.43203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21524183861911297, |
| "step": 6970 |
| }, |
| { |
| "epoch": 0.1745, |
| "grad_norm": 29.625, |
| "grad_norm_var": 4.138541666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4692, |
| "loss/crossentropy": 2.1182237058877944, |
| "loss/hidden": 3.362890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18946228343993426, |
| "step": 6980 |
| }, |
| { |
| "epoch": 0.17475, |
| "grad_norm": 31.0, |
| "grad_norm_var": 5.499934895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4658, |
| "loss/crossentropy": 2.0863646306097507, |
| "loss/hidden": 3.38984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2054815970361233, |
| "step": 6990 |
| }, |
| { |
| "epoch": 0.175, |
| "grad_norm": 31.375, |
| "grad_norm_var": 1.7067057291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.3196, |
| "loss/crossentropy": 2.1002516582608224, |
| "loss/hidden": 3.40546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19273097421973945, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.17525, |
| "grad_norm": 31.25, |
| "grad_norm_var": 1.7999348958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.3643, |
| "loss/crossentropy": 2.015849883854389, |
| "loss/hidden": 3.383984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18935495987534523, |
| "step": 7010 |
| }, |
| { |
| "epoch": 0.1755, |
| "grad_norm": 28.875, |
| "grad_norm_var": 3.3645833333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.3973, |
| "loss/crossentropy": 2.072261115908623, |
| "loss/hidden": 3.482421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19815812185406684, |
| "step": 7020 |
| }, |
| { |
| "epoch": 0.17575, |
| "grad_norm": 29.375, |
| "grad_norm_var": 8.442708333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4193, |
| "loss/crossentropy": 2.1367180705070496, |
| "loss/hidden": 3.31328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1970324844121933, |
| "step": 7030 |
| }, |
| { |
| "epoch": 0.176, |
| "grad_norm": 30.375, |
| "grad_norm_var": 5.426822916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.518, |
| "loss/crossentropy": 2.210773140192032, |
| "loss/hidden": 3.40390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1955376474186778, |
| "step": 7040 |
| }, |
| { |
| "epoch": 0.17625, |
| "grad_norm": 30.125, |
| "grad_norm_var": 3.1791015625, |
| "learning_rate": 0.0001, |
| "loss": 7.3883, |
| "loss/crossentropy": 2.1343745410442354, |
| "loss/hidden": 3.342578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19158909022808074, |
| "step": 7050 |
| }, |
| { |
| "epoch": 0.1765, |
| "grad_norm": 29.25, |
| "grad_norm_var": 4.112434895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4451, |
| "loss/crossentropy": 1.9646480686962604, |
| "loss/hidden": 3.465234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19925388041883707, |
| "step": 7060 |
| }, |
| { |
| "epoch": 0.17675, |
| "grad_norm": 31.5, |
| "grad_norm_var": 3.3275390625, |
| "learning_rate": 0.0001, |
| "loss": 7.4874, |
| "loss/crossentropy": 2.1882633604109287, |
| "loss/hidden": 3.373828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19836742132902146, |
| "step": 7070 |
| }, |
| { |
| "epoch": 0.177, |
| "grad_norm": 34.75, |
| "grad_norm_var": 5.7291015625, |
| "learning_rate": 0.0001, |
| "loss": 7.4907, |
| "loss/crossentropy": 2.2362487465143204, |
| "loss/hidden": 3.33203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19153916742652655, |
| "step": 7080 |
| }, |
| { |
| "epoch": 0.17725, |
| "grad_norm": 29.5, |
| "grad_norm_var": 5.002083333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4459, |
| "loss/crossentropy": 2.164398466050625, |
| "loss/hidden": 3.43828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20325577780604362, |
| "step": 7090 |
| }, |
| { |
| "epoch": 0.1775, |
| "grad_norm": 32.25, |
| "grad_norm_var": 21.282747395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4933, |
| "loss/crossentropy": 2.1975876331329345, |
| "loss/hidden": 3.455078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19638306740671396, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.17775, |
| "grad_norm": 28.5, |
| "grad_norm_var": 32.18020833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.358, |
| "loss/crossentropy": 1.9753169894218445, |
| "loss/hidden": 3.416015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17774544414132834, |
| "step": 7110 |
| }, |
| { |
| "epoch": 0.178, |
| "grad_norm": 30.0, |
| "grad_norm_var": 11.808333333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4718, |
| "loss/crossentropy": 2.0591190218925477, |
| "loss/hidden": 3.4578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19963474106043577, |
| "step": 7120 |
| }, |
| { |
| "epoch": 0.17825, |
| "grad_norm": 29.25, |
| "grad_norm_var": 7.753580729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.3979, |
| "loss/crossentropy": 2.062809920310974, |
| "loss/hidden": 3.437109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19096513669937848, |
| "step": 7130 |
| }, |
| { |
| "epoch": 0.1785, |
| "grad_norm": 30.125, |
| "grad_norm_var": 6.6025390625, |
| "learning_rate": 0.0001, |
| "loss": 7.3036, |
| "loss/crossentropy": 2.0240534149110316, |
| "loss/hidden": 3.4453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18496394343674183, |
| "step": 7140 |
| }, |
| { |
| "epoch": 0.17875, |
| "grad_norm": 31.375, |
| "grad_norm_var": 2.4942057291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4778, |
| "loss/crossentropy": 2.124583348631859, |
| "loss/hidden": 3.44921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2003519142046571, |
| "step": 7150 |
| }, |
| { |
| "epoch": 0.179, |
| "grad_norm": 29.0, |
| "grad_norm_var": 13.8978515625, |
| "learning_rate": 0.0001, |
| "loss": 7.3866, |
| "loss/crossentropy": 2.035899819433689, |
| "loss/hidden": 3.41484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18300745636224747, |
| "step": 7160 |
| }, |
| { |
| "epoch": 0.17925, |
| "grad_norm": 28.125, |
| "grad_norm_var": 18.375455729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.3499, |
| "loss/crossentropy": 2.086082286387682, |
| "loss/hidden": 3.26875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17294995756819845, |
| "step": 7170 |
| }, |
| { |
| "epoch": 0.1795, |
| "grad_norm": 28.5, |
| "grad_norm_var": 22.834830729166665, |
| "learning_rate": 0.0001, |
| "loss": 7.4265, |
| "loss/crossentropy": 2.0105697728693483, |
| "loss/hidden": 3.393359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19119318593293427, |
| "step": 7180 |
| }, |
| { |
| "epoch": 0.17975, |
| "grad_norm": 33.0, |
| "grad_norm_var": 19.51640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4722, |
| "loss/crossentropy": 2.1506593719124796, |
| "loss/hidden": 3.387109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19607089888304471, |
| "step": 7190 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 31.5, |
| "grad_norm_var": 9.854622395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.3205, |
| "loss/crossentropy": 2.0767677523195744, |
| "loss/hidden": 3.436328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19086614530533552, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.18025, |
| "grad_norm": 27.625, |
| "grad_norm_var": 8.885416666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.2908, |
| "loss/crossentropy": 2.230179136991501, |
| "loss/hidden": 3.26953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18500201255083085, |
| "step": 7210 |
| }, |
| { |
| "epoch": 0.1805, |
| "grad_norm": 38.25, |
| "grad_norm_var": 8.60625, |
| "learning_rate": 0.0001, |
| "loss": 7.4664, |
| "loss/crossentropy": 2.192136238515377, |
| "loss/hidden": 3.380859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1965922711417079, |
| "step": 7220 |
| }, |
| { |
| "epoch": 0.18075, |
| "grad_norm": 33.0, |
| "grad_norm_var": 7.258333333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.3796, |
| "loss/crossentropy": 2.010923378914595, |
| "loss/hidden": 3.3609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19027266185730696, |
| "step": 7230 |
| }, |
| { |
| "epoch": 0.181, |
| "grad_norm": 39.75, |
| "grad_norm_var": 11.3353515625, |
| "learning_rate": 0.0001, |
| "loss": 7.5537, |
| "loss/crossentropy": 2.055556283891201, |
| "loss/hidden": 3.47421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19257053220644593, |
| "step": 7240 |
| }, |
| { |
| "epoch": 0.18125, |
| "grad_norm": 27.5, |
| "grad_norm_var": 14.424739583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4216, |
| "loss/crossentropy": 2.136477355659008, |
| "loss/hidden": 3.350390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20195687096565962, |
| "step": 7250 |
| }, |
| { |
| "epoch": 0.1815, |
| "grad_norm": 30.125, |
| "grad_norm_var": 9.70390625, |
| "learning_rate": 0.0001, |
| "loss": 7.4656, |
| "loss/crossentropy": 2.0134367659687995, |
| "loss/hidden": 3.523046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.22354123163968326, |
| "step": 7260 |
| }, |
| { |
| "epoch": 0.18175, |
| "grad_norm": 30.0, |
| "grad_norm_var": 6.324739583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.3959, |
| "loss/crossentropy": 2.1245115220546724, |
| "loss/hidden": 3.3609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19102834183722733, |
| "step": 7270 |
| }, |
| { |
| "epoch": 0.182, |
| "grad_norm": 29.125, |
| "grad_norm_var": 18.903580729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.3761, |
| "loss/crossentropy": 2.035867254436016, |
| "loss/hidden": 3.373046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19426564145833253, |
| "step": 7280 |
| }, |
| { |
| "epoch": 0.18225, |
| "grad_norm": 30.5, |
| "grad_norm_var": 20.406705729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.3884, |
| "loss/crossentropy": 1.9805133253335954, |
| "loss/hidden": 3.370703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19129701480269432, |
| "step": 7290 |
| }, |
| { |
| "epoch": 0.1825, |
| "grad_norm": 31.375, |
| "grad_norm_var": 5.842708333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.3833, |
| "loss/crossentropy": 2.0621849209070207, |
| "loss/hidden": 3.417578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19919742476195096, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.18275, |
| "grad_norm": 47.0, |
| "grad_norm_var": 2.675771470406222e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.2357, |
| "loss/crossentropy": 2.1282688602805138, |
| "loss/hidden": 3.298046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18128441767767073, |
| "step": 7310 |
| }, |
| { |
| "epoch": 0.183, |
| "grad_norm": 29.25, |
| "grad_norm_var": 28.158333333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.4531, |
| "loss/crossentropy": 2.1078659296035767, |
| "loss/hidden": 3.580078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2171280149370432, |
| "step": 7320 |
| }, |
| { |
| "epoch": 0.18325, |
| "grad_norm": 35.25, |
| "grad_norm_var": 25.365625, |
| "learning_rate": 0.0001, |
| "loss": 7.3538, |
| "loss/crossentropy": 2.200744313001633, |
| "loss/hidden": 3.37421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19498275145888327, |
| "step": 7330 |
| }, |
| { |
| "epoch": 0.1835, |
| "grad_norm": 28.375, |
| "grad_norm_var": 23.859830729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.3437, |
| "loss/crossentropy": 1.9968993581831456, |
| "loss/hidden": 3.3859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1875888810493052, |
| "step": 7340 |
| }, |
| { |
| "epoch": 0.18375, |
| "grad_norm": 28.0, |
| "grad_norm_var": 8.989518229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.3984, |
| "loss/crossentropy": 2.0748091831803324, |
| "loss/hidden": 3.412109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1947355069220066, |
| "step": 7350 |
| }, |
| { |
| "epoch": 0.184, |
| "grad_norm": 29.875, |
| "grad_norm_var": 7.728125, |
| "learning_rate": 0.0001, |
| "loss": 7.3008, |
| "loss/crossentropy": 2.067877373099327, |
| "loss/hidden": 3.313671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18336500320583582, |
| "step": 7360 |
| }, |
| { |
| "epoch": 0.18425, |
| "grad_norm": 29.25, |
| "grad_norm_var": 13.189583333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4547, |
| "loss/crossentropy": 2.152864509820938, |
| "loss/hidden": 3.451953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2102669222280383, |
| "step": 7370 |
| }, |
| { |
| "epoch": 0.1845, |
| "grad_norm": 28.5, |
| "grad_norm_var": 8.7556640625, |
| "learning_rate": 0.0001, |
| "loss": 7.286, |
| "loss/crossentropy": 1.9991149730980395, |
| "loss/hidden": 3.327734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18364950213581324, |
| "step": 7380 |
| }, |
| { |
| "epoch": 0.18475, |
| "grad_norm": 31.75, |
| "grad_norm_var": 7.770572916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.3546, |
| "loss/crossentropy": 2.0513292245566843, |
| "loss/hidden": 3.355078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1970413200557232, |
| "step": 7390 |
| }, |
| { |
| "epoch": 0.185, |
| "grad_norm": 30.125, |
| "grad_norm_var": 7.620572916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5116, |
| "loss/crossentropy": 2.102216296643019, |
| "loss/hidden": 3.51875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21091360161080958, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.18525, |
| "grad_norm": 32.75, |
| "grad_norm_var": 9.191666666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.3421, |
| "loss/crossentropy": 1.9926266744732857, |
| "loss/hidden": 3.455078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19832582902163268, |
| "step": 7410 |
| }, |
| { |
| "epoch": 0.1855, |
| "grad_norm": 29.5, |
| "grad_norm_var": 5.945833333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4563, |
| "loss/crossentropy": 2.141331580281258, |
| "loss/hidden": 3.408203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20020943265408278, |
| "step": 7420 |
| }, |
| { |
| "epoch": 0.18575, |
| "grad_norm": 31.0, |
| "grad_norm_var": 7.198372395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4227, |
| "loss/crossentropy": 1.9694693490862847, |
| "loss/hidden": 3.421484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18249646089971067, |
| "step": 7430 |
| }, |
| { |
| "epoch": 0.186, |
| "grad_norm": 29.375, |
| "grad_norm_var": 5.9697265625, |
| "learning_rate": 0.0001, |
| "loss": 7.4119, |
| "loss/crossentropy": 2.1407265037298204, |
| "loss/hidden": 3.327734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17720893137156962, |
| "step": 7440 |
| }, |
| { |
| "epoch": 0.18625, |
| "grad_norm": 29.875, |
| "grad_norm_var": 0.8072916666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.3441, |
| "loss/crossentropy": 2.124198019504547, |
| "loss/hidden": 3.424609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1946978410705924, |
| "step": 7450 |
| }, |
| { |
| "epoch": 0.1865, |
| "grad_norm": 32.75, |
| "grad_norm_var": 1.9634765625, |
| "learning_rate": 0.0001, |
| "loss": 7.4652, |
| "loss/crossentropy": 2.131994958221912, |
| "loss/hidden": 3.419921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19461573138833047, |
| "step": 7460 |
| }, |
| { |
| "epoch": 0.18675, |
| "grad_norm": 30.875, |
| "grad_norm_var": 2.7280598958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.3278, |
| "loss/crossentropy": 2.117748848348856, |
| "loss/hidden": 3.2578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18584235943853855, |
| "step": 7470 |
| }, |
| { |
| "epoch": 0.187, |
| "grad_norm": 31.625, |
| "grad_norm_var": 2.466080729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5817, |
| "loss/crossentropy": 2.1364282086491584, |
| "loss/hidden": 3.460546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21069204956293106, |
| "step": 7480 |
| }, |
| { |
| "epoch": 0.18725, |
| "grad_norm": 31.25, |
| "grad_norm_var": 3.012434895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.5197, |
| "loss/crossentropy": 2.0523312032222747, |
| "loss/hidden": 3.50078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21221144162118435, |
| "step": 7490 |
| }, |
| { |
| "epoch": 0.1875, |
| "grad_norm": 33.75, |
| "grad_norm_var": 4.995768229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4317, |
| "loss/crossentropy": 2.0852270901203154, |
| "loss/hidden": 3.359765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17908250950276852, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.18775, |
| "grad_norm": 30.875, |
| "grad_norm_var": 2.4518229166666665, |
| "learning_rate": 0.0001, |
| "loss": 7.5444, |
| "loss/crossentropy": 2.059383874386549, |
| "loss/hidden": 3.436328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19440155941992998, |
| "step": 7510 |
| }, |
| { |
| "epoch": 0.188, |
| "grad_norm": 27.625, |
| "grad_norm_var": 2.851822916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5199, |
| "loss/crossentropy": 2.1382876858115196, |
| "loss/hidden": 3.4078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19888029601424934, |
| "step": 7520 |
| }, |
| { |
| "epoch": 0.18825, |
| "grad_norm": 29.375, |
| "grad_norm_var": 3.073372395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.3012, |
| "loss/crossentropy": 2.0625696159899234, |
| "loss/hidden": 3.387109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18495072829537093, |
| "step": 7530 |
| }, |
| { |
| "epoch": 0.1885, |
| "grad_norm": 32.0, |
| "grad_norm_var": 2.2122395833333335, |
| "learning_rate": 0.0001, |
| "loss": 7.3643, |
| "loss/crossentropy": 2.124967637658119, |
| "loss/hidden": 3.3328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18375679664313793, |
| "step": 7540 |
| }, |
| { |
| "epoch": 0.18875, |
| "grad_norm": 29.125, |
| "grad_norm_var": 3.2681640625, |
| "learning_rate": 0.0001, |
| "loss": 7.351, |
| "loss/crossentropy": 2.0680116668343542, |
| "loss/hidden": 3.41875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18827605471014977, |
| "step": 7550 |
| }, |
| { |
| "epoch": 0.189, |
| "grad_norm": 31.75, |
| "grad_norm_var": 1.5337890625, |
| "learning_rate": 0.0001, |
| "loss": 7.4207, |
| "loss/crossentropy": 2.079096484184265, |
| "loss/hidden": 3.443359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20160295628011227, |
| "step": 7560 |
| }, |
| { |
| "epoch": 0.18925, |
| "grad_norm": 30.625, |
| "grad_norm_var": 18.2197265625, |
| "learning_rate": 0.0001, |
| "loss": 7.4789, |
| "loss/crossentropy": 2.058067685365677, |
| "loss/hidden": 3.46640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21216327100992202, |
| "step": 7570 |
| }, |
| { |
| "epoch": 0.1895, |
| "grad_norm": 34.25, |
| "grad_norm_var": 14.415625, |
| "learning_rate": 0.0001, |
| "loss": 7.4987, |
| "loss/crossentropy": 2.0142914205789566, |
| "loss/hidden": 3.580078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20794902741909027, |
| "step": 7580 |
| }, |
| { |
| "epoch": 0.18975, |
| "grad_norm": 30.5, |
| "grad_norm_var": 1.9809895833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4585, |
| "loss/crossentropy": 2.299562671780586, |
| "loss/hidden": 3.347265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19880922697484493, |
| "step": 7590 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 30.875, |
| "grad_norm_var": 15.101822916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.3204, |
| "loss/crossentropy": 2.1472302600741386, |
| "loss/hidden": 3.384375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18796155080199242, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.19025, |
| "grad_norm": 29.0, |
| "grad_norm_var": 2.5940733610451533e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.5335, |
| "loss/crossentropy": 2.1664531916379928, |
| "loss/hidden": 3.3984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18874028734862805, |
| "step": 7610 |
| }, |
| { |
| "epoch": 0.1905, |
| "grad_norm": 29.125, |
| "grad_norm_var": 0.8843098958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4846, |
| "loss/crossentropy": 2.0765088513493537, |
| "loss/hidden": 3.34375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18674521408975125, |
| "step": 7620 |
| }, |
| { |
| "epoch": 0.19075, |
| "grad_norm": 27.25, |
| "grad_norm_var": 4.381184895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.3401, |
| "loss/crossentropy": 1.7713539503514766, |
| "loss/hidden": 3.515234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1749590938910842, |
| "step": 7630 |
| }, |
| { |
| "epoch": 0.191, |
| "grad_norm": 33.5, |
| "grad_norm_var": 5.26640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4838, |
| "loss/crossentropy": 1.922049730271101, |
| "loss/hidden": 3.388671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18183694053441285, |
| "step": 7640 |
| }, |
| { |
| "epoch": 0.19125, |
| "grad_norm": 30.5, |
| "grad_norm_var": 4.030989583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.3887, |
| "loss/crossentropy": 2.094516658782959, |
| "loss/hidden": 3.452734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18757299687713386, |
| "step": 7650 |
| }, |
| { |
| "epoch": 0.1915, |
| "grad_norm": 32.0, |
| "grad_norm_var": 1.5035807291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4445, |
| "loss/crossentropy": 1.999229770898819, |
| "loss/hidden": 3.516015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19523975029587745, |
| "step": 7660 |
| }, |
| { |
| "epoch": 0.19175, |
| "grad_norm": 32.25, |
| "grad_norm_var": 3.0275390625, |
| "learning_rate": 0.0001, |
| "loss": 7.4626, |
| "loss/crossentropy": 2.0366951674222946, |
| "loss/hidden": 3.42265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18243371956050397, |
| "step": 7670 |
| }, |
| { |
| "epoch": 0.192, |
| "grad_norm": 32.75, |
| "grad_norm_var": 2.5931640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5274, |
| "loss/crossentropy": 2.0489632681012155, |
| "loss/hidden": 3.564453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21230401135981083, |
| "step": 7680 |
| }, |
| { |
| "epoch": 0.19225, |
| "grad_norm": 32.75, |
| "grad_norm_var": 3.738997395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4128, |
| "loss/crossentropy": 2.0826203912496566, |
| "loss/hidden": 3.373828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18644160348922015, |
| "step": 7690 |
| }, |
| { |
| "epoch": 0.1925, |
| "grad_norm": 29.875, |
| "grad_norm_var": 4.351041666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5195, |
| "loss/crossentropy": 2.1674430795013904, |
| "loss/hidden": 3.3640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19499621093273162, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.19275, |
| "grad_norm": 28.625, |
| "grad_norm_var": 4.939322916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4236, |
| "loss/crossentropy": 2.092283549904823, |
| "loss/hidden": 3.39453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19506504610180855, |
| "step": 7710 |
| }, |
| { |
| "epoch": 0.193, |
| "grad_norm": 33.0, |
| "grad_norm_var": 9.080208333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.3501, |
| "loss/crossentropy": 2.0526101261377336, |
| "loss/hidden": 3.51328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19770189765840768, |
| "step": 7720 |
| }, |
| { |
| "epoch": 0.19325, |
| "grad_norm": 29.0, |
| "grad_norm_var": 9.731184895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.4607, |
| "loss/crossentropy": 2.05912861302495, |
| "loss/hidden": 3.41328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19287334326654673, |
| "step": 7730 |
| }, |
| { |
| "epoch": 0.1935, |
| "grad_norm": 30.5, |
| "grad_norm_var": 1.8052083333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4243, |
| "loss/crossentropy": 1.8983285859227181, |
| "loss/hidden": 3.4203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18626301139593124, |
| "step": 7740 |
| }, |
| { |
| "epoch": 0.19375, |
| "grad_norm": 40.75, |
| "grad_norm_var": 9.1541015625, |
| "learning_rate": 0.0001, |
| "loss": 7.4332, |
| "loss/crossentropy": 2.099681233614683, |
| "loss/hidden": 3.40546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20556394904851913, |
| "step": 7750 |
| }, |
| { |
| "epoch": 0.194, |
| "grad_norm": 37.0, |
| "grad_norm_var": 12.584375, |
| "learning_rate": 0.0001, |
| "loss": 7.516, |
| "loss/crossentropy": 2.0436215907335282, |
| "loss/hidden": 3.317578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17505232142284513, |
| "step": 7760 |
| }, |
| { |
| "epoch": 0.19425, |
| "grad_norm": 30.125, |
| "grad_norm_var": 5.2681640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5486, |
| "loss/crossentropy": 2.0449838273227217, |
| "loss/hidden": 3.46875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19324074545875192, |
| "step": 7770 |
| }, |
| { |
| "epoch": 0.1945, |
| "grad_norm": 30.125, |
| "grad_norm_var": 1.4518229166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4052, |
| "loss/crossentropy": 2.1020638972520826, |
| "loss/hidden": 3.3234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1987349819391966, |
| "step": 7780 |
| }, |
| { |
| "epoch": 0.19475, |
| "grad_norm": 32.25, |
| "grad_norm_var": 1.525, |
| "learning_rate": 0.0001, |
| "loss": 7.3148, |
| "loss/crossentropy": 2.0913542471826077, |
| "loss/hidden": 3.38515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1900358498096466, |
| "step": 7790 |
| }, |
| { |
| "epoch": 0.195, |
| "grad_norm": 29.875, |
| "grad_norm_var": 1.2280598958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.43, |
| "loss/crossentropy": 1.9448820307850838, |
| "loss/hidden": 3.423828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18228193083778024, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.19525, |
| "grad_norm": 28.75, |
| "grad_norm_var": 2.3791015625, |
| "learning_rate": 0.0001, |
| "loss": 7.4772, |
| "loss/crossentropy": 2.0547610491514208, |
| "loss/hidden": 3.459765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19370344914495946, |
| "step": 7810 |
| }, |
| { |
| "epoch": 0.1955, |
| "grad_norm": 31.375, |
| "grad_norm_var": 6.208072916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4448, |
| "loss/crossentropy": 2.0798824220895766, |
| "loss/hidden": 3.431640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18972196318209172, |
| "step": 7820 |
| }, |
| { |
| "epoch": 0.19575, |
| "grad_norm": 29.5, |
| "grad_norm_var": 6.248893229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.5358, |
| "loss/crossentropy": 2.2324195951223373, |
| "loss/hidden": 3.4375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2047037549316883, |
| "step": 7830 |
| }, |
| { |
| "epoch": 0.196, |
| "grad_norm": 29.375, |
| "grad_norm_var": 4.453059895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.3114, |
| "loss/crossentropy": 2.1020479179918765, |
| "loss/hidden": 3.373046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19328910131007432, |
| "step": 7840 |
| }, |
| { |
| "epoch": 0.19625, |
| "grad_norm": 33.5, |
| "grad_norm_var": 590.0817057291666, |
| "learning_rate": 0.0001, |
| "loss": 7.4281, |
| "loss/crossentropy": 2.0953447744250298, |
| "loss/hidden": 3.374609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1854689259082079, |
| "step": 7850 |
| }, |
| { |
| "epoch": 0.1965, |
| "grad_norm": 33.0, |
| "grad_norm_var": 625.6192057291667, |
| "learning_rate": 0.0001, |
| "loss": 7.5283, |
| "loss/crossentropy": 2.061315707862377, |
| "loss/hidden": 3.429296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18506519980728625, |
| "step": 7860 |
| }, |
| { |
| "epoch": 0.19675, |
| "grad_norm": 32.0, |
| "grad_norm_var": 69.38430989583334, |
| "learning_rate": 0.0001, |
| "loss": 7.5314, |
| "loss/crossentropy": 2.1605025470256805, |
| "loss/hidden": 3.3890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20316088199615479, |
| "step": 7870 |
| }, |
| { |
| "epoch": 0.197, |
| "grad_norm": 31.375, |
| "grad_norm_var": 1.1067057291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4244, |
| "loss/crossentropy": 2.1977868393063544, |
| "loss/hidden": 3.430078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1992826245725155, |
| "step": 7880 |
| }, |
| { |
| "epoch": 0.19725, |
| "grad_norm": 29.25, |
| "grad_norm_var": 9.578059895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.502, |
| "loss/crossentropy": 2.0480964958667753, |
| "loss/hidden": 3.636328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21483200527727603, |
| "step": 7890 |
| }, |
| { |
| "epoch": 0.1975, |
| "grad_norm": 30.875, |
| "grad_norm_var": 3.6372395833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.548, |
| "loss/crossentropy": 2.157261362671852, |
| "loss/hidden": 3.484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2084518164396286, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.19775, |
| "grad_norm": 28.5, |
| "grad_norm_var": 2.095572916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.3864, |
| "loss/crossentropy": 2.1441849052906035, |
| "loss/hidden": 3.414453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20142039898782968, |
| "step": 7910 |
| }, |
| { |
| "epoch": 0.198, |
| "grad_norm": 37.5, |
| "grad_norm_var": 16.170247395833332, |
| "learning_rate": 0.0001, |
| "loss": 7.429, |
| "loss/crossentropy": 2.001983726769686, |
| "loss/hidden": 3.448828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19066998092457652, |
| "step": 7920 |
| }, |
| { |
| "epoch": 0.19825, |
| "grad_norm": 29.375, |
| "grad_norm_var": 15.561393229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4093, |
| "loss/crossentropy": 2.225774070620537, |
| "loss/hidden": 3.36796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1993710033595562, |
| "step": 7930 |
| }, |
| { |
| "epoch": 0.1985, |
| "grad_norm": 34.0, |
| "grad_norm_var": 25.343489583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4932, |
| "loss/crossentropy": 2.0083594918251038, |
| "loss/hidden": 3.3453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19412665143609048, |
| "step": 7940 |
| }, |
| { |
| "epoch": 0.19875, |
| "grad_norm": 31.625, |
| "grad_norm_var": 23.877083333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.441, |
| "loss/crossentropy": 2.08128562271595, |
| "loss/hidden": 3.412109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17889103144407273, |
| "step": 7950 |
| }, |
| { |
| "epoch": 0.199, |
| "grad_norm": 27.875, |
| "grad_norm_var": 1.5921223958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4251, |
| "loss/crossentropy": 2.1019147261977196, |
| "loss/hidden": 3.571484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2211546439677477, |
| "step": 7960 |
| }, |
| { |
| "epoch": 0.19925, |
| "grad_norm": 32.0, |
| "grad_norm_var": 3.3059895833333335, |
| "learning_rate": 0.0001, |
| "loss": 7.4213, |
| "loss/crossentropy": 2.14247687458992, |
| "loss/hidden": 3.420703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18870262056589127, |
| "step": 7970 |
| }, |
| { |
| "epoch": 0.1995, |
| "grad_norm": 31.5, |
| "grad_norm_var": 2.0947265625, |
| "learning_rate": 0.0001, |
| "loss": 7.5788, |
| "loss/crossentropy": 2.1896591186523438, |
| "loss/hidden": 3.479296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20883973222225904, |
| "step": 7980 |
| }, |
| { |
| "epoch": 0.19975, |
| "grad_norm": 28.25, |
| "grad_norm_var": 2.982291666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5217, |
| "loss/crossentropy": 2.1884095311164855, |
| "loss/hidden": 3.44296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2022842913866043, |
| "step": 7990 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 28.5, |
| "grad_norm_var": 2.379622395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4975, |
| "loss/crossentropy": 2.225523295998573, |
| "loss/hidden": 3.4203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2032675376161933, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.20025, |
| "grad_norm": 30.625, |
| "grad_norm_var": 4.181705729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.3874, |
| "loss/crossentropy": 1.9566738605499268, |
| "loss/hidden": 3.576953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19702311754226684, |
| "step": 8010 |
| }, |
| { |
| "epoch": 0.2005, |
| "grad_norm": 32.75, |
| "grad_norm_var": 6.3509765625, |
| "learning_rate": 0.0001, |
| "loss": 7.4528, |
| "loss/crossentropy": 2.1517204724252226, |
| "loss/hidden": 3.466015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2005010774359107, |
| "step": 8020 |
| }, |
| { |
| "epoch": 0.20075, |
| "grad_norm": 30.875, |
| "grad_norm_var": 5.44140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5306, |
| "loss/crossentropy": 2.0300184957683087, |
| "loss/hidden": 3.375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18887464031577111, |
| "step": 8030 |
| }, |
| { |
| "epoch": 0.201, |
| "grad_norm": 31.75, |
| "grad_norm_var": 3.1302083333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.4025, |
| "loss/crossentropy": 2.0889772072434427, |
| "loss/hidden": 3.27109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17016669576987625, |
| "step": 8040 |
| }, |
| { |
| "epoch": 0.20125, |
| "grad_norm": 27.125, |
| "grad_norm_var": 94.78125, |
| "learning_rate": 0.0001, |
| "loss": 7.4379, |
| "loss/crossentropy": 2.158484524488449, |
| "loss/hidden": 3.410546875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19672231934964657, |
| "step": 8050 |
| }, |
| { |
| "epoch": 0.2015, |
| "grad_norm": 38.0, |
| "grad_norm_var": 13.220572916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4314, |
| "loss/crossentropy": 1.9623262777924537, |
| "loss/hidden": 3.339453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1833876773715019, |
| "step": 8060 |
| }, |
| { |
| "epoch": 0.20175, |
| "grad_norm": 29.875, |
| "grad_norm_var": 7.858268229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4662, |
| "loss/crossentropy": 2.2177498638629913, |
| "loss/hidden": 3.383203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1954023003578186, |
| "step": 8070 |
| }, |
| { |
| "epoch": 0.202, |
| "grad_norm": 28.875, |
| "grad_norm_var": 7.627083333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4792, |
| "loss/crossentropy": 2.1019868202507497, |
| "loss/hidden": 3.44375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1995641984976828, |
| "step": 8080 |
| }, |
| { |
| "epoch": 0.20225, |
| "grad_norm": 44.75, |
| "grad_norm_var": 20.4306640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5664, |
| "loss/crossentropy": 2.299755599349737, |
| "loss/hidden": 3.3015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19075682908296585, |
| "step": 8090 |
| }, |
| { |
| "epoch": 0.2025, |
| "grad_norm": 37.75, |
| "grad_norm_var": 2.5671221292944763e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4526, |
| "loss/crossentropy": 2.131952489167452, |
| "loss/hidden": 3.475390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1955398641526699, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.20275, |
| "grad_norm": 31.25, |
| "grad_norm_var": 20.342643229166665, |
| "learning_rate": 0.0001, |
| "loss": 7.4687, |
| "loss/crossentropy": 1.9825796701014042, |
| "loss/hidden": 3.475, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20226136669516565, |
| "step": 8110 |
| }, |
| { |
| "epoch": 0.203, |
| "grad_norm": 27.125, |
| "grad_norm_var": 15.192643229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.1604, |
| "loss/crossentropy": 2.0296560734510423, |
| "loss/hidden": 3.387109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1786259189248085, |
| "step": 8120 |
| }, |
| { |
| "epoch": 0.20325, |
| "grad_norm": 31.0, |
| "grad_norm_var": 7.1775390625, |
| "learning_rate": 0.0001, |
| "loss": 7.2815, |
| "loss/crossentropy": 2.104996609687805, |
| "loss/hidden": 3.405859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19307580199092628, |
| "step": 8130 |
| }, |
| { |
| "epoch": 0.2035, |
| "grad_norm": 34.75, |
| "grad_norm_var": 6.820572916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.3774, |
| "loss/crossentropy": 2.1900447353720667, |
| "loss/hidden": 3.401953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20103423558175565, |
| "step": 8140 |
| }, |
| { |
| "epoch": 0.20375, |
| "grad_norm": 31.0, |
| "grad_norm_var": 5.48125, |
| "learning_rate": 0.0001, |
| "loss": 7.3628, |
| "loss/crossentropy": 2.0671978294849396, |
| "loss/hidden": 3.338671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18412660714238882, |
| "step": 8150 |
| }, |
| { |
| "epoch": 0.204, |
| "grad_norm": 41.25, |
| "grad_norm_var": 14.245572916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.5547, |
| "loss/crossentropy": 2.05537860840559, |
| "loss/hidden": 3.48515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1820721985772252, |
| "step": 8160 |
| }, |
| { |
| "epoch": 0.20425, |
| "grad_norm": 29.125, |
| "grad_norm_var": 13.8625, |
| "learning_rate": 0.0001, |
| "loss": 7.345, |
| "loss/crossentropy": 1.9336151838302613, |
| "loss/hidden": 3.564453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20152895338833332, |
| "step": 8170 |
| }, |
| { |
| "epoch": 0.2045, |
| "grad_norm": 28.125, |
| "grad_norm_var": 7.096875, |
| "learning_rate": 0.0001, |
| "loss": 7.5005, |
| "loss/crossentropy": 2.055295965075493, |
| "loss/hidden": 3.35234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1865659100934863, |
| "step": 8180 |
| }, |
| { |
| "epoch": 0.20475, |
| "grad_norm": 28.25, |
| "grad_norm_var": 8.805989583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5324, |
| "loss/crossentropy": 1.9976115971803665, |
| "loss/hidden": 3.508203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2114253517240286, |
| "step": 8190 |
| }, |
| { |
| "epoch": 0.205, |
| "grad_norm": 29.875, |
| "grad_norm_var": 12.435416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4232, |
| "loss/crossentropy": 2.1122905567288397, |
| "loss/hidden": 3.400390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21649520397186278, |
| "step": 8200 |
| }, |
| { |
| "epoch": 0.20525, |
| "grad_norm": 28.125, |
| "grad_norm_var": 10.055208333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4875, |
| "loss/crossentropy": 2.1132961876690386, |
| "loss/hidden": 3.47734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2026100393384695, |
| "step": 8210 |
| }, |
| { |
| "epoch": 0.2055, |
| "grad_norm": 29.125, |
| "grad_norm_var": 6.793489583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.3834, |
| "loss/crossentropy": 2.191230720281601, |
| "loss/hidden": 3.402734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19885572660714387, |
| "step": 8220 |
| }, |
| { |
| "epoch": 0.20575, |
| "grad_norm": 30.75, |
| "grad_norm_var": 4.630208333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4544, |
| "loss/crossentropy": 2.080083931982517, |
| "loss/hidden": 3.474609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2282587742432952, |
| "step": 8230 |
| }, |
| { |
| "epoch": 0.206, |
| "grad_norm": 31.375, |
| "grad_norm_var": 11.458268229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.449, |
| "loss/crossentropy": 2.1384637162089346, |
| "loss/hidden": 3.346484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1842938730493188, |
| "step": 8240 |
| }, |
| { |
| "epoch": 0.20625, |
| "grad_norm": 31.875, |
| "grad_norm_var": 1.7087890625, |
| "learning_rate": 0.0001, |
| "loss": 7.3946, |
| "loss/crossentropy": 2.1355771869421005, |
| "loss/hidden": 3.264453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18052869867533444, |
| "step": 8250 |
| }, |
| { |
| "epoch": 0.2065, |
| "grad_norm": 31.75, |
| "grad_norm_var": 13.0447265625, |
| "learning_rate": 0.0001, |
| "loss": 7.6316, |
| "loss/crossentropy": 2.0203320410102608, |
| "loss/hidden": 3.445703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.192235934920609, |
| "step": 8260 |
| }, |
| { |
| "epoch": 0.20675, |
| "grad_norm": 29.5, |
| "grad_norm_var": 12.270247395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.4572, |
| "loss/crossentropy": 1.9869592547416688, |
| "loss/hidden": 3.44140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.202651490829885, |
| "step": 8270 |
| }, |
| { |
| "epoch": 0.207, |
| "grad_norm": 30.125, |
| "grad_norm_var": 1.4822916666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5227, |
| "loss/crossentropy": 2.051181730628014, |
| "loss/hidden": 3.43984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20994498692452906, |
| "step": 8280 |
| }, |
| { |
| "epoch": 0.20725, |
| "grad_norm": 43.75, |
| "grad_norm_var": 12.776822916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.3664, |
| "loss/crossentropy": 1.9618318520486355, |
| "loss/hidden": 3.558984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19053993374109268, |
| "step": 8290 |
| }, |
| { |
| "epoch": 0.2075, |
| "grad_norm": 30.5, |
| "grad_norm_var": 14.92890625, |
| "learning_rate": 0.0001, |
| "loss": 7.318, |
| "loss/crossentropy": 2.0114831268787383, |
| "loss/hidden": 3.459375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20329152811318635, |
| "step": 8300 |
| }, |
| { |
| "epoch": 0.20775, |
| "grad_norm": 34.0, |
| "grad_norm_var": 2.8869140625, |
| "learning_rate": 0.0001, |
| "loss": 7.5489, |
| "loss/crossentropy": 2.1187786638736723, |
| "loss/hidden": 3.512890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20839224103838205, |
| "step": 8310 |
| }, |
| { |
| "epoch": 0.208, |
| "grad_norm": 27.875, |
| "grad_norm_var": 2.565559895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.3443, |
| "loss/crossentropy": 2.1263110756874086, |
| "loss/hidden": 3.32109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18627664018422366, |
| "step": 8320 |
| }, |
| { |
| "epoch": 0.20825, |
| "grad_norm": 30.375, |
| "grad_norm_var": 2.755989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.3857, |
| "loss/crossentropy": 1.944861602783203, |
| "loss/hidden": 3.468359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18570939563214778, |
| "step": 8330 |
| }, |
| { |
| "epoch": 0.2085, |
| "grad_norm": 31.125, |
| "grad_norm_var": 2.340625, |
| "learning_rate": 0.0001, |
| "loss": 7.5344, |
| "loss/crossentropy": 2.1811724051833155, |
| "loss/hidden": 3.41875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19298948515206577, |
| "step": 8340 |
| }, |
| { |
| "epoch": 0.20875, |
| "grad_norm": 31.5, |
| "grad_norm_var": 1.56015625, |
| "learning_rate": 0.0001, |
| "loss": 7.3826, |
| "loss/crossentropy": 2.152976579964161, |
| "loss/hidden": 3.305859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.190250195749104, |
| "step": 8350 |
| }, |
| { |
| "epoch": 0.209, |
| "grad_norm": 28.875, |
| "grad_norm_var": 1.3416015625, |
| "learning_rate": 0.0001, |
| "loss": 7.323, |
| "loss/crossentropy": 2.2053099036216737, |
| "loss/hidden": 3.344140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1929330924525857, |
| "step": 8360 |
| }, |
| { |
| "epoch": 0.20925, |
| "grad_norm": 29.875, |
| "grad_norm_var": 1.65625, |
| "learning_rate": 0.0001, |
| "loss": 7.4376, |
| "loss/crossentropy": 2.011850906908512, |
| "loss/hidden": 3.442578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20794765576720237, |
| "step": 8370 |
| }, |
| { |
| "epoch": 0.2095, |
| "grad_norm": 29.25, |
| "grad_norm_var": 2.381705729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4518, |
| "loss/crossentropy": 2.284806078672409, |
| "loss/hidden": 3.412109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2203810729086399, |
| "step": 8380 |
| }, |
| { |
| "epoch": 0.20975, |
| "grad_norm": 43.25, |
| "grad_norm_var": 17.695247395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.418, |
| "loss/crossentropy": 2.1161764934659004, |
| "loss/hidden": 3.475390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1863908626139164, |
| "step": 8390 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 29.625, |
| "grad_norm_var": 19.8697265625, |
| "learning_rate": 0.0001, |
| "loss": 7.4564, |
| "loss/crossentropy": 2.0293585821986198, |
| "loss/hidden": 3.373046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1884205201640725, |
| "step": 8400 |
| }, |
| { |
| "epoch": 0.21025, |
| "grad_norm": 27.25, |
| "grad_norm_var": 11.422330729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4317, |
| "loss/crossentropy": 2.2351802065968513, |
| "loss/hidden": 3.376953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20424611177295446, |
| "step": 8410 |
| }, |
| { |
| "epoch": 0.2105, |
| "grad_norm": 30.125, |
| "grad_norm_var": 10.734830729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.3989, |
| "loss/crossentropy": 2.1349810734391212, |
| "loss/hidden": 3.3828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18979013338685036, |
| "step": 8420 |
| }, |
| { |
| "epoch": 0.21075, |
| "grad_norm": 30.875, |
| "grad_norm_var": 2.387955729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.3957, |
| "loss/crossentropy": 2.080636392533779, |
| "loss/hidden": 3.371875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18893024744465947, |
| "step": 8430 |
| }, |
| { |
| "epoch": 0.211, |
| "grad_norm": 28.75, |
| "grad_norm_var": 20.089518229166668, |
| "learning_rate": 0.0001, |
| "loss": 7.4847, |
| "loss/crossentropy": 1.9267802774906158, |
| "loss/hidden": 3.31484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18156335428357123, |
| "step": 8440 |
| }, |
| { |
| "epoch": 0.21125, |
| "grad_norm": 29.125, |
| "grad_norm_var": 2.2650390625, |
| "learning_rate": 0.0001, |
| "loss": 7.3839, |
| "loss/crossentropy": 2.14048397988081, |
| "loss/hidden": 3.419921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1889717074111104, |
| "step": 8450 |
| }, |
| { |
| "epoch": 0.2115, |
| "grad_norm": 28.375, |
| "grad_norm_var": 3.283124099188418e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4661, |
| "loss/crossentropy": 2.1406930878758432, |
| "loss/hidden": 3.699609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20904620084911585, |
| "step": 8460 |
| }, |
| { |
| "epoch": 0.21175, |
| "grad_norm": 30.75, |
| "grad_norm_var": 23.509830729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.2867, |
| "loss/crossentropy": 2.1153231114149094, |
| "loss/hidden": 3.342578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18998019583523273, |
| "step": 8470 |
| }, |
| { |
| "epoch": 0.212, |
| "grad_norm": 31.0, |
| "grad_norm_var": 2.3139973958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.3997, |
| "loss/crossentropy": 2.169650764763355, |
| "loss/hidden": 3.276953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17556187361478806, |
| "step": 8480 |
| }, |
| { |
| "epoch": 0.21225, |
| "grad_norm": 29.625, |
| "grad_norm_var": 2.312239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4486, |
| "loss/crossentropy": 2.128412726521492, |
| "loss/hidden": 3.37734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1975807584822178, |
| "step": 8490 |
| }, |
| { |
| "epoch": 0.2125, |
| "grad_norm": 30.75, |
| "grad_norm_var": 2.903125, |
| "learning_rate": 0.0001, |
| "loss": 7.4526, |
| "loss/crossentropy": 2.029564914107323, |
| "loss/hidden": 3.408984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19610330546274782, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.21275, |
| "grad_norm": 31.375, |
| "grad_norm_var": 4.004622395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.2845, |
| "loss/crossentropy": 1.9631854377686977, |
| "loss/hidden": 3.409765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1855375848710537, |
| "step": 8510 |
| }, |
| { |
| "epoch": 0.213, |
| "grad_norm": 30.0, |
| "grad_norm_var": 3.476822916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4124, |
| "loss/crossentropy": 2.1494273841381073, |
| "loss/hidden": 3.380078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19540442936122418, |
| "step": 8520 |
| }, |
| { |
| "epoch": 0.21325, |
| "grad_norm": 31.625, |
| "grad_norm_var": 0.9452473958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4659, |
| "loss/crossentropy": 2.0508621491491796, |
| "loss/hidden": 3.3078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18023168351501226, |
| "step": 8530 |
| }, |
| { |
| "epoch": 0.2135, |
| "grad_norm": 31.25, |
| "grad_norm_var": 2.5809895833333334, |
| "learning_rate": 0.0001, |
| "loss": 7.4522, |
| "loss/crossentropy": 2.090715576708317, |
| "loss/hidden": 3.3875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18382459450513125, |
| "step": 8540 |
| }, |
| { |
| "epoch": 0.21375, |
| "grad_norm": 28.75, |
| "grad_norm_var": 2.986168000807314e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4917, |
| "loss/crossentropy": 2.177123633027077, |
| "loss/hidden": 3.359765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.197673611715436, |
| "step": 8550 |
| }, |
| { |
| "epoch": 0.214, |
| "grad_norm": 27.75, |
| "grad_norm_var": 4.035872395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.3008, |
| "loss/crossentropy": 2.028589369356632, |
| "loss/hidden": 3.383984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.196718043461442, |
| "step": 8560 |
| }, |
| { |
| "epoch": 0.21425, |
| "grad_norm": 30.5, |
| "grad_norm_var": 4.758072916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.3829, |
| "loss/crossentropy": 2.02908306196332, |
| "loss/hidden": 3.3703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1757409404963255, |
| "step": 8570 |
| }, |
| { |
| "epoch": 0.2145, |
| "grad_norm": 30.875, |
| "grad_norm_var": 1.9754557291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4107, |
| "loss/crossentropy": 2.0411842301487924, |
| "loss/hidden": 3.580078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2309743857011199, |
| "step": 8580 |
| }, |
| { |
| "epoch": 0.21475, |
| "grad_norm": 30.0, |
| "grad_norm_var": 25.7244140625, |
| "learning_rate": 0.0001, |
| "loss": 7.4038, |
| "loss/crossentropy": 2.1026074662804604, |
| "loss/hidden": 3.490234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.202506691403687, |
| "step": 8590 |
| }, |
| { |
| "epoch": 0.215, |
| "grad_norm": 28.625, |
| "grad_norm_var": 3.386458333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.3559, |
| "loss/crossentropy": 2.1690615713596344, |
| "loss/hidden": 3.388671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19095612335950135, |
| "step": 8600 |
| }, |
| { |
| "epoch": 0.21525, |
| "grad_norm": 29.0, |
| "grad_norm_var": 23.880989583333335, |
| "learning_rate": 0.0001, |
| "loss": 7.4164, |
| "loss/crossentropy": 2.099227898567915, |
| "loss/hidden": 3.376953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19162636022083462, |
| "step": 8610 |
| }, |
| { |
| "epoch": 0.2155, |
| "grad_norm": 34.25, |
| "grad_norm_var": 23.880208333333332, |
| "learning_rate": 0.0001, |
| "loss": 7.3725, |
| "loss/crossentropy": 1.9689884655177594, |
| "loss/hidden": 3.52265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1830376190133393, |
| "step": 8620 |
| }, |
| { |
| "epoch": 0.21575, |
| "grad_norm": 29.875, |
| "grad_norm_var": 2.77890625, |
| "learning_rate": 0.0001, |
| "loss": 7.4511, |
| "loss/crossentropy": 2.0263702854514123, |
| "loss/hidden": 3.351171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18029460608959197, |
| "step": 8630 |
| }, |
| { |
| "epoch": 0.216, |
| "grad_norm": 32.0, |
| "grad_norm_var": 23.001822916666665, |
| "learning_rate": 0.0001, |
| "loss": 7.3863, |
| "loss/crossentropy": 1.9046964697539805, |
| "loss/hidden": 3.44140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1935629203915596, |
| "step": 8640 |
| }, |
| { |
| "epoch": 0.21625, |
| "grad_norm": 33.5, |
| "grad_norm_var": 25.009375, |
| "learning_rate": 0.0001, |
| "loss": 7.3083, |
| "loss/crossentropy": 2.129426471889019, |
| "loss/hidden": 3.448046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20037918202579022, |
| "step": 8650 |
| }, |
| { |
| "epoch": 0.2165, |
| "grad_norm": 28.375, |
| "grad_norm_var": 12.242643229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4603, |
| "loss/crossentropy": 2.2266604125499727, |
| "loss/hidden": 3.33203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18952292017638683, |
| "step": 8660 |
| }, |
| { |
| "epoch": 0.21675, |
| "grad_norm": 34.75, |
| "grad_norm_var": 19.517708333333335, |
| "learning_rate": 0.0001, |
| "loss": 7.4031, |
| "loss/crossentropy": 1.9756697475910188, |
| "loss/hidden": 3.529296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20053059812635182, |
| "step": 8670 |
| }, |
| { |
| "epoch": 0.217, |
| "grad_norm": 29.25, |
| "grad_norm_var": 8.315625, |
| "learning_rate": 0.0001, |
| "loss": 7.4013, |
| "loss/crossentropy": 2.13729098290205, |
| "loss/hidden": 3.3859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19440573658794164, |
| "step": 8680 |
| }, |
| { |
| "epoch": 0.21725, |
| "grad_norm": 31.5, |
| "grad_norm_var": 1.9936848958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.2982, |
| "loss/crossentropy": 2.0935462579131126, |
| "loss/hidden": 3.453125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.200434254668653, |
| "step": 8690 |
| }, |
| { |
| "epoch": 0.2175, |
| "grad_norm": 31.625, |
| "grad_norm_var": 26.66640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4596, |
| "loss/crossentropy": 2.0081637501716614, |
| "loss/hidden": 3.42734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.196201959438622, |
| "step": 8700 |
| }, |
| { |
| "epoch": 0.21775, |
| "grad_norm": 28.125, |
| "grad_norm_var": 1.869627142435242e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4726, |
| "loss/crossentropy": 2.057238683104515, |
| "loss/hidden": 3.372265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18527965154498816, |
| "step": 8710 |
| }, |
| { |
| "epoch": 0.218, |
| "grad_norm": 30.25, |
| "grad_norm_var": 4.806705729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.424, |
| "loss/crossentropy": 2.225848586857319, |
| "loss/hidden": 3.4, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20839223694056272, |
| "step": 8720 |
| }, |
| { |
| "epoch": 0.21825, |
| "grad_norm": 29.5, |
| "grad_norm_var": 3.2572265625, |
| "learning_rate": 0.0001, |
| "loss": 7.3713, |
| "loss/crossentropy": 1.9737806752324105, |
| "loss/hidden": 3.413671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18166052605956792, |
| "step": 8730 |
| }, |
| { |
| "epoch": 0.2185, |
| "grad_norm": 31.25, |
| "grad_norm_var": 3.49765625, |
| "learning_rate": 0.0001, |
| "loss": 7.3192, |
| "loss/crossentropy": 2.1625877559185027, |
| "loss/hidden": 3.3765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.196309875510633, |
| "step": 8740 |
| }, |
| { |
| "epoch": 0.21875, |
| "grad_norm": 31.625, |
| "grad_norm_var": 15.733333333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.3994, |
| "loss/crossentropy": 2.1306996777653695, |
| "loss/hidden": 3.3328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17798179090023042, |
| "step": 8750 |
| }, |
| { |
| "epoch": 0.219, |
| "grad_norm": 31.25, |
| "grad_norm_var": 5.465625, |
| "learning_rate": 0.0001, |
| "loss": 7.4225, |
| "loss/crossentropy": 2.168031161278486, |
| "loss/hidden": 3.423046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19842574130743743, |
| "step": 8760 |
| }, |
| { |
| "epoch": 0.21925, |
| "grad_norm": 29.5, |
| "grad_norm_var": 3.457747395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4515, |
| "loss/crossentropy": 2.081401216983795, |
| "loss/hidden": 3.3875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1922367751598358, |
| "step": 8770 |
| }, |
| { |
| "epoch": 0.2195, |
| "grad_norm": 28.625, |
| "grad_norm_var": 2.9389973958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4641, |
| "loss/crossentropy": 1.9896476596593857, |
| "loss/hidden": 3.367578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18900877684354783, |
| "step": 8780 |
| }, |
| { |
| "epoch": 0.21975, |
| "grad_norm": 29.375, |
| "grad_norm_var": 3.5931640625, |
| "learning_rate": 0.0001, |
| "loss": 7.5034, |
| "loss/crossentropy": 2.131504286080599, |
| "loss/hidden": 3.4234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1981994620524347, |
| "step": 8790 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 29.75, |
| "grad_norm_var": 17.612955729166668, |
| "learning_rate": 0.0001, |
| "loss": 7.3602, |
| "loss/crossentropy": 2.367118790745735, |
| "loss/hidden": 3.357421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20705808699131012, |
| "step": 8800 |
| }, |
| { |
| "epoch": 0.22025, |
| "grad_norm": 30.625, |
| "grad_norm_var": 3.588997395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.3512, |
| "loss/crossentropy": 1.915165586769581, |
| "loss/hidden": 3.41328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18030493911355733, |
| "step": 8810 |
| }, |
| { |
| "epoch": 0.2205, |
| "grad_norm": 31.625, |
| "grad_norm_var": 2.732291666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.3832, |
| "loss/crossentropy": 2.1191729307174683, |
| "loss/hidden": 3.37734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20246538575738668, |
| "step": 8820 |
| }, |
| { |
| "epoch": 0.22075, |
| "grad_norm": 32.0, |
| "grad_norm_var": 2.034309895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4184, |
| "loss/crossentropy": 2.280582541972399, |
| "loss/hidden": 3.355078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1913912059739232, |
| "step": 8830 |
| }, |
| { |
| "epoch": 0.221, |
| "grad_norm": 28.75, |
| "grad_norm_var": 1.4629557291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.5177, |
| "loss/crossentropy": 2.07154730707407, |
| "loss/hidden": 3.334765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18417379464954137, |
| "step": 8840 |
| }, |
| { |
| "epoch": 0.22125, |
| "grad_norm": 29.25, |
| "grad_norm_var": 2.0306640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4098, |
| "loss/crossentropy": 2.0918928742408753, |
| "loss/hidden": 3.478515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21339783817529678, |
| "step": 8850 |
| }, |
| { |
| "epoch": 0.2215, |
| "grad_norm": 34.75, |
| "grad_norm_var": 2.4098307291666665, |
| "learning_rate": 0.0001, |
| "loss": 7.3907, |
| "loss/crossentropy": 2.0262165658175944, |
| "loss/hidden": 3.4796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1963033676147461, |
| "step": 8860 |
| }, |
| { |
| "epoch": 0.22175, |
| "grad_norm": 29.625, |
| "grad_norm_var": 3.1041015625, |
| "learning_rate": 0.0001, |
| "loss": 7.3643, |
| "loss/crossentropy": 2.119324280321598, |
| "loss/hidden": 3.421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19243048634380103, |
| "step": 8870 |
| }, |
| { |
| "epoch": 0.222, |
| "grad_norm": 29.25, |
| "grad_norm_var": 3.3593098958333334, |
| "learning_rate": 0.0001, |
| "loss": 7.335, |
| "loss/crossentropy": 2.1042064100503923, |
| "loss/hidden": 3.43671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2055317424237728, |
| "step": 8880 |
| }, |
| { |
| "epoch": 0.22225, |
| "grad_norm": 29.0, |
| "grad_norm_var": 4.35390625, |
| "learning_rate": 0.0001, |
| "loss": 7.3974, |
| "loss/crossentropy": 2.110988216102123, |
| "loss/hidden": 3.344140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18536690715700388, |
| "step": 8890 |
| }, |
| { |
| "epoch": 0.2225, |
| "grad_norm": 30.0, |
| "grad_norm_var": 6.014322916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4155, |
| "loss/crossentropy": 2.033397987484932, |
| "loss/hidden": 3.367578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19111265633255242, |
| "step": 8900 |
| }, |
| { |
| "epoch": 0.22275, |
| "grad_norm": 32.25, |
| "grad_norm_var": 4.747916666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4631, |
| "loss/crossentropy": 2.090746468305588, |
| "loss/hidden": 3.3609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20997797157615422, |
| "step": 8910 |
| }, |
| { |
| "epoch": 0.223, |
| "grad_norm": 32.75, |
| "grad_norm_var": 2.0247395833333335, |
| "learning_rate": 0.0001, |
| "loss": 7.3647, |
| "loss/crossentropy": 2.107650229334831, |
| "loss/hidden": 3.41171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18095682561397552, |
| "step": 8920 |
| }, |
| { |
| "epoch": 0.22325, |
| "grad_norm": 33.0, |
| "grad_norm_var": 4.983072916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4161, |
| "loss/crossentropy": 2.1033721581101417, |
| "loss/hidden": 3.394921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1837721960619092, |
| "step": 8930 |
| }, |
| { |
| "epoch": 0.2235, |
| "grad_norm": 30.0, |
| "grad_norm_var": 5.160872395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.381, |
| "loss/crossentropy": 2.2039036631584166, |
| "loss/hidden": 3.41953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20040026511996983, |
| "step": 8940 |
| }, |
| { |
| "epoch": 0.22375, |
| "grad_norm": 32.75, |
| "grad_norm_var": 5.625455729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.4581, |
| "loss/crossentropy": 2.0015091970562935, |
| "loss/hidden": 3.4046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1968998895958066, |
| "step": 8950 |
| }, |
| { |
| "epoch": 0.224, |
| "grad_norm": 30.125, |
| "grad_norm_var": 7.976497395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.446, |
| "loss/crossentropy": 2.1770635031163694, |
| "loss/hidden": 3.290234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18164771795272827, |
| "step": 8960 |
| }, |
| { |
| "epoch": 0.22425, |
| "grad_norm": 26.875, |
| "grad_norm_var": 2.037239583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.2677, |
| "loss/crossentropy": 1.9846746385097505, |
| "loss/hidden": 3.45859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19175144601613284, |
| "step": 8970 |
| }, |
| { |
| "epoch": 0.2245, |
| "grad_norm": 30.25, |
| "grad_norm_var": 2.8353515625, |
| "learning_rate": 0.0001, |
| "loss": 7.4645, |
| "loss/crossentropy": 2.169223573803902, |
| "loss/hidden": 3.33359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18991702441126107, |
| "step": 8980 |
| }, |
| { |
| "epoch": 0.22475, |
| "grad_norm": 29.875, |
| "grad_norm_var": 10.3228515625, |
| "learning_rate": 0.0001, |
| "loss": 7.4607, |
| "loss/crossentropy": 2.1323930069804193, |
| "loss/hidden": 3.348046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18631141390651465, |
| "step": 8990 |
| }, |
| { |
| "epoch": 0.225, |
| "grad_norm": 29.625, |
| "grad_norm_var": 11.431705729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4203, |
| "loss/crossentropy": 2.1948125064373016, |
| "loss/hidden": 3.420703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21075339019298553, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.22525, |
| "grad_norm": 31.0, |
| "grad_norm_var": 3.21875, |
| "learning_rate": 0.0001, |
| "loss": 7.3937, |
| "loss/crossentropy": 2.061431697010994, |
| "loss/hidden": 3.357421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20084240343421697, |
| "step": 9010 |
| }, |
| { |
| "epoch": 0.2255, |
| "grad_norm": 30.0, |
| "grad_norm_var": 9.7125, |
| "learning_rate": 0.0001, |
| "loss": 7.5364, |
| "loss/crossentropy": 2.1422473564743996, |
| "loss/hidden": 3.38359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20135180205106734, |
| "step": 9020 |
| }, |
| { |
| "epoch": 0.22575, |
| "grad_norm": 29.0, |
| "grad_norm_var": 10.320247395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.3638, |
| "loss/crossentropy": 2.064805781841278, |
| "loss/hidden": 3.509375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19638751186430453, |
| "step": 9030 |
| }, |
| { |
| "epoch": 0.226, |
| "grad_norm": 29.0, |
| "grad_norm_var": 7.7306640625, |
| "learning_rate": 0.0001, |
| "loss": 7.367, |
| "loss/crossentropy": 2.0203323513269424, |
| "loss/hidden": 3.351953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1730576554313302, |
| "step": 9040 |
| }, |
| { |
| "epoch": 0.22625, |
| "grad_norm": 28.0, |
| "grad_norm_var": 7.5291015625, |
| "learning_rate": 0.0001, |
| "loss": 7.4083, |
| "loss/crossentropy": 2.1449467122554777, |
| "loss/hidden": 3.428125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20745128113776445, |
| "step": 9050 |
| }, |
| { |
| "epoch": 0.2265, |
| "grad_norm": 29.625, |
| "grad_norm_var": 10.059830729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.3739, |
| "loss/crossentropy": 2.277996188402176, |
| "loss/hidden": 3.359765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1961175424978137, |
| "step": 9060 |
| }, |
| { |
| "epoch": 0.22675, |
| "grad_norm": 29.75, |
| "grad_norm_var": 2.9497395833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.425, |
| "loss/crossentropy": 2.128904873877764, |
| "loss/hidden": 3.307421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1912422338500619, |
| "step": 9070 |
| }, |
| { |
| "epoch": 0.227, |
| "grad_norm": 30.0, |
| "grad_norm_var": 2.4291666666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.3724, |
| "loss/crossentropy": 1.8966167330741883, |
| "loss/hidden": 3.315234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1772749178111553, |
| "step": 9080 |
| }, |
| { |
| "epoch": 0.22725, |
| "grad_norm": 30.625, |
| "grad_norm_var": 2.2955729166666665, |
| "learning_rate": 0.0001, |
| "loss": 7.4341, |
| "loss/crossentropy": 1.9993775576353072, |
| "loss/hidden": 3.409375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17548400331288577, |
| "step": 9090 |
| }, |
| { |
| "epoch": 0.2275, |
| "grad_norm": 30.125, |
| "grad_norm_var": 2.3212890625, |
| "learning_rate": 0.0001, |
| "loss": 7.4512, |
| "loss/crossentropy": 2.1553252935409546, |
| "loss/hidden": 3.41953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1904754728078842, |
| "step": 9100 |
| }, |
| { |
| "epoch": 0.22775, |
| "grad_norm": 31.875, |
| "grad_norm_var": 1.8264973958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4299, |
| "loss/crossentropy": 2.0791067980229854, |
| "loss/hidden": 3.384765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18264174591749907, |
| "step": 9110 |
| }, |
| { |
| "epoch": 0.228, |
| "grad_norm": 30.625, |
| "grad_norm_var": 8.670572916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.3621, |
| "loss/crossentropy": 2.2495081633329392, |
| "loss/hidden": 3.3703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19295338317751884, |
| "step": 9120 |
| }, |
| { |
| "epoch": 0.22825, |
| "grad_norm": 34.25, |
| "grad_norm_var": 9.628580729166666, |
| "learning_rate": 0.0001, |
| "loss": 7.4069, |
| "loss/crossentropy": 2.0221784450113773, |
| "loss/hidden": 3.43046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18131311442703008, |
| "step": 9130 |
| }, |
| { |
| "epoch": 0.2285, |
| "grad_norm": 31.375, |
| "grad_norm_var": 3.8372395833333335, |
| "learning_rate": 0.0001, |
| "loss": 7.3788, |
| "loss/crossentropy": 2.096472094208002, |
| "loss/hidden": 3.533984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19181215222924947, |
| "step": 9140 |
| }, |
| { |
| "epoch": 0.22875, |
| "grad_norm": 32.75, |
| "grad_norm_var": 2.9369140625, |
| "learning_rate": 0.0001, |
| "loss": 7.4625, |
| "loss/crossentropy": 2.161876367032528, |
| "loss/hidden": 3.39765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19645992666482925, |
| "step": 9150 |
| }, |
| { |
| "epoch": 0.229, |
| "grad_norm": 36.5, |
| "grad_norm_var": 13.144791666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.3883, |
| "loss/crossentropy": 1.870260328054428, |
| "loss/hidden": 3.421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19089705124497414, |
| "step": 9160 |
| }, |
| { |
| "epoch": 0.22925, |
| "grad_norm": 30.125, |
| "grad_norm_var": 11.746809895833334, |
| "learning_rate": 0.0001, |
| "loss": 7.3778, |
| "loss/crossentropy": 2.304895442724228, |
| "loss/hidden": 3.30390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19963221047073604, |
| "step": 9170 |
| }, |
| { |
| "epoch": 0.2295, |
| "grad_norm": 30.875, |
| "grad_norm_var": 2.64839803006287e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4899, |
| "loss/crossentropy": 2.1439118653535845, |
| "loss/hidden": 3.46484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20680125001817942, |
| "step": 9180 |
| }, |
| { |
| "epoch": 0.22975, |
| "grad_norm": 29.625, |
| "grad_norm_var": 16.839518229166668, |
| "learning_rate": 0.0001, |
| "loss": 7.3153, |
| "loss/crossentropy": 1.9939407154917717, |
| "loss/hidden": 3.4125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19113040501251816, |
| "step": 9190 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 30.125, |
| "grad_norm_var": 1.8309895833333334, |
| "learning_rate": 0.0001, |
| "loss": 7.5357, |
| "loss/crossentropy": 2.088300883769989, |
| "loss/hidden": 3.476953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19538584928959607, |
| "step": 9200 |
| }, |
| { |
| "epoch": 0.23025, |
| "grad_norm": 30.75, |
| "grad_norm_var": 1.66640625, |
| "learning_rate": 0.0001, |
| "loss": 7.3255, |
| "loss/crossentropy": 2.0176690459251403, |
| "loss/hidden": 3.361328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1823873495683074, |
| "step": 9210 |
| }, |
| { |
| "epoch": 0.2305, |
| "grad_norm": 32.75, |
| "grad_norm_var": 4.837955729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.3334, |
| "loss/crossentropy": 2.061754436790943, |
| "loss/hidden": 3.394921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18769590836018324, |
| "step": 9220 |
| }, |
| { |
| "epoch": 0.23075, |
| "grad_norm": 31.75, |
| "grad_norm_var": 4.683268229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.3932, |
| "loss/crossentropy": 2.0443666532635687, |
| "loss/hidden": 3.40390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19146692994982004, |
| "step": 9230 |
| }, |
| { |
| "epoch": 0.231, |
| "grad_norm": 33.25, |
| "grad_norm_var": 2.594073359575469e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.5992, |
| "loss/crossentropy": 2.189889648556709, |
| "loss/hidden": 3.344921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19348510541021824, |
| "step": 9240 |
| }, |
| { |
| "epoch": 0.23125, |
| "grad_norm": 29.125, |
| "grad_norm_var": 2.594073359702976e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4217, |
| "loss/crossentropy": 2.0103170931339265, |
| "loss/hidden": 3.44140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19076041504740715, |
| "step": 9250 |
| }, |
| { |
| "epoch": 0.2315, |
| "grad_norm": 31.375, |
| "grad_norm_var": 13.5134765625, |
| "learning_rate": 0.0001, |
| "loss": 7.3004, |
| "loss/crossentropy": 2.0597189858555796, |
| "loss/hidden": 3.3421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18948603458702565, |
| "step": 9260 |
| }, |
| { |
| "epoch": 0.23175, |
| "grad_norm": 31.625, |
| "grad_norm_var": 13.402083333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.3946, |
| "loss/crossentropy": 2.0123729363083838, |
| "loss/hidden": 3.3828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1885578565299511, |
| "step": 9270 |
| }, |
| { |
| "epoch": 0.232, |
| "grad_norm": 29.875, |
| "grad_norm_var": 5.879622395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.1578, |
| "loss/crossentropy": 2.2501363843679427, |
| "loss/hidden": 3.286328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19268405642360448, |
| "step": 9280 |
| }, |
| { |
| "epoch": 0.23225, |
| "grad_norm": 30.25, |
| "grad_norm_var": 16.982291666666665, |
| "learning_rate": 0.0001, |
| "loss": 7.3481, |
| "loss/crossentropy": 2.076053886115551, |
| "loss/hidden": 3.571875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21077420487999915, |
| "step": 9290 |
| }, |
| { |
| "epoch": 0.2325, |
| "grad_norm": 29.5, |
| "grad_norm_var": 21.769791666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.2711, |
| "loss/crossentropy": 2.193411388993263, |
| "loss/hidden": 3.33828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18807493168860673, |
| "step": 9300 |
| }, |
| { |
| "epoch": 0.23275, |
| "grad_norm": 34.25, |
| "grad_norm_var": 12.355143229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.3819, |
| "loss/crossentropy": 2.166665832698345, |
| "loss/hidden": 3.3203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18620893750339745, |
| "step": 9310 |
| }, |
| { |
| "epoch": 0.233, |
| "grad_norm": 29.875, |
| "grad_norm_var": 3.002083333333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4076, |
| "loss/crossentropy": 2.022654353827238, |
| "loss/hidden": 3.2984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18289813362061977, |
| "step": 9320 |
| }, |
| { |
| "epoch": 0.23325, |
| "grad_norm": 30.0, |
| "grad_norm_var": 3.0509765625, |
| "learning_rate": 0.0001, |
| "loss": 7.3206, |
| "loss/crossentropy": 2.020027980953455, |
| "loss/hidden": 3.425390625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18972874553874136, |
| "step": 9330 |
| }, |
| { |
| "epoch": 0.2335, |
| "grad_norm": 31.125, |
| "grad_norm_var": 2.098893229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4091, |
| "loss/crossentropy": 2.029365235567093, |
| "loss/hidden": 3.43125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.2005317559465766, |
| "step": 9340 |
| }, |
| { |
| "epoch": 0.23375, |
| "grad_norm": 29.5, |
| "grad_norm_var": 10.520768229166666, |
| "learning_rate": 0.0001, |
| "loss": 7.3704, |
| "loss/crossentropy": 1.95634398534894, |
| "loss/hidden": 3.433203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18962019477039577, |
| "step": 9350 |
| }, |
| { |
| "epoch": 0.234, |
| "grad_norm": 27.75, |
| "grad_norm_var": 13.475, |
| "learning_rate": 0.0001, |
| "loss": 7.3797, |
| "loss/crossentropy": 2.063827896118164, |
| "loss/hidden": 3.3609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19280518041923642, |
| "step": 9360 |
| }, |
| { |
| "epoch": 0.23425, |
| "grad_norm": 30.5, |
| "grad_norm_var": 2.0869140625, |
| "learning_rate": 0.0001, |
| "loss": 7.4918, |
| "loss/crossentropy": 2.068470099568367, |
| "loss/hidden": 3.429296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19361184667795897, |
| "step": 9370 |
| }, |
| { |
| "epoch": 0.2345, |
| "grad_norm": 28.0, |
| "grad_norm_var": 6.534830729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4265, |
| "loss/crossentropy": 2.125886672735214, |
| "loss/hidden": 3.401171875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19355848152190447, |
| "step": 9380 |
| }, |
| { |
| "epoch": 0.23475, |
| "grad_norm": 30.375, |
| "grad_norm_var": 2.2770182291666665, |
| "learning_rate": 0.0001, |
| "loss": 7.3769, |
| "loss/crossentropy": 2.2057121500372885, |
| "loss/hidden": 3.28515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1843592157587409, |
| "step": 9390 |
| }, |
| { |
| "epoch": 0.235, |
| "grad_norm": 87.5, |
| "grad_norm_var": 202.43932291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4444, |
| "loss/crossentropy": 2.17584248483181, |
| "loss/hidden": 3.4515625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1986805137246847, |
| "step": 9400 |
| }, |
| { |
| "epoch": 0.23525, |
| "grad_norm": 33.0, |
| "grad_norm_var": 205.446875, |
| "learning_rate": 0.0001, |
| "loss": 7.5575, |
| "loss/crossentropy": 1.9989879056811333, |
| "loss/hidden": 3.484375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21514312122017146, |
| "step": 9410 |
| }, |
| { |
| "epoch": 0.2355, |
| "grad_norm": 33.25, |
| "grad_norm_var": 3.4775390625, |
| "learning_rate": 0.0001, |
| "loss": 7.3965, |
| "loss/crossentropy": 2.31248200237751, |
| "loss/hidden": 3.359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20192647576332093, |
| "step": 9420 |
| }, |
| { |
| "epoch": 0.23575, |
| "grad_norm": 34.0, |
| "grad_norm_var": 2.5723307291666666, |
| "learning_rate": 0.0001, |
| "loss": 7.425, |
| "loss/crossentropy": 2.0246524304151534, |
| "loss/hidden": 3.43125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1936120893806219, |
| "step": 9430 |
| }, |
| { |
| "epoch": 0.236, |
| "grad_norm": 31.5, |
| "grad_norm_var": 3.8207682291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.3458, |
| "loss/crossentropy": 2.0714851915836334, |
| "loss/hidden": 3.315625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19567677434533834, |
| "step": 9440 |
| }, |
| { |
| "epoch": 0.23625, |
| "grad_norm": 32.25, |
| "grad_norm_var": 2.0452473958333335, |
| "learning_rate": 0.0001, |
| "loss": 7.3934, |
| "loss/crossentropy": 2.024892423301935, |
| "loss/hidden": 3.541796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19850265365093947, |
| "step": 9450 |
| }, |
| { |
| "epoch": 0.2365, |
| "grad_norm": 32.5, |
| "grad_norm_var": 3.16640625, |
| "learning_rate": 0.0001, |
| "loss": 7.4192, |
| "loss/crossentropy": 2.146814212203026, |
| "loss/hidden": 3.43828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20048882197588683, |
| "step": 9460 |
| }, |
| { |
| "epoch": 0.23675, |
| "grad_norm": 27.875, |
| "grad_norm_var": 12.364322916666667, |
| "learning_rate": 0.0001, |
| "loss": 7.3745, |
| "loss/crossentropy": 2.0004019677639007, |
| "loss/hidden": 3.409375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19019459020346402, |
| "step": 9470 |
| }, |
| { |
| "epoch": 0.237, |
| "grad_norm": 28.75, |
| "grad_norm_var": 12.885416666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.2858, |
| "loss/crossentropy": 2.0364832431077957, |
| "loss/hidden": 3.357421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18539349418133497, |
| "step": 9480 |
| }, |
| { |
| "epoch": 0.23725, |
| "grad_norm": 26.75, |
| "grad_norm_var": 4.488997395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.2865, |
| "loss/crossentropy": 2.257227724790573, |
| "loss/hidden": 3.247265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1774066084995866, |
| "step": 9490 |
| }, |
| { |
| "epoch": 0.2375, |
| "grad_norm": 28.25, |
| "grad_norm_var": 5.15625, |
| "learning_rate": 0.0001, |
| "loss": 7.3742, |
| "loss/crossentropy": 2.192533364892006, |
| "loss/hidden": 3.319140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18531391881406306, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.23775, |
| "grad_norm": 28.5, |
| "grad_norm_var": 8.7025390625, |
| "learning_rate": 0.0001, |
| "loss": 7.4007, |
| "loss/crossentropy": 1.9954842567443847, |
| "loss/hidden": 3.512890625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21250668447464705, |
| "step": 9510 |
| }, |
| { |
| "epoch": 0.238, |
| "grad_norm": 27.0, |
| "grad_norm_var": 3.2018229166666665, |
| "learning_rate": 0.0001, |
| "loss": 7.3589, |
| "loss/crossentropy": 2.0501152604818342, |
| "loss/hidden": 3.402734375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19774708338081837, |
| "step": 9520 |
| }, |
| { |
| "epoch": 0.23825, |
| "grad_norm": 31.125, |
| "grad_norm_var": 2.471875, |
| "learning_rate": 0.0001, |
| "loss": 7.3611, |
| "loss/crossentropy": 2.1446721121668815, |
| "loss/hidden": 3.40625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19521092902868986, |
| "step": 9530 |
| }, |
| { |
| "epoch": 0.2385, |
| "grad_norm": 33.25, |
| "grad_norm_var": 2.911393229166667, |
| "learning_rate": 0.0001, |
| "loss": 7.4037, |
| "loss/crossentropy": 2.050874675065279, |
| "loss/hidden": 3.4078125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18872325737029313, |
| "step": 9540 |
| }, |
| { |
| "epoch": 0.23875, |
| "grad_norm": 32.25, |
| "grad_norm_var": 1.69375, |
| "learning_rate": 0.0001, |
| "loss": 7.3851, |
| "loss/crossentropy": 2.092629846930504, |
| "loss/hidden": 3.3984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1858744696713984, |
| "step": 9550 |
| }, |
| { |
| "epoch": 0.239, |
| "grad_norm": 30.25, |
| "grad_norm_var": 1.6229166666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.324, |
| "loss/crossentropy": 2.1389416724443437, |
| "loss/hidden": 3.344921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18115091007202863, |
| "step": 9560 |
| }, |
| { |
| "epoch": 0.23925, |
| "grad_norm": 28.375, |
| "grad_norm_var": 0.7893229166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.3684, |
| "loss/crossentropy": 2.1148220866918566, |
| "loss/hidden": 3.390234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18993774689733983, |
| "step": 9570 |
| }, |
| { |
| "epoch": 0.2395, |
| "grad_norm": 29.125, |
| "grad_norm_var": 1.0260416666666667, |
| "learning_rate": 0.0001, |
| "loss": 7.4711, |
| "loss/crossentropy": 1.987958113849163, |
| "loss/hidden": 3.409375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19200538750737906, |
| "step": 9580 |
| }, |
| { |
| "epoch": 0.23975, |
| "grad_norm": 29.625, |
| "grad_norm_var": 0.6018229166666667, |
| "learning_rate": 0.0001, |
| "loss": 7.3578, |
| "loss/crossentropy": 2.076607885956764, |
| "loss/hidden": 3.36640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20001101978123187, |
| "step": 9590 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 30.75, |
| "grad_norm_var": 1.4124348958333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4458, |
| "loss/crossentropy": 2.1496243715286254, |
| "loss/hidden": 3.357421875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1889566643163562, |
| "step": 9600 |
| }, |
| { |
| "epoch": 0.24025, |
| "grad_norm": 29.875, |
| "grad_norm_var": 1.04140625, |
| "learning_rate": 0.0001, |
| "loss": 7.4632, |
| "loss/crossentropy": 2.162689308822155, |
| "loss/hidden": 3.33828125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19204493910074233, |
| "step": 9610 |
| }, |
| { |
| "epoch": 0.2405, |
| "grad_norm": 27.375, |
| "grad_norm_var": 1.5863932291666667, |
| "learning_rate": 0.0001, |
| "loss": 7.3897, |
| "loss/crossentropy": 2.1323146484792233, |
| "loss/hidden": 3.455859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20718522872775794, |
| "step": 9620 |
| }, |
| { |
| "epoch": 0.24075, |
| "grad_norm": 32.25, |
| "grad_norm_var": 2.09375, |
| "learning_rate": 0.0001, |
| "loss": 7.2848, |
| "loss/crossentropy": 2.0804166465997698, |
| "loss/hidden": 3.417578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1966247998178005, |
| "step": 9630 |
| }, |
| { |
| "epoch": 0.241, |
| "grad_norm": 31.25, |
| "grad_norm_var": 1.3559895833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.4174, |
| "loss/crossentropy": 2.3036757931113243, |
| "loss/hidden": 3.341015625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18416995517909526, |
| "step": 9640 |
| }, |
| { |
| "epoch": 0.24125, |
| "grad_norm": 33.0, |
| "grad_norm_var": 2.41015625, |
| "learning_rate": 0.0001, |
| "loss": 7.2853, |
| "loss/crossentropy": 1.9936259984970093, |
| "loss/hidden": 3.428125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.21977887880057095, |
| "step": 9650 |
| }, |
| { |
| "epoch": 0.2415, |
| "grad_norm": 32.5, |
| "grad_norm_var": 6.1228515625, |
| "learning_rate": 0.0001, |
| "loss": 7.4938, |
| "loss/crossentropy": 2.2000851720571517, |
| "loss/hidden": 3.39765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20301534831523896, |
| "step": 9660 |
| }, |
| { |
| "epoch": 0.24175, |
| "grad_norm": 31.0, |
| "grad_norm_var": 3.59765625, |
| "learning_rate": 0.0001, |
| "loss": 7.5133, |
| "loss/crossentropy": 2.192278115451336, |
| "loss/hidden": 3.351953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19489070847630502, |
| "step": 9670 |
| }, |
| { |
| "epoch": 0.242, |
| "grad_norm": 28.875, |
| "grad_norm_var": 6.8978515625, |
| "learning_rate": 0.0001, |
| "loss": 7.4247, |
| "loss/crossentropy": 2.194097451120615, |
| "loss/hidden": 3.43046875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20746590523049235, |
| "step": 9680 |
| }, |
| { |
| "epoch": 0.24225, |
| "grad_norm": 29.75, |
| "grad_norm_var": 4.0587890625, |
| "learning_rate": 0.0001, |
| "loss": 7.3413, |
| "loss/crossentropy": 2.1902857303619383, |
| "loss/hidden": 3.187109375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17439354099333287, |
| "step": 9690 |
| }, |
| { |
| "epoch": 0.2425, |
| "grad_norm": 41.5, |
| "grad_norm_var": 3.952541960104418e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.5429, |
| "loss/crossentropy": 2.046304853260517, |
| "loss/hidden": 3.31953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18022917695343493, |
| "step": 9700 |
| }, |
| { |
| "epoch": 0.24275, |
| "grad_norm": 30.375, |
| "grad_norm_var": 3.952541960261809e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.2729, |
| "loss/crossentropy": 2.082195009291172, |
| "loss/hidden": 3.5140625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1983122780919075, |
| "step": 9710 |
| }, |
| { |
| "epoch": 0.243, |
| "grad_norm": 28.375, |
| "grad_norm_var": 2.6056640625, |
| "learning_rate": 0.0001, |
| "loss": 7.2271, |
| "loss/crossentropy": 1.970278625190258, |
| "loss/hidden": 3.474609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19305091574788094, |
| "step": 9720 |
| }, |
| { |
| "epoch": 0.24325, |
| "grad_norm": 30.375, |
| "grad_norm_var": 2.0576524262052987e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.3979, |
| "loss/crossentropy": 2.1140229746699335, |
| "loss/hidden": 3.390234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19698369763791562, |
| "step": 9730 |
| }, |
| { |
| "epoch": 0.2435, |
| "grad_norm": 29.0, |
| "grad_norm_var": 2.057652425978177e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.2491, |
| "loss/crossentropy": 1.9204902969300748, |
| "loss/hidden": 3.40703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18290270324796437, |
| "step": 9740 |
| }, |
| { |
| "epoch": 0.24375, |
| "grad_norm": 49.25, |
| "grad_norm_var": 27.333072916666666, |
| "learning_rate": 0.0001, |
| "loss": 7.4168, |
| "loss/crossentropy": 2.203968660533428, |
| "loss/hidden": 3.358203125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19210707377642394, |
| "step": 9750 |
| }, |
| { |
| "epoch": 0.244, |
| "grad_norm": 27.75, |
| "grad_norm_var": 28.5837890625, |
| "learning_rate": 0.0001, |
| "loss": 7.3667, |
| "loss/crossentropy": 2.0645239472389223, |
| "loss/hidden": 3.42578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1849998442456126, |
| "step": 9760 |
| }, |
| { |
| "epoch": 0.24425, |
| "grad_norm": 30.0, |
| "grad_norm_var": 7.441666666666666, |
| "learning_rate": 0.0001, |
| "loss": 7.3958, |
| "loss/crossentropy": 2.0423281893134115, |
| "loss/hidden": 3.46640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19458430632948875, |
| "step": 9770 |
| }, |
| { |
| "epoch": 0.2445, |
| "grad_norm": 28.25, |
| "grad_norm_var": 25.725, |
| "learning_rate": 0.0001, |
| "loss": 7.371, |
| "loss/crossentropy": 2.2162967801094053, |
| "loss/hidden": 3.392578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19894264116883278, |
| "step": 9780 |
| }, |
| { |
| "epoch": 0.24475, |
| "grad_norm": 31.875, |
| "grad_norm_var": 25.701822916666668, |
| "learning_rate": 0.0001, |
| "loss": 7.4136, |
| "loss/crossentropy": 2.052091246843338, |
| "loss/hidden": 3.331640625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19768061954528093, |
| "step": 9790 |
| }, |
| { |
| "epoch": 0.245, |
| "grad_norm": 29.375, |
| "grad_norm_var": 2.4247395833333334, |
| "learning_rate": 0.0001, |
| "loss": 7.3752, |
| "loss/crossentropy": 2.0378331199288366, |
| "loss/hidden": 3.45703125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1888222724199295, |
| "step": 9800 |
| }, |
| { |
| "epoch": 0.24525, |
| "grad_norm": 29.75, |
| "grad_norm_var": 17.073958333333334, |
| "learning_rate": 0.0001, |
| "loss": 7.389, |
| "loss/crossentropy": 2.0574826121330263, |
| "loss/hidden": 3.313671875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1817958688363433, |
| "step": 9810 |
| }, |
| { |
| "epoch": 0.2455, |
| "grad_norm": 28.125, |
| "grad_norm_var": 24.198372395833335, |
| "learning_rate": 0.0001, |
| "loss": 7.284, |
| "loss/crossentropy": 2.068347904086113, |
| "loss/hidden": 3.401953125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1905125178396702, |
| "step": 9820 |
| }, |
| { |
| "epoch": 0.24575, |
| "grad_norm": 28.5, |
| "grad_norm_var": 9.8041015625, |
| "learning_rate": 0.0001, |
| "loss": 7.3867, |
| "loss/crossentropy": 2.0113710410892964, |
| "loss/hidden": 3.318359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1825895557180047, |
| "step": 9830 |
| }, |
| { |
| "epoch": 0.246, |
| "grad_norm": 30.5, |
| "grad_norm_var": 1.16015625, |
| "learning_rate": 0.0001, |
| "loss": 7.3496, |
| "loss/crossentropy": 2.0473890252411366, |
| "loss/hidden": 3.408984375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18930337531492114, |
| "step": 9840 |
| }, |
| { |
| "epoch": 0.24625, |
| "grad_norm": 29.875, |
| "grad_norm_var": 1.75, |
| "learning_rate": 0.0001, |
| "loss": 7.4305, |
| "loss/crossentropy": 2.167546259611845, |
| "loss/hidden": 3.344921875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18493952229619026, |
| "step": 9850 |
| }, |
| { |
| "epoch": 0.2465, |
| "grad_norm": 28.375, |
| "grad_norm_var": 2.7625, |
| "learning_rate": 0.0001, |
| "loss": 7.444, |
| "loss/crossentropy": 2.1046394810080526, |
| "loss/hidden": 3.324609375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18062518630176783, |
| "step": 9860 |
| }, |
| { |
| "epoch": 0.24675, |
| "grad_norm": 28.75, |
| "grad_norm_var": 7819.056705729166, |
| "learning_rate": 0.0001, |
| "loss": 7.4016, |
| "loss/crossentropy": 2.0700221791863442, |
| "loss/hidden": 3.372265625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.190355353243649, |
| "step": 9870 |
| }, |
| { |
| "epoch": 0.247, |
| "grad_norm": 29.0, |
| "grad_norm_var": 7843.72265625, |
| "learning_rate": 0.0001, |
| "loss": 7.2875, |
| "loss/crossentropy": 2.057565826922655, |
| "loss/hidden": 3.4234375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18850490506738424, |
| "step": 9880 |
| }, |
| { |
| "epoch": 0.24725, |
| "grad_norm": 32.5, |
| "grad_norm_var": 3.778059895833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4329, |
| "loss/crossentropy": 2.0141181223094464, |
| "loss/hidden": 3.446875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19769613686949014, |
| "step": 9890 |
| }, |
| { |
| "epoch": 0.2475, |
| "grad_norm": 30.375, |
| "grad_norm_var": 6.312239583333334, |
| "learning_rate": 0.0001, |
| "loss": 7.336, |
| "loss/crossentropy": 2.146551664918661, |
| "loss/hidden": 3.34296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.18510441221296786, |
| "step": 9900 |
| }, |
| { |
| "epoch": 0.24775, |
| "grad_norm": 33.0, |
| "grad_norm_var": 6.36640625, |
| "learning_rate": 0.0001, |
| "loss": 7.306, |
| "loss/crossentropy": 2.2865438759326935, |
| "loss/hidden": 3.366796875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19417236726731063, |
| "step": 9910 |
| }, |
| { |
| "epoch": 0.248, |
| "grad_norm": 33.5, |
| "grad_norm_var": 2.405989583333333, |
| "learning_rate": 0.0001, |
| "loss": 7.3855, |
| "loss/crossentropy": 2.133404280245304, |
| "loss/hidden": 3.37578125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19065556656569244, |
| "step": 9920 |
| }, |
| { |
| "epoch": 0.24825, |
| "grad_norm": 34.25, |
| "grad_norm_var": 3.1626528109486234e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.4378, |
| "loss/crossentropy": 2.0854051023721696, |
| "loss/hidden": 3.459765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.20351361632347106, |
| "step": 9930 |
| }, |
| { |
| "epoch": 0.2485, |
| "grad_norm": 32.25, |
| "grad_norm_var": 3.1626528099408717e+18, |
| "learning_rate": 0.0001, |
| "loss": 7.3826, |
| "loss/crossentropy": 2.087091060727835, |
| "loss/hidden": 3.39296875, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19672764679417015, |
| "step": 9940 |
| }, |
| { |
| "epoch": 0.24875, |
| "grad_norm": 31.875, |
| "grad_norm_var": 3.729622395833333, |
| "learning_rate": 0.0001, |
| "loss": 7.4303, |
| "loss/crossentropy": 1.9529958970844745, |
| "loss/hidden": 3.461328125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1957404987886548, |
| "step": 9950 |
| }, |
| { |
| "epoch": 0.249, |
| "grad_norm": 27.0, |
| "grad_norm_var": 3.515625, |
| "learning_rate": 0.0001, |
| "loss": 7.2963, |
| "loss/crossentropy": 2.1291355013847353, |
| "loss/hidden": 3.278125, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1832482174038887, |
| "step": 9960 |
| }, |
| { |
| "epoch": 0.24925, |
| "grad_norm": 31.5, |
| "grad_norm_var": 2.403580729166667, |
| "learning_rate": 0.0001, |
| "loss": 7.3011, |
| "loss/crossentropy": 2.073649263381958, |
| "loss/hidden": 3.2859375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.17194083742797375, |
| "step": 9970 |
| }, |
| { |
| "epoch": 0.2495, |
| "grad_norm": 30.5, |
| "grad_norm_var": 3.7934895833333333, |
| "learning_rate": 0.0001, |
| "loss": 7.3253, |
| "loss/crossentropy": 2.312130460143089, |
| "loss/hidden": 3.334765625, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19424791410565376, |
| "step": 9980 |
| }, |
| { |
| "epoch": 0.24975, |
| "grad_norm": 31.875, |
| "grad_norm_var": 6.4666015625, |
| "learning_rate": 0.0001, |
| "loss": 7.3395, |
| "loss/crossentropy": 1.9769040577113628, |
| "loss/hidden": 3.359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.1865438589360565, |
| "step": 9990 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 37.0, |
| "grad_norm_var": 5.488997395833334, |
| "learning_rate": 0.0001, |
| "loss": 7.3121, |
| "loss/crossentropy": 2.0656296610832214, |
| "loss/hidden": 3.418359375, |
| "loss/jsd": 0.0, |
| "loss/logits": 0.19510807991027831, |
| "step": 10000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 40000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 5000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.8575100320088064e+19, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |