diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,120027 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4, + "eval_steps": 2000, + "global_step": 20000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 4e-05, + "grad_norm": 456.0, + "learning_rate": 1.18e-05, + "loss": 85.4554, + "loss/crossentropy": 9.650346755981445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 8.066818237304688, + "step": 2 + }, + { + "epoch": 8e-05, + "grad_norm": 416.0, + "learning_rate": 1.3600000000000002e-05, + "loss": 84.3418, + "loss/crossentropy": 9.544375896453857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 7.628942489624023, + "step": 4 + }, + { + "epoch": 0.00012, + "grad_norm": 466.0, + "learning_rate": 1.54e-05, + "loss": 87.2187, + "loss/crossentropy": 9.569977283477783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 7.7909746170043945, + "step": 6 + }, + { + "epoch": 0.00016, + "grad_norm": 247.0, + "learning_rate": 1.72e-05, + "loss": 82.5078, + "loss/crossentropy": 9.06786823272705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 7.3673131465911865, + "step": 8 + }, + { + "epoch": 0.0002, + "grad_norm": 179.0, + "learning_rate": 1.9e-05, + "loss": 78.2757, + "loss/crossentropy": 8.918366432189941, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 6.912693023681641, + "step": 10 + }, + { + "epoch": 0.00024, + "grad_norm": 148.0, + "learning_rate": 2.0800000000000004e-05, + "loss": 74.4248, + "loss/crossentropy": 8.443636417388916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 6.567321538925171, + "step": 12 + }, + { + "epoch": 0.00028, + "grad_norm": 131.0, + "learning_rate": 2.2600000000000004e-05, + "loss": 73.0003, + "loss/crossentropy": 8.428278923034668, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 6.706400156021118, + "step": 14 + }, + { + "epoch": 0.00032, + "grad_norm": 181.0, + "grad_norm_var": 16279.8625, + "learning_rate": 2.4400000000000004e-05, + "loss": 70.0047, + "loss/crossentropy": 8.216889381408691, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 6.056080102920532, + "step": 16 + }, + { + "epoch": 0.00036, + "grad_norm": 90.5, + "grad_norm_var": 14154.148958333333, + "learning_rate": 2.6200000000000003e-05, + "loss": 69.9766, + "loss/crossentropy": 8.191599607467651, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 6.429446697235107, + "step": 18 + }, + { + "epoch": 0.0004, + "grad_norm": 52.25, + "grad_norm_var": 12194.27890625, + "learning_rate": 2.8000000000000003e-05, + "loss": 64.3807, + "loss/crossentropy": 7.506032228469849, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 5.794633388519287, + "step": 20 + }, + { + "epoch": 0.00044, + "grad_norm": 39.25, + "grad_norm_var": 6249.4875, + "learning_rate": 2.9800000000000006e-05, + "loss": 61.2802, + "loss/crossentropy": 7.152851343154907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 5.261489152908325, + "step": 22 + }, + { + "epoch": 0.00048, + "grad_norm": 57.0, + "grad_norm_var": 4626.8875, + "learning_rate": 3.16e-05, + "loss": 58.3454, + "loss/crossentropy": 6.956738471984863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 5.020895004272461, + "step": 24 + }, + { + "epoch": 0.00052, + "grad_norm": 86.0, + "grad_norm_var": 4244.565625, + "learning_rate": 3.3400000000000005e-05, + "loss": 54.2703, + "loss/crossentropy": 6.686542987823486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 4.801911354064941, + "step": 26 + }, + { + "epoch": 0.00056, + "grad_norm": 110.5, + "grad_norm_var": 3868.5875, + "learning_rate": 3.520000000000001e-05, + "loss": 51.7343, + "loss/crossentropy": 6.4867262840271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 4.5746169090271, + "step": 28 + }, + { + "epoch": 0.0006, + "grad_norm": 50.0, + "grad_norm_var": 3953.82890625, + "learning_rate": 3.7e-05, + "loss": 49.6807, + "loss/crossentropy": 6.364065408706665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 4.230688810348511, + "step": 30 + }, + { + "epoch": 0.00064, + "grad_norm": 68.0, + "grad_norm_var": 3157.4958333333334, + "learning_rate": 3.88e-05, + "loss": 44.7112, + "loss/crossentropy": 5.731794834136963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 3.6969637870788574, + "step": 32 + }, + { + "epoch": 0.00068, + "grad_norm": 50.75, + "grad_norm_var": 500.59973958333336, + "learning_rate": 4.0600000000000004e-05, + "loss": 42.2923, + "loss/crossentropy": 5.553718328475952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 3.7271878719329834, + "step": 34 + }, + { + "epoch": 0.00072, + "grad_norm": 55.5, + "grad_norm_var": 342.2122395833333, + "learning_rate": 4.240000000000001e-05, + "loss": 37.7465, + "loss/crossentropy": 5.023651361465454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 3.2670111656188965, + "step": 36 + }, + { + "epoch": 0.00076, + "grad_norm": 75.5, + "grad_norm_var": 283.37395833333335, + "learning_rate": 4.420000000000001e-05, + "loss": 35.1313, + "loss/crossentropy": 4.921839237213135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 2.950661063194275, + "step": 38 + }, + { + "epoch": 0.0008, + "grad_norm": 44.5, + "grad_norm_var": 299.1958333333333, + "learning_rate": 4.600000000000001e-05, + "loss": 32.3316, + "loss/crossentropy": 4.782621145248413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 2.7473502159118652, + "step": 40 + }, + { + "epoch": 0.00084, + "grad_norm": 36.5, + "grad_norm_var": 361.1372395833333, + "learning_rate": 4.78e-05, + "loss": 28.4104, + "loss/crossentropy": 3.8754972219467163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 2.410745143890381, + "step": 42 + }, + { + "epoch": 0.00088, + "grad_norm": 36.25, + "grad_norm_var": 225.80598958333334, + "learning_rate": 4.96e-05, + "loss": 26.1806, + "loss/crossentropy": 3.9885865449905396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 2.1414425373077393, + "step": 44 + }, + { + "epoch": 0.00092, + "grad_norm": 46.0, + "grad_norm_var": 249.475, + "learning_rate": 5.14e-05, + "loss": 24.4012, + "loss/crossentropy": 3.750515580177307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 2.063339352607727, + "step": 46 + }, + { + "epoch": 0.00096, + "grad_norm": 20.875, + "grad_norm_var": 315.8291015625, + "learning_rate": 5.3200000000000006e-05, + "loss": 22.822, + "loss/crossentropy": 3.7912577390670776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.9196046590805054, + "step": 48 + }, + { + "epoch": 0.001, + "grad_norm": 35.5, + "grad_norm_var": 279.284375, + "learning_rate": 5.500000000000001e-05, + "loss": 21.036, + "loss/crossentropy": 3.777758002281189, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.7479944229125977, + "step": 50 + }, + { + "epoch": 0.00104, + "grad_norm": 20.75, + "grad_norm_var": 306.15149739583336, + "learning_rate": 5.680000000000001e-05, + "loss": 20.3608, + "loss/crossentropy": 3.5903185606002808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.5533717274665833, + "step": 52 + }, + { + "epoch": 0.00108, + "grad_norm": 43.75, + "grad_norm_var": 213.07395833333334, + "learning_rate": 5.860000000000001e-05, + "loss": 18.813, + "loss/crossentropy": 3.691780686378479, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.4755533933639526, + "step": 54 + }, + { + "epoch": 0.00112, + "grad_norm": 21.25, + "grad_norm_var": 70.690625, + "learning_rate": 6.040000000000001e-05, + "loss": 19.1421, + "loss/crossentropy": 3.557003617286682, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.5198156833648682, + "step": 56 + }, + { + "epoch": 0.00116, + "grad_norm": 21.5, + "grad_norm_var": 76.30390625, + "learning_rate": 6.220000000000001e-05, + "loss": 17.2705, + "loss/crossentropy": 3.2730292081832886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.4131997227668762, + "step": 58 + }, + { + "epoch": 0.0012, + "grad_norm": 19.875, + "grad_norm_var": 77.11399739583334, + "learning_rate": 6.400000000000001e-05, + "loss": 16.4712, + "loss/crossentropy": 3.419156074523926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.3277101516723633, + "step": 60 + }, + { + "epoch": 0.00124, + "grad_norm": 25.75, + "grad_norm_var": 48.60520833333333, + "learning_rate": 6.58e-05, + "loss": 16.6219, + "loss/crossentropy": 2.973878502845764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.3438007831573486, + "step": 62 + }, + { + "epoch": 0.00128, + "grad_norm": 34.5, + "grad_norm_var": 53.18020833333333, + "learning_rate": 6.76e-05, + "loss": 15.0929, + "loss/crossentropy": 2.892021059989929, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.1624282002449036, + "step": 64 + }, + { + "epoch": 0.00132, + "grad_norm": 15.4375, + "grad_norm_var": 51.195947265625, + "learning_rate": 6.94e-05, + "loss": 15.1967, + "loss/crossentropy": 2.954660177230835, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.116301715373993, + "step": 66 + }, + { + "epoch": 0.00136, + "grad_norm": 32.0, + "grad_norm_var": 51.064306640625, + "learning_rate": 7.120000000000001e-05, + "loss": 14.9397, + "loss/crossentropy": 3.2686156034469604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.1978037357330322, + "step": 68 + }, + { + "epoch": 0.0014, + "grad_norm": 29.25, + "grad_norm_var": 32.794270833333336, + "learning_rate": 7.3e-05, + "loss": 14.4846, + "loss/crossentropy": 2.7956581115722656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.1973016262054443, + "step": 70 + }, + { + "epoch": 0.00144, + "grad_norm": 15.25, + "grad_norm_var": 37.80149739583333, + "learning_rate": 7.48e-05, + "loss": 14.1296, + "loss/crossentropy": 3.08966863155365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.151496708393097, + "step": 72 + }, + { + "epoch": 0.00148, + "grad_norm": 18.625, + "grad_norm_var": 41.88984375, + "learning_rate": 7.66e-05, + "loss": 13.6812, + "loss/crossentropy": 2.949987292289734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.9787414371967316, + "step": 74 + }, + { + "epoch": 0.00152, + "grad_norm": 15.1875, + "grad_norm_var": 49.143489583333334, + "learning_rate": 7.840000000000001e-05, + "loss": 12.8901, + "loss/crossentropy": 3.1161292791366577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.0640113949775696, + "step": 76 + }, + { + "epoch": 0.00156, + "grad_norm": 22.125, + "grad_norm_var": 47.63170572916667, + "learning_rate": 8.020000000000001e-05, + "loss": 13.157, + "loss/crossentropy": 3.3661664724349976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.1353825330734253, + "step": 78 + }, + { + "epoch": 0.0016, + "grad_norm": 16.25, + "grad_norm_var": 35.33274739583333, + "learning_rate": 8.200000000000001e-05, + "loss": 12.9372, + "loss/crossentropy": 2.927241563796997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.9984134435653687, + "step": 80 + }, + { + "epoch": 0.00164, + "grad_norm": 13.625, + "grad_norm_var": 37.53951822916667, + "learning_rate": 8.38e-05, + "loss": 12.0477, + "loss/crossentropy": 3.1273285150527954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.9241160154342651, + "step": 82 + }, + { + "epoch": 0.00168, + "grad_norm": 19.125, + "grad_norm_var": 19.602718098958334, + "learning_rate": 8.560000000000001e-05, + "loss": 11.9084, + "loss/crossentropy": 2.737278938293457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.8822851181030273, + "step": 84 + }, + { + "epoch": 0.00172, + "grad_norm": 13.0625, + "grad_norm_var": 11.107014973958334, + "learning_rate": 8.740000000000001e-05, + "loss": 11.8594, + "loss/crossentropy": 2.4452388286590576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.8693483769893646, + "step": 86 + }, + { + "epoch": 0.00176, + "grad_norm": 18.375, + "grad_norm_var": 11.328369140625, + "learning_rate": 8.92e-05, + "loss": 11.5058, + "loss/crossentropy": 2.89771831035614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.8749278783798218, + "step": 88 + }, + { + "epoch": 0.0018, + "grad_norm": 12.1875, + "grad_norm_var": 13.811572265625, + "learning_rate": 9.1e-05, + "loss": 11.9281, + "loss/crossentropy": 3.0173208713531494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.8714744746685028, + "step": 90 + }, + { + "epoch": 0.00184, + "grad_norm": 20.375, + "grad_norm_var": 16.114306640625, + "learning_rate": 9.28e-05, + "loss": 11.4244, + "loss/crossentropy": 2.588515043258667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.9651442170143127, + "step": 92 + }, + { + "epoch": 0.00188, + "grad_norm": 13.375, + "grad_norm_var": 12.784309895833333, + "learning_rate": 9.46e-05, + "loss": 11.6092, + "loss/crossentropy": 2.8116774559020996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.9143906235694885, + "step": 94 + }, + { + "epoch": 0.00192, + "grad_norm": 16.375, + "grad_norm_var": 13.225764973958333, + "learning_rate": 9.64e-05, + "loss": 10.6723, + "loss/crossentropy": 2.8734441995620728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.814399927854538, + "step": 96 + }, + { + "epoch": 0.00196, + "grad_norm": 10.3125, + "grad_norm_var": 17.228238932291667, + "learning_rate": 9.82e-05, + "loss": 10.6577, + "loss/crossentropy": 2.664194703102112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.7905783653259277, + "step": 98 + }, + { + "epoch": 0.002, + "grad_norm": 12.3125, + "grad_norm_var": 17.113785807291666, + "learning_rate": 0.0001, + "loss": 10.6847, + "loss/crossentropy": 2.4851003885269165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.769305944442749, + "step": 100 + }, + { + "epoch": 0.00204, + "grad_norm": 10.75, + "grad_norm_var": 13.9140625, + "learning_rate": 0.0001, + "loss": 10.8119, + "loss/crossentropy": 2.2757182121276855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.8241511881351471, + "step": 102 + }, + { + "epoch": 0.00208, + "grad_norm": 11.375, + "grad_norm_var": 13.615738932291666, + "learning_rate": 0.0001, + "loss": 10.6244, + "loss/crossentropy": 2.7211785316467285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.8518709540367126, + "step": 104 + }, + { + "epoch": 0.00212, + "grad_norm": 11.625, + "grad_norm_var": 14.163395182291667, + "learning_rate": 0.0001, + "loss": 10.5629, + "loss/crossentropy": 2.387019991874695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.7568954229354858, + "step": 106 + }, + { + "epoch": 0.00216, + "grad_norm": 9.9375, + "grad_norm_var": 10.788525390625, + "learning_rate": 0.0001, + "loss": 10.1364, + "loss/crossentropy": 2.5363346338272095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.7648341059684753, + "step": 108 + }, + { + "epoch": 0.0022, + "grad_norm": 20.0, + "grad_norm_var": 14.898893229166667, + "learning_rate": 0.0001, + "loss": 10.8773, + "loss/crossentropy": 2.8450236320495605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.737193763256073, + "step": 110 + }, + { + "epoch": 0.00224, + "grad_norm": 10.25, + "grad_norm_var": 15.364518229166666, + "learning_rate": 0.0001, + "loss": 9.4554, + "loss/crossentropy": 2.4827451705932617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6924614012241364, + "step": 112 + }, + { + "epoch": 0.00228, + "grad_norm": 10.5625, + "grad_norm_var": 7.461832682291667, + "learning_rate": 0.0001, + "loss": 9.9651, + "loss/crossentropy": 2.093318462371826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.7038464546203613, + "step": 114 + }, + { + "epoch": 0.00232, + "grad_norm": 9.6875, + "grad_norm_var": 7.597330729166667, + "learning_rate": 0.0001, + "loss": 10.0297, + "loss/crossentropy": 2.5149790048599243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6980189085006714, + "step": 116 + }, + { + "epoch": 0.00236, + "grad_norm": 11.625, + "grad_norm_var": 6.672509765625, + "learning_rate": 0.0001, + "loss": 9.8176, + "loss/crossentropy": 2.6022276878356934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.7005251348018646, + "step": 118 + }, + { + "epoch": 0.0024, + "grad_norm": 7.625, + "grad_norm_var": 7.298160807291667, + "learning_rate": 0.0001, + "loss": 9.5658, + "loss/crossentropy": 2.6836462020874023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.7237774729728699, + "step": 120 + }, + { + "epoch": 0.00244, + "grad_norm": 9.5625, + "grad_norm_var": 7.402018229166667, + "learning_rate": 0.0001, + "loss": 9.7376, + "loss/crossentropy": 2.6823805570602417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.7570162117481232, + "step": 122 + }, + { + "epoch": 0.00248, + "grad_norm": 11.25, + "grad_norm_var": 7.391259765625, + "learning_rate": 0.0001, + "loss": 9.4713, + "loss/crossentropy": 2.6233514547348022, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.7183247208595276, + "step": 124 + }, + { + "epoch": 0.00252, + "grad_norm": 9.9375, + "grad_norm_var": 1.0839680989583333, + "learning_rate": 0.0001, + "loss": 9.2243, + "loss/crossentropy": 2.331676959991455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6982125043869019, + "step": 126 + }, + { + "epoch": 0.00256, + "grad_norm": 9.1875, + "grad_norm_var": 0.9687337239583333, + "learning_rate": 0.0001, + "loss": 9.4777, + "loss/crossentropy": 2.429046392440796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.696417510509491, + "step": 128 + }, + { + "epoch": 0.0026, + "grad_norm": 14.8125, + "grad_norm_var": 2.448291015625, + "learning_rate": 0.0001, + "loss": 9.9024, + "loss/crossentropy": 2.5262571573257446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.716008871793747, + "step": 130 + }, + { + "epoch": 0.00264, + "grad_norm": 8.8125, + "grad_norm_var": 2.7044270833333335, + "learning_rate": 0.0001, + "loss": 9.2836, + "loss/crossentropy": 2.187526524066925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.666053056716919, + "step": 132 + }, + { + "epoch": 0.00268, + "grad_norm": 8.9375, + "grad_norm_var": 3.285270182291667, + "learning_rate": 0.0001, + "loss": 9.8338, + "loss/crossentropy": 2.4199057817459106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6709816455841064, + "step": 134 + }, + { + "epoch": 0.00272, + "grad_norm": 9.6875, + "grad_norm_var": 3.4072916666666666, + "learning_rate": 0.0001, + "loss": 9.4225, + "loss/crossentropy": 2.1963008642196655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5963725447654724, + "step": 136 + }, + { + "epoch": 0.00276, + "grad_norm": 8.0625, + "grad_norm_var": 3.556884765625, + "learning_rate": 0.0001, + "loss": 9.44, + "loss/crossentropy": 2.5878132581710815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6362220048904419, + "step": 138 + }, + { + "epoch": 0.0028, + "grad_norm": 9.25, + "grad_norm_var": 3.4852701822916665, + "learning_rate": 0.0001, + "loss": 9.4314, + "loss/crossentropy": 2.7800480127334595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6819457113742828, + "step": 140 + }, + { + "epoch": 0.00284, + "grad_norm": 8.4375, + "grad_norm_var": 3.838997395833333, + "learning_rate": 0.0001, + "loss": 9.1047, + "loss/crossentropy": 2.5055110454559326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.674308180809021, + "step": 142 + }, + { + "epoch": 0.00288, + "grad_norm": 8.25, + "grad_norm_var": 4.078499348958333, + "learning_rate": 0.0001, + "loss": 9.1578, + "loss/crossentropy": 2.8532944917678833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.7083481848239899, + "step": 144 + }, + { + "epoch": 0.00292, + "grad_norm": 7.8125, + "grad_norm_var": 2.2759765625, + "learning_rate": 0.0001, + "loss": 8.8023, + "loss/crossentropy": 2.442527174949646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6510869562625885, + "step": 146 + }, + { + "epoch": 0.00296, + "grad_norm": 10.9375, + "grad_norm_var": 2.8544881184895834, + "learning_rate": 0.0001, + "loss": 8.7238, + "loss/crossentropy": 2.516597867012024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6362285614013672, + "step": 148 + }, + { + "epoch": 0.003, + "grad_norm": 7.40625, + "grad_norm_var": 1.8615885416666667, + "learning_rate": 0.0001, + "loss": 8.5543, + "loss/crossentropy": 2.8672900199890137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6459421515464783, + "step": 150 + }, + { + "epoch": 0.00304, + "grad_norm": 8.1875, + "grad_norm_var": 1.7195963541666666, + "learning_rate": 0.0001, + "loss": 8.6403, + "loss/crossentropy": 2.2042795419692993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5720505118370056, + "step": 152 + }, + { + "epoch": 0.00308, + "grad_norm": 7.8125, + "grad_norm_var": 1.876025390625, + "learning_rate": 0.0001, + "loss": 8.768, + "loss/crossentropy": 2.225563883781433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6328675150871277, + "step": 154 + }, + { + "epoch": 0.00312, + "grad_norm": 8.0625, + "grad_norm_var": 1.5925618489583333, + "learning_rate": 0.0001, + "loss": 8.5743, + "loss/crossentropy": 2.3541462421417236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5927431881427765, + "step": 156 + }, + { + "epoch": 0.00316, + "grad_norm": 7.34375, + "grad_norm_var": 1.834619140625, + "learning_rate": 0.0001, + "loss": 8.7329, + "loss/crossentropy": 2.4685616493225098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6299368739128113, + "step": 158 + }, + { + "epoch": 0.0032, + "grad_norm": 9.25, + "grad_norm_var": 1.4429646809895833, + "learning_rate": 0.0001, + "loss": 8.4796, + "loss/crossentropy": 2.4637919664382935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6084515154361725, + "step": 160 + }, + { + "epoch": 0.00324, + "grad_norm": 9.0625, + "grad_norm_var": 1.3692545572916666, + "learning_rate": 0.0001, + "loss": 9.0446, + "loss/crossentropy": 2.598397374153137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.574739396572113, + "step": 162 + }, + { + "epoch": 0.00328, + "grad_norm": 7.15625, + "grad_norm_var": 0.9171834309895833, + "learning_rate": 0.0001, + "loss": 8.1508, + "loss/crossentropy": 2.5183030366897583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5985860526561737, + "step": 164 + }, + { + "epoch": 0.00332, + "grad_norm": 9.125, + "grad_norm_var": 0.9571614583333333, + "learning_rate": 0.0001, + "loss": 8.4296, + "loss/crossentropy": 2.252183437347412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5627379417419434, + "step": 166 + }, + { + "epoch": 0.00336, + "grad_norm": 7.875, + "grad_norm_var": 0.70533447265625, + "learning_rate": 0.0001, + "loss": 8.4549, + "loss/crossentropy": 2.5720516443252563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5945309698581696, + "step": 168 + }, + { + "epoch": 0.0034, + "grad_norm": 9.1875, + "grad_norm_var": 0.8844034830729167, + "learning_rate": 0.0001, + "loss": 8.6096, + "loss/crossentropy": 2.3004332184791565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5401738286018372, + "step": 170 + }, + { + "epoch": 0.00344, + "grad_norm": 7.71875, + "grad_norm_var": 0.948681640625, + "learning_rate": 0.0001, + "loss": 8.5484, + "loss/crossentropy": 2.689734935760498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6384358406066895, + "step": 172 + }, + { + "epoch": 0.00348, + "grad_norm": 6.84375, + "grad_norm_var": 0.9418253580729167, + "learning_rate": 0.0001, + "loss": 8.1888, + "loss/crossentropy": 1.944397747516632, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5828913450241089, + "step": 174 + }, + { + "epoch": 0.00352, + "grad_norm": 6.53125, + "grad_norm_var": 0.8817342122395834, + "learning_rate": 0.0001, + "loss": 8.0577, + "loss/crossentropy": 2.8166507482528687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5916908979415894, + "step": 176 + }, + { + "epoch": 0.00356, + "grad_norm": 7.09375, + "grad_norm_var": 0.7456990559895833, + "learning_rate": 0.0001, + "loss": 8.7701, + "loss/crossentropy": 2.3208402395248413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5968939661979675, + "step": 178 + }, + { + "epoch": 0.0036, + "grad_norm": 7.25, + "grad_norm_var": 0.7425130208333334, + "learning_rate": 0.0001, + "loss": 8.2615, + "loss/crossentropy": 2.817763566970825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5800873041152954, + "step": 180 + }, + { + "epoch": 0.00364, + "grad_norm": 6.78125, + "grad_norm_var": 0.6126912434895834, + "learning_rate": 0.0001, + "loss": 8.3053, + "loss/crossentropy": 2.250023365020752, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5279016494750977, + "step": 182 + }, + { + "epoch": 0.00368, + "grad_norm": 6.09375, + "grad_norm_var": 0.6917805989583333, + "learning_rate": 0.0001, + "loss": 7.7974, + "loss/crossentropy": 2.1337096095085144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5393811166286469, + "step": 184 + }, + { + "epoch": 0.00372, + "grad_norm": 6.34375, + "grad_norm_var": 0.5962198893229167, + "learning_rate": 0.0001, + "loss": 7.7258, + "loss/crossentropy": 2.6338934898376465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6444519460201263, + "step": 186 + }, + { + "epoch": 0.00376, + "grad_norm": 10.4375, + "grad_norm_var": 1.2786458333333333, + "learning_rate": 0.0001, + "loss": 8.0762, + "loss/crossentropy": 2.66677463054657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6012931764125824, + "step": 188 + }, + { + "epoch": 0.0038, + "grad_norm": 6.375, + "grad_norm_var": 1.6642537434895834, + "learning_rate": 0.0001, + "loss": 8.3177, + "loss/crossentropy": 2.3731196522712708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6299596428871155, + "step": 190 + }, + { + "epoch": 0.00384, + "grad_norm": 7.4375, + "grad_norm_var": 1.7719889322916667, + "learning_rate": 0.0001, + "loss": 8.214, + "loss/crossentropy": 2.411492705345154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5328834354877472, + "step": 192 + }, + { + "epoch": 0.00388, + "grad_norm": 7.15625, + "grad_norm_var": 1.9489420572916667, + "learning_rate": 0.0001, + "loss": 7.9763, + "loss/crossentropy": 2.2402734756469727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.539243072271347, + "step": 194 + }, + { + "epoch": 0.00392, + "grad_norm": 7.53125, + "grad_norm_var": 1.895556640625, + "learning_rate": 0.0001, + "loss": 7.9292, + "loss/crossentropy": 2.3250681161880493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5374342203140259, + "step": 196 + }, + { + "epoch": 0.00396, + "grad_norm": 7.75, + "grad_norm_var": 1.8979451497395834, + "learning_rate": 0.0001, + "loss": 7.9201, + "loss/crossentropy": 2.42138135433197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5410921573638916, + "step": 198 + }, + { + "epoch": 0.004, + "grad_norm": 6.78125, + "grad_norm_var": 1.8954386393229166, + "learning_rate": 0.0001, + "loss": 7.7597, + "loss/crossentropy": 2.1954251527786255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.49367184937000275, + "step": 200 + }, + { + "epoch": 0.00404, + "grad_norm": 6.75, + "grad_norm_var": 1.6848307291666667, + "learning_rate": 0.0001, + "loss": 7.9033, + "loss/crossentropy": 2.81479811668396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5567455887794495, + "step": 202 + }, + { + "epoch": 0.00408, + "grad_norm": 6.5, + "grad_norm_var": 1.08228759765625, + "learning_rate": 0.0001, + "loss": 7.9812, + "loss/crossentropy": 2.611761450767517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.544409453868866, + "step": 204 + }, + { + "epoch": 0.00412, + "grad_norm": 6.125, + "grad_norm_var": 0.650244140625, + "learning_rate": 0.0001, + "loss": 7.7921, + "loss/crossentropy": 2.1369245052337646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5181446373462677, + "step": 206 + }, + { + "epoch": 0.00416, + "grad_norm": 7.1875, + "grad_norm_var": 0.30745035807291665, + "learning_rate": 0.0001, + "loss": 8.3375, + "loss/crossentropy": 2.435856580734253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5506252646446228, + "step": 208 + }, + { + "epoch": 0.0042, + "grad_norm": 6.21875, + "grad_norm_var": 0.26848958333333334, + "learning_rate": 0.0001, + "loss": 7.7599, + "loss/crossentropy": 2.2404768466949463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.46975645422935486, + "step": 210 + }, + { + "epoch": 0.00424, + "grad_norm": 6.71875, + "grad_norm_var": 0.19713541666666667, + "learning_rate": 0.0001, + "loss": 7.7083, + "loss/crossentropy": 2.4866777658462524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5217231214046478, + "step": 212 + }, + { + "epoch": 0.00428, + "grad_norm": 6.15625, + "grad_norm_var": 0.121484375, + "learning_rate": 0.0001, + "loss": 7.6519, + "loss/crossentropy": 2.074867010116577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4683506190776825, + "step": 214 + }, + { + "epoch": 0.00432, + "grad_norm": 7.59375, + "grad_norm_var": 0.21679280598958334, + "learning_rate": 0.0001, + "loss": 7.6062, + "loss/crossentropy": 2.2040151357650757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5235520601272583, + "step": 216 + }, + { + "epoch": 0.00436, + "grad_norm": 7.25, + "grad_norm_var": 0.25690104166666666, + "learning_rate": 0.0001, + "loss": 7.886, + "loss/crossentropy": 2.174479365348816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5243187248706818, + "step": 218 + }, + { + "epoch": 0.0044, + "grad_norm": 6.28125, + "grad_norm_var": 0.26523030598958336, + "learning_rate": 0.0001, + "loss": 7.7535, + "loss/crossentropy": 2.5678584575653076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5818615555763245, + "step": 220 + }, + { + "epoch": 0.00444, + "grad_norm": 6.0625, + "grad_norm_var": 0.2814412434895833, + "learning_rate": 0.0001, + "loss": 7.859, + "loss/crossentropy": 2.4551891088485718, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5400412082672119, + "step": 222 + }, + { + "epoch": 0.00448, + "grad_norm": 6.15625, + "grad_norm_var": 0.25310872395833334, + "learning_rate": 0.0001, + "loss": 7.7341, + "loss/crossentropy": 2.0638335943222046, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.49279752373695374, + "step": 224 + }, + { + "epoch": 0.00452, + "grad_norm": 6.6875, + "grad_norm_var": 0.38235677083333336, + "learning_rate": 0.0001, + "loss": 8.1064, + "loss/crossentropy": 2.553247332572937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5566797852516174, + "step": 226 + }, + { + "epoch": 0.00456, + "grad_norm": 6.0625, + "grad_norm_var": 0.39451497395833335, + "learning_rate": 0.0001, + "loss": 7.7812, + "loss/crossentropy": 2.544332265853882, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5118530094623566, + "step": 228 + }, + { + "epoch": 0.0046, + "grad_norm": 5.875, + "grad_norm_var": 0.4054036458333333, + "learning_rate": 0.0001, + "loss": 7.064, + "loss/crossentropy": 2.191234052181244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5157675743103027, + "step": 230 + }, + { + "epoch": 0.00464, + "grad_norm": 5.4375, + "grad_norm_var": 0.360791015625, + "learning_rate": 0.0001, + "loss": 7.6611, + "loss/crossentropy": 2.3671151399612427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5233491659164429, + "step": 232 + }, + { + "epoch": 0.00468, + "grad_norm": 7.125, + "grad_norm_var": 0.4180826822916667, + "learning_rate": 0.0001, + "loss": 7.4509, + "loss/crossentropy": 2.3003920316696167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5037694871425629, + "step": 234 + }, + { + "epoch": 0.00472, + "grad_norm": 5.625, + "grad_norm_var": 0.4305338541666667, + "learning_rate": 0.0001, + "loss": 7.8236, + "loss/crossentropy": 2.4672670364379883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5187652707099915, + "step": 236 + }, + { + "epoch": 0.00476, + "grad_norm": 6.0625, + "grad_norm_var": 0.4493326822916667, + "learning_rate": 0.0001, + "loss": 7.3246, + "loss/crossentropy": 2.179289937019348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5111505687236786, + "step": 238 + }, + { + "epoch": 0.0048, + "grad_norm": 6.34375, + "grad_norm_var": 0.5123697916666666, + "learning_rate": 0.0001, + "loss": 7.6064, + "loss/crossentropy": 2.2424627542495728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5187103897333145, + "step": 240 + }, + { + "epoch": 0.00484, + "grad_norm": 5.96875, + "grad_norm_var": 0.2986979166666667, + "learning_rate": 0.0001, + "loss": 7.8108, + "loss/crossentropy": 2.8024520874023438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5700482130050659, + "step": 242 + }, + { + "epoch": 0.00488, + "grad_norm": 6.25, + "grad_norm_var": 0.4554524739583333, + "learning_rate": 0.0001, + "loss": 7.6644, + "loss/crossentropy": 2.3653491735458374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5232449471950531, + "step": 244 + }, + { + "epoch": 0.00492, + "grad_norm": 6.90625, + "grad_norm_var": 0.48513997395833336, + "learning_rate": 0.0001, + "loss": 7.5028, + "loss/crossentropy": 2.6201778650283813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5684538185596466, + "step": 246 + }, + { + "epoch": 0.00496, + "grad_norm": 7.3125, + "grad_norm_var": 0.49654541015625, + "learning_rate": 0.0001, + "loss": 7.6631, + "loss/crossentropy": 2.2811471819877625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5543638169765472, + "step": 248 + }, + { + "epoch": 0.005, + "grad_norm": 5.625, + "grad_norm_var": 0.48544514973958336, + "learning_rate": 0.0001, + "loss": 7.6914, + "loss/crossentropy": 2.4381459951400757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5594777166843414, + "step": 250 + }, + { + "epoch": 0.00504, + "grad_norm": 9.375, + "grad_norm_var": 1.04000244140625, + "learning_rate": 0.0001, + "loss": 7.7346, + "loss/crossentropy": 2.435782313346863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5159732699394226, + "step": 252 + }, + { + "epoch": 0.00508, + "grad_norm": 5.46875, + "grad_norm_var": 1.0892862955729166, + "learning_rate": 0.0001, + "loss": 8.0282, + "loss/crossentropy": 2.7867215871810913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.551640123128891, + "step": 254 + }, + { + "epoch": 0.00512, + "grad_norm": 5.46875, + "grad_norm_var": 1.2432902018229166, + "learning_rate": 0.0001, + "loss": 7.0744, + "loss/crossentropy": 1.9328945875167847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.45153285562992096, + "step": 256 + }, + { + "epoch": 0.00516, + "grad_norm": 6.21875, + "grad_norm_var": 1.2460286458333334, + "learning_rate": 0.0001, + "loss": 7.292, + "loss/crossentropy": 2.552613139152527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5215992629528046, + "step": 258 + }, + { + "epoch": 0.0052, + "grad_norm": 5.40625, + "grad_norm_var": 1.1848958333333333, + "learning_rate": 0.0001, + "loss": 7.4992, + "loss/crossentropy": 2.3720492124557495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5024219453334808, + "step": 260 + }, + { + "epoch": 0.00524, + "grad_norm": 8.1875, + "grad_norm_var": 1.4898274739583333, + "learning_rate": 0.0001, + "loss": 7.4676, + "loss/crossentropy": 2.465815782546997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5383751839399338, + "step": 262 + }, + { + "epoch": 0.00528, + "grad_norm": 6.0625, + "grad_norm_var": 1.5113240559895833, + "learning_rate": 0.0001, + "loss": 7.3163, + "loss/crossentropy": 2.2791935205459595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.534159854054451, + "step": 264 + }, + { + "epoch": 0.00532, + "grad_norm": 6.28125, + "grad_norm_var": 1.3855305989583333, + "learning_rate": 0.0001, + "loss": 7.9279, + "loss/crossentropy": 2.48906409740448, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5344790518283844, + "step": 266 + }, + { + "epoch": 0.00536, + "grad_norm": 4.90625, + "grad_norm_var": 1.0180826822916667, + "learning_rate": 0.0001, + "loss": 7.3178, + "loss/crossentropy": 2.0858306884765625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.47646892070770264, + "step": 268 + }, + { + "epoch": 0.0054, + "grad_norm": 8.375, + "grad_norm_var": 1.18258056640625, + "learning_rate": 0.0001, + "loss": 7.383, + "loss/crossentropy": 2.159322738647461, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5543113648891449, + "step": 270 + }, + { + "epoch": 0.00544, + "grad_norm": 4.9375, + "grad_norm_var": 1.204931640625, + "learning_rate": 0.0001, + "loss": 7.1635, + "loss/crossentropy": 2.249913454055786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4896702766418457, + "step": 272 + }, + { + "epoch": 0.00548, + "grad_norm": 8.125, + "grad_norm_var": 1.388916015625, + "learning_rate": 0.0001, + "loss": 7.3565, + "loss/crossentropy": 1.998712420463562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4770403504371643, + "step": 274 + }, + { + "epoch": 0.00552, + "grad_norm": 5.53125, + "grad_norm_var": 1.9266764322916667, + "learning_rate": 0.0001, + "loss": 7.6522, + "loss/crossentropy": 2.391260862350464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5259605348110199, + "step": 276 + }, + { + "epoch": 0.00556, + "grad_norm": 6.1875, + "grad_norm_var": 1.6884765625, + "learning_rate": 0.0001, + "loss": 7.2935, + "loss/crossentropy": 2.523361325263977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4724537879228592, + "step": 278 + }, + { + "epoch": 0.0056, + "grad_norm": 5.34375, + "grad_norm_var": 1.7386555989583334, + "learning_rate": 0.0001, + "loss": 7.3505, + "loss/crossentropy": 2.281963586807251, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.49724647402763367, + "step": 280 + }, + { + "epoch": 0.00564, + "grad_norm": 5.0625, + "grad_norm_var": 1.8446451822916667, + "learning_rate": 0.0001, + "loss": 7.1079, + "loss/crossentropy": 2.2403814792633057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.45384156703948975, + "step": 282 + }, + { + "epoch": 0.00568, + "grad_norm": 5.6875, + "grad_norm_var": 1.670166015625, + "learning_rate": 0.0001, + "loss": 7.4318, + "loss/crossentropy": 2.2687963247299194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.46648281812667847, + "step": 284 + }, + { + "epoch": 0.00572, + "grad_norm": 5.75, + "grad_norm_var": 1.38541259765625, + "learning_rate": 0.0001, + "loss": 7.3035, + "loss/crossentropy": 2.3336217403411865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4792183041572571, + "step": 286 + }, + { + "epoch": 0.00576, + "grad_norm": 6.4375, + "grad_norm_var": 1.292431640625, + "learning_rate": 0.0001, + "loss": 7.0031, + "loss/crossentropy": 2.4006571769714355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.473391056060791, + "step": 288 + }, + { + "epoch": 0.0058, + "grad_norm": 5.09375, + "grad_norm_var": 1.18671875, + "learning_rate": 0.0001, + "loss": 6.8037, + "loss/crossentropy": 2.0306124687194824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4722501188516617, + "step": 290 + }, + { + "epoch": 0.00584, + "grad_norm": 5.34375, + "grad_norm_var": 0.38865559895833335, + "learning_rate": 0.0001, + "loss": 7.1378, + "loss/crossentropy": 2.412277102470398, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.48340730369091034, + "step": 292 + }, + { + "epoch": 0.00588, + "grad_norm": 6.21875, + "grad_norm_var": 0.3952433268229167, + "learning_rate": 0.0001, + "loss": 7.3589, + "loss/crossentropy": 2.3195769786834717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5074818134307861, + "step": 294 + }, + { + "epoch": 0.00592, + "grad_norm": 6.71875, + "grad_norm_var": 0.2867024739583333, + "learning_rate": 0.0001, + "loss": 7.2826, + "loss/crossentropy": 2.3265275955200195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5351093411445618, + "step": 296 + }, + { + "epoch": 0.00596, + "grad_norm": 5.625, + "grad_norm_var": 0.26171468098958334, + "learning_rate": 0.0001, + "loss": 7.0106, + "loss/crossentropy": 2.210574746131897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.45469868183135986, + "step": 298 + }, + { + "epoch": 0.006, + "grad_norm": 6.3125, + "grad_norm_var": 0.3034138997395833, + "learning_rate": 0.0001, + "loss": 7.4741, + "loss/crossentropy": 2.347964644432068, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.45269395411014557, + "step": 300 + }, + { + "epoch": 0.00604, + "grad_norm": 5.0, + "grad_norm_var": 0.34308268229166666, + "learning_rate": 0.0001, + "loss": 6.8901, + "loss/crossentropy": 2.197494626045227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.45010584592819214, + "step": 302 + }, + { + "epoch": 0.00608, + "grad_norm": 5.8125, + "grad_norm_var": 0.32224934895833335, + "learning_rate": 0.0001, + "loss": 6.909, + "loss/crossentropy": 2.3295196890830994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.48989084362983704, + "step": 304 + }, + { + "epoch": 0.00612, + "grad_norm": 6.3125, + "grad_norm_var": 0.26347249348958335, + "learning_rate": 0.0001, + "loss": 7.6606, + "loss/crossentropy": 2.5208678245544434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.46991507709026337, + "step": 306 + }, + { + "epoch": 0.00616, + "grad_norm": 5.40625, + "grad_norm_var": 0.22849934895833332, + "learning_rate": 0.0001, + "loss": 7.2881, + "loss/crossentropy": 2.6091307401657104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5090319812297821, + "step": 308 + }, + { + "epoch": 0.0062, + "grad_norm": 5.4375, + "grad_norm_var": 0.3083984375, + "learning_rate": 0.0001, + "loss": 7.0888, + "loss/crossentropy": 2.4142966270446777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.45959460735321045, + "step": 310 + }, + { + "epoch": 0.00624, + "grad_norm": 6.0625, + "grad_norm_var": 0.25006103515625, + "learning_rate": 0.0001, + "loss": 7.3054, + "loss/crossentropy": 2.3062673807144165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4381408095359802, + "step": 312 + }, + { + "epoch": 0.00628, + "grad_norm": 4.875, + "grad_norm_var": 0.29498697916666666, + "learning_rate": 0.0001, + "loss": 6.5202, + "loss/crossentropy": 2.1124885082244873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4392775595188141, + "step": 314 + }, + { + "epoch": 0.00632, + "grad_norm": 5.09375, + "grad_norm_var": 0.3001302083333333, + "learning_rate": 0.0001, + "loss": 6.3297, + "loss/crossentropy": 2.0250568985939026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4133095294237137, + "step": 316 + }, + { + "epoch": 0.00636, + "grad_norm": 5.625, + "grad_norm_var": 0.31021728515625, + "learning_rate": 0.0001, + "loss": 6.9903, + "loss/crossentropy": 2.4011316299438477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.46330246329307556, + "step": 318 + }, + { + "epoch": 0.0064, + "grad_norm": 5.65625, + "grad_norm_var": 0.30305582682291665, + "learning_rate": 0.0001, + "loss": 7.2114, + "loss/crossentropy": 2.487559676170349, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.47237157821655273, + "step": 320 + }, + { + "epoch": 0.00644, + "grad_norm": 5.1875, + "grad_norm_var": 0.2775349934895833, + "learning_rate": 0.0001, + "loss": 6.5935, + "loss/crossentropy": 1.999566912651062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4165455400943756, + "step": 322 + }, + { + "epoch": 0.00648, + "grad_norm": 6.03125, + "grad_norm_var": 0.27496337890625, + "learning_rate": 0.0001, + "loss": 7.0573, + "loss/crossentropy": 2.545841693878174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4725654572248459, + "step": 324 + }, + { + "epoch": 0.00652, + "grad_norm": 5.0625, + "grad_norm_var": 0.2528483072916667, + "learning_rate": 0.0001, + "loss": 7.2351, + "loss/crossentropy": 2.119086444377899, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4649319499731064, + "step": 326 + }, + { + "epoch": 0.00656, + "grad_norm": 5.21875, + "grad_norm_var": 0.221728515625, + "learning_rate": 0.0001, + "loss": 6.8367, + "loss/crossentropy": 2.365525245666504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5121739506721497, + "step": 328 + }, + { + "epoch": 0.0066, + "grad_norm": 5.25, + "grad_norm_var": 0.17467041015625, + "learning_rate": 0.0001, + "loss": 6.8384, + "loss/crossentropy": 2.2604740858078003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4704676419496536, + "step": 330 + }, + { + "epoch": 0.00664, + "grad_norm": 6.15625, + "grad_norm_var": 0.18977864583333334, + "learning_rate": 0.0001, + "loss": 7.5125, + "loss/crossentropy": 2.4891955852508545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5196183770895004, + "step": 332 + }, + { + "epoch": 0.00668, + "grad_norm": 5.46875, + "grad_norm_var": 0.1767578125, + "learning_rate": 0.0001, + "loss": 7.3139, + "loss/crossentropy": 2.430082321166992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.47671228647232056, + "step": 334 + }, + { + "epoch": 0.00672, + "grad_norm": 5.53125, + "grad_norm_var": 0.187353515625, + "learning_rate": 0.0001, + "loss": 6.6969, + "loss/crossentropy": 2.2450510263442993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5124029517173767, + "step": 336 + }, + { + "epoch": 0.00676, + "grad_norm": 5.375, + "grad_norm_var": 0.18251546223958334, + "learning_rate": 0.0001, + "loss": 6.8537, + "loss/crossentropy": 2.225212812423706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4358299970626831, + "step": 338 + }, + { + "epoch": 0.0068, + "grad_norm": 6.71875, + "grad_norm_var": 26.47734375, + "learning_rate": 0.0001, + "loss": 6.8775, + "loss/crossentropy": 2.320846140384674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.42757023870944977, + "step": 340 + }, + { + "epoch": 0.00684, + "grad_norm": 4.84375, + "grad_norm_var": 26.739453125, + "learning_rate": 0.0001, + "loss": 6.7394, + "loss/crossentropy": 2.419093132019043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4633703678846359, + "step": 342 + }, + { + "epoch": 0.00688, + "grad_norm": 5.3125, + "grad_norm_var": 26.632405598958332, + "learning_rate": 0.0001, + "loss": 6.7304, + "loss/crossentropy": 1.939517080783844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4387335330247879, + "step": 344 + }, + { + "epoch": 0.00692, + "grad_norm": 5.75, + "grad_norm_var": 26.504410807291666, + "learning_rate": 0.0001, + "loss": 7.0914, + "loss/crossentropy": 2.695888638496399, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.48031529784202576, + "step": 346 + }, + { + "epoch": 0.00696, + "grad_norm": 6.625, + "grad_norm_var": 26.468094889322916, + "learning_rate": 0.0001, + "loss": 6.8381, + "loss/crossentropy": 2.245330333709717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5032568573951721, + "step": 348 + }, + { + "epoch": 0.007, + "grad_norm": 4.28125, + "grad_norm_var": 26.707421875, + "learning_rate": 0.0001, + "loss": 6.4774, + "loss/crossentropy": 1.9668607115745544, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4390462785959244, + "step": 350 + }, + { + "epoch": 0.00704, + "grad_norm": 5.25, + "grad_norm_var": 26.92008056640625, + "learning_rate": 0.0001, + "loss": 6.7795, + "loss/crossentropy": 2.4035123586654663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.43734824657440186, + "step": 352 + }, + { + "epoch": 0.00708, + "grad_norm": 6.5, + "grad_norm_var": 26.851460774739582, + "learning_rate": 0.0001, + "loss": 7.3932, + "loss/crossentropy": 2.4636529684066772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5208825469017029, + "step": 354 + }, + { + "epoch": 0.00712, + "grad_norm": 6.4375, + "grad_norm_var": 0.5743123372395833, + "learning_rate": 0.0001, + "loss": 6.8482, + "loss/crossentropy": 2.085066556930542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3971068561077118, + "step": 356 + }, + { + "epoch": 0.00716, + "grad_norm": 4.59375, + "grad_norm_var": 0.57847900390625, + "learning_rate": 0.0001, + "loss": 6.8962, + "loss/crossentropy": 2.194266200065613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.41747787594795227, + "step": 358 + }, + { + "epoch": 0.0072, + "grad_norm": 4.59375, + "grad_norm_var": 0.649462890625, + "learning_rate": 0.0001, + "loss": 6.7381, + "loss/crossentropy": 2.4678618907928467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4633233994245529, + "step": 360 + }, + { + "epoch": 0.00724, + "grad_norm": 5.59375, + "grad_norm_var": 0.6476847330729166, + "learning_rate": 0.0001, + "loss": 6.5748, + "loss/crossentropy": 2.362962484359741, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4360974431037903, + "step": 362 + }, + { + "epoch": 0.00728, + "grad_norm": 5.6875, + "grad_norm_var": 0.5360514322916666, + "learning_rate": 0.0001, + "loss": 7.3497, + "loss/crossentropy": 2.3162096738815308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4641396403312683, + "step": 364 + }, + { + "epoch": 0.00732, + "grad_norm": 6.0625, + "grad_norm_var": 0.4325358072916667, + "learning_rate": 0.0001, + "loss": 7.0856, + "loss/crossentropy": 2.279396176338196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4331911951303482, + "step": 366 + }, + { + "epoch": 0.00736, + "grad_norm": 4.96875, + "grad_norm_var": 0.377587890625, + "learning_rate": 0.0001, + "loss": 6.8288, + "loss/crossentropy": 2.333961606025696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.43770918250083923, + "step": 368 + }, + { + "epoch": 0.0074, + "grad_norm": 5.1875, + "grad_norm_var": 0.27955322265625, + "learning_rate": 0.0001, + "loss": 6.9245, + "loss/crossentropy": 2.130259871482849, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4134685546159744, + "step": 370 + }, + { + "epoch": 0.00744, + "grad_norm": 5.59375, + "grad_norm_var": 0.19478759765625, + "learning_rate": 0.0001, + "loss": 6.4884, + "loss/crossentropy": 2.3000282049179077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.46207693219184875, + "step": 372 + }, + { + "epoch": 0.00748, + "grad_norm": 5.59375, + "grad_norm_var": 0.20399983723958334, + "learning_rate": 0.0001, + "loss": 7.3714, + "loss/crossentropy": 2.687412142753601, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.46891947090625763, + "step": 374 + }, + { + "epoch": 0.00752, + "grad_norm": 4.28125, + "grad_norm_var": 0.21962483723958334, + "learning_rate": 0.0001, + "loss": 6.4194, + "loss/crossentropy": 2.2366563081741333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.42876073718070984, + "step": 376 + }, + { + "epoch": 0.00756, + "grad_norm": 5.71875, + "grad_norm_var": 0.23108317057291666, + "learning_rate": 0.0001, + "loss": 7.0141, + "loss/crossentropy": 2.5960274934768677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4519210159778595, + "step": 378 + }, + { + "epoch": 0.0076, + "grad_norm": 5.71875, + "grad_norm_var": 0.23435872395833332, + "learning_rate": 0.0001, + "loss": 6.9654, + "loss/crossentropy": 2.4690704345703125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5009289383888245, + "step": 380 + }, + { + "epoch": 0.00764, + "grad_norm": 5.25, + "grad_norm_var": 0.17952067057291668, + "learning_rate": 0.0001, + "loss": 6.6068, + "loss/crossentropy": 2.188890814781189, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4912077784538269, + "step": 382 + }, + { + "epoch": 0.00768, + "grad_norm": 5.15625, + "grad_norm_var": 0.19055582682291666, + "learning_rate": 0.0001, + "loss": 6.5789, + "loss/crossentropy": 2.2374125719070435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4437579810619354, + "step": 384 + }, + { + "epoch": 0.00772, + "grad_norm": 5.1875, + "grad_norm_var": 0.19308268229166667, + "learning_rate": 0.0001, + "loss": 6.8081, + "loss/crossentropy": 2.101546287536621, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.39717453718185425, + "step": 386 + }, + { + "epoch": 0.00776, + "grad_norm": 5.1875, + "grad_norm_var": 0.182666015625, + "learning_rate": 0.0001, + "loss": 6.5688, + "loss/crossentropy": 2.2907408475875854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4657403528690338, + "step": 388 + }, + { + "epoch": 0.0078, + "grad_norm": 4.53125, + "grad_norm_var": 0.18203125, + "learning_rate": 0.0001, + "loss": 6.4664, + "loss/crossentropy": 1.9909976720809937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.39810416102409363, + "step": 390 + }, + { + "epoch": 0.00784, + "grad_norm": 6.28125, + "grad_norm_var": 0.23746337890625, + "learning_rate": 0.0001, + "loss": 6.6015, + "loss/crossentropy": 2.109456777572632, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4226441979408264, + "step": 392 + }, + { + "epoch": 0.00788, + "grad_norm": 6.03125, + "grad_norm_var": 0.26549072265625, + "learning_rate": 0.0001, + "loss": 7.2592, + "loss/crossentropy": 2.330615997314453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5082881152629852, + "step": 394 + }, + { + "epoch": 0.00792, + "grad_norm": 5.90625, + "grad_norm_var": 0.49332275390625, + "learning_rate": 0.0001, + "loss": 7.301, + "loss/crossentropy": 2.37632155418396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5454596877098083, + "step": 396 + }, + { + "epoch": 0.00796, + "grad_norm": 4.9375, + "grad_norm_var": 0.5669921875, + "learning_rate": 0.0001, + "loss": 6.6057, + "loss/crossentropy": 2.0303866863250732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3780263066291809, + "step": 398 + }, + { + "epoch": 0.008, + "grad_norm": 4.1875, + "grad_norm_var": 0.6671712239583333, + "learning_rate": 0.0001, + "loss": 6.8129, + "loss/crossentropy": 2.077945590019226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4111042767763138, + "step": 400 + }, + { + "epoch": 0.00804, + "grad_norm": 5.5625, + "grad_norm_var": 0.6867146809895833, + "learning_rate": 0.0001, + "loss": 6.919, + "loss/crossentropy": 2.3042391538619995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.44224822521209717, + "step": 402 + }, + { + "epoch": 0.00808, + "grad_norm": 4.3125, + "grad_norm_var": 0.74537353515625, + "learning_rate": 0.0001, + "loss": 6.4785, + "loss/crossentropy": 2.15978467464447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.44845885038375854, + "step": 404 + }, + { + "epoch": 0.00812, + "grad_norm": 4.90625, + "grad_norm_var": 0.69576416015625, + "learning_rate": 0.0001, + "loss": 6.3875, + "loss/crossentropy": 2.4571259021759033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4629521369934082, + "step": 406 + }, + { + "epoch": 0.00816, + "grad_norm": 5.40625, + "grad_norm_var": 0.6654947916666667, + "learning_rate": 0.0001, + "loss": 7.1165, + "loss/crossentropy": 2.653234601020813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5556869208812714, + "step": 408 + }, + { + "epoch": 0.0082, + "grad_norm": 4.53125, + "grad_norm_var": 0.66138916015625, + "learning_rate": 0.0001, + "loss": 6.6643, + "loss/crossentropy": 1.9738067388534546, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.42570993304252625, + "step": 410 + }, + { + "epoch": 0.00824, + "grad_norm": 4.0625, + "grad_norm_var": 0.46920166015625, + "learning_rate": 0.0001, + "loss": 6.2205, + "loss/crossentropy": 2.093988060951233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40544557571411133, + "step": 412 + }, + { + "epoch": 0.00828, + "grad_norm": 4.34375, + "grad_norm_var": 0.36213785807291665, + "learning_rate": 0.0001, + "loss": 6.6356, + "loss/crossentropy": 2.4798851013183594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4204765260219574, + "step": 414 + }, + { + "epoch": 0.00832, + "grad_norm": 5.625, + "grad_norm_var": 0.3986287434895833, + "learning_rate": 0.0001, + "loss": 6.5601, + "loss/crossentropy": 2.4342020750045776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5148662775754929, + "step": 416 + }, + { + "epoch": 0.00836, + "grad_norm": 5.40625, + "grad_norm_var": 0.3985514322916667, + "learning_rate": 0.0001, + "loss": 6.7757, + "loss/crossentropy": 2.3637804985046387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.45475171506404877, + "step": 418 + }, + { + "epoch": 0.0084, + "grad_norm": 4.09375, + "grad_norm_var": 0.39586181640625, + "learning_rate": 0.0001, + "loss": 6.6923, + "loss/crossentropy": 2.4066261053085327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.45181581377983093, + "step": 420 + }, + { + "epoch": 0.00844, + "grad_norm": 4.0, + "grad_norm_var": 0.43176676432291666, + "learning_rate": 0.0001, + "loss": 6.2428, + "loss/crossentropy": 2.1273797750473022, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3976929485797882, + "step": 422 + }, + { + "epoch": 0.00848, + "grad_norm": 4.90625, + "grad_norm_var": 0.26330973307291666, + "learning_rate": 0.0001, + "loss": 6.6524, + "loss/crossentropy": 2.4227113723754883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.46154454350471497, + "step": 424 + }, + { + "epoch": 0.00852, + "grad_norm": 4.65625, + "grad_norm_var": 0.261181640625, + "learning_rate": 0.0001, + "loss": 6.6558, + "loss/crossentropy": 2.3502479791641235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.43580910563468933, + "step": 426 + }, + { + "epoch": 0.00856, + "grad_norm": 5.3125, + "grad_norm_var": 0.26236572265625, + "learning_rate": 0.0001, + "loss": 6.903, + "loss/crossentropy": 2.5034282207489014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4730219095945358, + "step": 428 + }, + { + "epoch": 0.0086, + "grad_norm": 4.40625, + "grad_norm_var": 0.2557576497395833, + "learning_rate": 0.0001, + "loss": 6.2148, + "loss/crossentropy": 2.0902098417282104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.409458264708519, + "step": 430 + }, + { + "epoch": 0.00864, + "grad_norm": 4.9375, + "grad_norm_var": 0.19420166015625, + "learning_rate": 0.0001, + "loss": 6.4634, + "loss/crossentropy": 2.2203429341316223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.41066519916057587, + "step": 432 + }, + { + "epoch": 0.00868, + "grad_norm": 5.03125, + "grad_norm_var": 0.14855143229166667, + "learning_rate": 0.0001, + "loss": 6.6943, + "loss/crossentropy": 2.568304419517517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.48846572637557983, + "step": 434 + }, + { + "epoch": 0.00872, + "grad_norm": 5.15625, + "grad_norm_var": 0.13489176432291666, + "learning_rate": 0.0001, + "loss": 6.4829, + "loss/crossentropy": 2.359646439552307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.43610572814941406, + "step": 436 + }, + { + "epoch": 0.00876, + "grad_norm": 5.03125, + "grad_norm_var": 0.08870035807291667, + "learning_rate": 0.0001, + "loss": 6.6119, + "loss/crossentropy": 2.2751121520996094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.45837563276290894, + "step": 438 + }, + { + "epoch": 0.0088, + "grad_norm": 4.125, + "grad_norm_var": 0.13665364583333334, + "learning_rate": 0.0001, + "loss": 6.5338, + "loss/crossentropy": 2.334506392478943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4196038395166397, + "step": 440 + }, + { + "epoch": 0.00884, + "grad_norm": 4.1875, + "grad_norm_var": 0.15891927083333332, + "learning_rate": 0.0001, + "loss": 6.2206, + "loss/crossentropy": 1.9731069803237915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.41661541163921356, + "step": 442 + }, + { + "epoch": 0.00888, + "grad_norm": 5.03125, + "grad_norm_var": 0.13339436848958333, + "learning_rate": 0.0001, + "loss": 6.3377, + "loss/crossentropy": 2.319058418273926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4656156301498413, + "step": 444 + }, + { + "epoch": 0.00892, + "grad_norm": 4.125, + "grad_norm_var": 0.15013020833333332, + "learning_rate": 0.0001, + "loss": 6.5345, + "loss/crossentropy": 2.309122085571289, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.41400712728500366, + "step": 446 + }, + { + "epoch": 0.00896, + "grad_norm": 4.53125, + "grad_norm_var": 0.14468994140625, + "learning_rate": 0.0001, + "loss": 6.5385, + "loss/crossentropy": 1.867617905139923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.39080700278282166, + "step": 448 + }, + { + "epoch": 0.009, + "grad_norm": 5.21875, + "grad_norm_var": 0.79830322265625, + "learning_rate": 0.0001, + "loss": 6.799, + "loss/crossentropy": 2.205033838748932, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4887084364891052, + "step": 450 + }, + { + "epoch": 0.00904, + "grad_norm": 4.9375, + "grad_norm_var": 0.826171875, + "learning_rate": 0.0001, + "loss": 6.6476, + "loss/crossentropy": 2.3054174184799194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4140937328338623, + "step": 452 + }, + { + "epoch": 0.00908, + "grad_norm": 4.53125, + "grad_norm_var": 0.8572224934895833, + "learning_rate": 0.0001, + "loss": 6.7036, + "loss/crossentropy": 2.1358219981193542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3849467635154724, + "step": 454 + }, + { + "epoch": 0.00912, + "grad_norm": 4.9375, + "grad_norm_var": 0.8132120768229166, + "learning_rate": 0.0001, + "loss": 6.4024, + "loss/crossentropy": 1.9811997413635254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.42263032495975494, + "step": 456 + }, + { + "epoch": 0.00916, + "grad_norm": 4.8125, + "grad_norm_var": 0.7787109375, + "learning_rate": 0.0001, + "loss": 6.8381, + "loss/crossentropy": 2.319555103778839, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4539293050765991, + "step": 458 + }, + { + "epoch": 0.0092, + "grad_norm": 4.6875, + "grad_norm_var": 0.7744425455729167, + "learning_rate": 0.0001, + "loss": 6.72, + "loss/crossentropy": 2.4030569791793823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.41373930871486664, + "step": 460 + }, + { + "epoch": 0.00924, + "grad_norm": 4.8125, + "grad_norm_var": 0.7218587239583333, + "learning_rate": 0.0001, + "loss": 6.7376, + "loss/crossentropy": 2.4479328393936157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.447835311293602, + "step": 462 + }, + { + "epoch": 0.00928, + "grad_norm": 5.5, + "grad_norm_var": 0.7289021809895834, + "learning_rate": 0.0001, + "loss": 7.0562, + "loss/crossentropy": 2.056324601173401, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4374549984931946, + "step": 464 + }, + { + "epoch": 0.00932, + "grad_norm": 4.28125, + "grad_norm_var": 0.19256184895833334, + "learning_rate": 0.0001, + "loss": 6.6994, + "loss/crossentropy": 2.4104079008102417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4148041307926178, + "step": 466 + }, + { + "epoch": 0.00936, + "grad_norm": 4.75, + "grad_norm_var": 0.13527018229166668, + "learning_rate": 0.0001, + "loss": 7.0758, + "loss/crossentropy": 2.734652876853943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.511719822883606, + "step": 468 + }, + { + "epoch": 0.0094, + "grad_norm": 5.0, + "grad_norm_var": 0.13118082682291668, + "learning_rate": 0.0001, + "loss": 6.4772, + "loss/crossentropy": 2.1748571395874023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3852091133594513, + "step": 470 + }, + { + "epoch": 0.00944, + "grad_norm": 4.625, + "grad_norm_var": 0.13912353515625, + "learning_rate": 0.0001, + "loss": 6.7127, + "loss/crossentropy": 2.476449966430664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4194856435060501, + "step": 472 + }, + { + "epoch": 0.00948, + "grad_norm": 4.40625, + "grad_norm_var": 0.16887613932291667, + "learning_rate": 0.0001, + "loss": 6.494, + "loss/crossentropy": 2.6383973360061646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.44083887338638306, + "step": 474 + }, + { + "epoch": 0.00952, + "grad_norm": 4.375, + "grad_norm_var": 0.18435872395833333, + "learning_rate": 0.0001, + "loss": 6.3184, + "loss/crossentropy": 2.3149259090423584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3863615244626999, + "step": 476 + }, + { + "epoch": 0.00956, + "grad_norm": 5.0625, + "grad_norm_var": 0.18388264973958332, + "learning_rate": 0.0001, + "loss": 6.6217, + "loss/crossentropy": 2.3096635341644287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40995509922504425, + "step": 478 + }, + { + "epoch": 0.0096, + "grad_norm": 4.34375, + "grad_norm_var": 0.10846354166666666, + "learning_rate": 0.0001, + "loss": 6.4849, + "loss/crossentropy": 2.6495853662490845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.412080317735672, + "step": 480 + }, + { + "epoch": 0.00964, + "grad_norm": 5.21875, + "grad_norm_var": 0.13948160807291668, + "learning_rate": 0.0001, + "loss": 6.9293, + "loss/crossentropy": 2.445754885673523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.45622071623802185, + "step": 482 + }, + { + "epoch": 0.00968, + "grad_norm": 4.40625, + "grad_norm_var": 0.13661702473958334, + "learning_rate": 0.0001, + "loss": 6.4119, + "loss/crossentropy": 2.418110966682434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40424875915050507, + "step": 484 + }, + { + "epoch": 0.00972, + "grad_norm": 5.0625, + "grad_norm_var": 0.15959879557291667, + "learning_rate": 0.0001, + "loss": 6.6356, + "loss/crossentropy": 1.9564435482025146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.45834848284721375, + "step": 486 + }, + { + "epoch": 0.00976, + "grad_norm": 5.375, + "grad_norm_var": 0.16946207682291667, + "learning_rate": 0.0001, + "loss": 6.7056, + "loss/crossentropy": 2.3772581815719604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4418800473213196, + "step": 488 + }, + { + "epoch": 0.0098, + "grad_norm": 4.0625, + "grad_norm_var": 0.208056640625, + "learning_rate": 0.0001, + "loss": 6.3239, + "loss/crossentropy": 1.9526153802871704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3661540001630783, + "step": 490 + }, + { + "epoch": 0.00984, + "grad_norm": 4.875, + "grad_norm_var": 0.19999593098958332, + "learning_rate": 0.0001, + "loss": 6.7642, + "loss/crossentropy": 2.40561842918396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.44370119273662567, + "step": 492 + }, + { + "epoch": 0.00988, + "grad_norm": 4.53125, + "grad_norm_var": 0.1943359375, + "learning_rate": 0.0001, + "loss": 6.6475, + "loss/crossentropy": 2.4316108226776123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4437306672334671, + "step": 494 + }, + { + "epoch": 0.00992, + "grad_norm": 4.53125, + "grad_norm_var": 0.25276285807291665, + "learning_rate": 0.0001, + "loss": 6.4095, + "loss/crossentropy": 2.2919591665267944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4170967787504196, + "step": 496 + }, + { + "epoch": 0.00996, + "grad_norm": 5.03125, + "grad_norm_var": 0.21116129557291666, + "learning_rate": 0.0001, + "loss": 6.6291, + "loss/crossentropy": 2.571357250213623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4073399156332016, + "step": 498 + }, + { + "epoch": 0.01, + "grad_norm": 4.375, + "grad_norm_var": 0.21330973307291667, + "learning_rate": 0.0001, + "loss": 6.2903, + "loss/crossentropy": 2.389290928840637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.41799379885196686, + "step": 500 + }, + { + "epoch": 0.01004, + "grad_norm": 4.90625, + "grad_norm_var": 0.20245768229166666, + "learning_rate": 0.0001, + "loss": 6.4319, + "loss/crossentropy": 2.0904359221458435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3914492577314377, + "step": 502 + }, + { + "epoch": 0.01008, + "grad_norm": 3.625, + "grad_norm_var": 0.218212890625, + "learning_rate": 0.0001, + "loss": 6.5581, + "loss/crossentropy": 2.5435129404067993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4660491645336151, + "step": 504 + }, + { + "epoch": 0.01012, + "grad_norm": 4.75, + "grad_norm_var": 0.21458333333333332, + "learning_rate": 0.0001, + "loss": 6.3665, + "loss/crossentropy": 2.039083242416382, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38188865780830383, + "step": 506 + }, + { + "epoch": 0.01016, + "grad_norm": 5.21875, + "grad_norm_var": 0.361181640625, + "learning_rate": 0.0001, + "loss": 6.4862, + "loss/crossentropy": 2.056805729866028, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4257620573043823, + "step": 508 + }, + { + "epoch": 0.0102, + "grad_norm": 4.03125, + "grad_norm_var": 0.38592122395833334, + "learning_rate": 0.0001, + "loss": 6.4822, + "loss/crossentropy": 2.6178410053253174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.43898941576480865, + "step": 510 + }, + { + "epoch": 0.01024, + "grad_norm": 4.59375, + "grad_norm_var": 0.3405558268229167, + "learning_rate": 0.0001, + "loss": 6.4638, + "loss/crossentropy": 2.232435703277588, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37769296765327454, + "step": 512 + }, + { + "epoch": 0.01028, + "grad_norm": 4.40625, + "grad_norm_var": 0.33033447265625, + "learning_rate": 0.0001, + "loss": 6.378, + "loss/crossentropy": 1.8679735660552979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3805970698595047, + "step": 514 + }, + { + "epoch": 0.01032, + "grad_norm": 4.3125, + "grad_norm_var": 0.33175455729166664, + "learning_rate": 0.0001, + "loss": 6.7453, + "loss/crossentropy": 2.6537472009658813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4450981914997101, + "step": 516 + }, + { + "epoch": 0.01036, + "grad_norm": 4.03125, + "grad_norm_var": 0.34915262858072915, + "learning_rate": 0.0001, + "loss": 6.2745, + "loss/crossentropy": 2.5199841260910034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4748214781284332, + "step": 518 + }, + { + "epoch": 0.0104, + "grad_norm": 5.5, + "grad_norm_var": 0.3482004801432292, + "learning_rate": 0.0001, + "loss": 6.6212, + "loss/crossentropy": 2.6603333950042725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4144483357667923, + "step": 520 + }, + { + "epoch": 0.01044, + "grad_norm": 4.28125, + "grad_norm_var": 0.3241119384765625, + "learning_rate": 0.0001, + "loss": 6.4073, + "loss/crossentropy": 2.284039616584778, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.42801250517368317, + "step": 522 + }, + { + "epoch": 0.01048, + "grad_norm": 3.78125, + "grad_norm_var": 0.1894683837890625, + "learning_rate": 0.0001, + "loss": 6.2153, + "loss/crossentropy": 2.473629951477051, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4107852131128311, + "step": 524 + }, + { + "epoch": 0.01052, + "grad_norm": 4.5, + "grad_norm_var": 0.1827789306640625, + "learning_rate": 0.0001, + "loss": 6.6746, + "loss/crossentropy": 2.1443774700164795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3606857359409332, + "step": 526 + }, + { + "epoch": 0.01056, + "grad_norm": 3.96875, + "grad_norm_var": 0.18889058430989583, + "learning_rate": 0.0001, + "loss": 5.8493, + "loss/crossentropy": 1.8425135016441345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37770508229732513, + "step": 528 + }, + { + "epoch": 0.0106, + "grad_norm": 3.859375, + "grad_norm_var": 0.179052734375, + "learning_rate": 0.0001, + "loss": 6.3319, + "loss/crossentropy": 2.3705164194107056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4354119151830673, + "step": 530 + }, + { + "epoch": 0.01064, + "grad_norm": 4.53125, + "grad_norm_var": 0.16344401041666667, + "learning_rate": 0.0001, + "loss": 6.4823, + "loss/crossentropy": 2.1141316294670105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34213581681251526, + "step": 532 + }, + { + "epoch": 0.01068, + "grad_norm": 4.3125, + "grad_norm_var": 0.15579325358072918, + "learning_rate": 0.0001, + "loss": 6.0454, + "loss/crossentropy": 2.14878511428833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3756616413593292, + "step": 534 + }, + { + "epoch": 0.01072, + "grad_norm": 4.34375, + "grad_norm_var": 0.052155558268229166, + "learning_rate": 0.0001, + "loss": 6.4363, + "loss/crossentropy": 2.2513046264648438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.416190966963768, + "step": 536 + }, + { + "epoch": 0.01076, + "grad_norm": 3.921875, + "grad_norm_var": 0.049169921875, + "learning_rate": 0.0001, + "loss": 6.2674, + "loss/crossentropy": 2.1337047815322876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3839789927005768, + "step": 538 + }, + { + "epoch": 0.0108, + "grad_norm": 5.8125, + "grad_norm_var": 0.20488993326822916, + "learning_rate": 0.0001, + "loss": 6.0559, + "loss/crossentropy": 2.2019962072372437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3806414008140564, + "step": 540 + }, + { + "epoch": 0.01084, + "grad_norm": 4.125, + "grad_norm_var": 0.2133697509765625, + "learning_rate": 0.0001, + "loss": 5.8434, + "loss/crossentropy": 2.113224983215332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3797851800918579, + "step": 542 + }, + { + "epoch": 0.01088, + "grad_norm": 4.0, + "grad_norm_var": 0.21507059733072917, + "learning_rate": 0.0001, + "loss": 6.5131, + "loss/crossentropy": 2.461037516593933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.41689516603946686, + "step": 544 + }, + { + "epoch": 0.01092, + "grad_norm": 4.625, + "grad_norm_var": 0.24849853515625, + "learning_rate": 0.0001, + "loss": 6.4191, + "loss/crossentropy": 2.277098298072815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4108494818210602, + "step": 546 + }, + { + "epoch": 0.01096, + "grad_norm": 4.34375, + "grad_norm_var": 0.48818359375, + "learning_rate": 0.0001, + "loss": 6.3436, + "loss/crossentropy": 2.007936477661133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36796192824840546, + "step": 548 + }, + { + "epoch": 0.011, + "grad_norm": 4.625, + "grad_norm_var": 0.5631795247395833, + "learning_rate": 0.0001, + "loss": 6.5793, + "loss/crossentropy": 2.197320520877838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38722094893455505, + "step": 550 + }, + { + "epoch": 0.01104, + "grad_norm": 4.6875, + "grad_norm_var": 0.5419230143229167, + "learning_rate": 0.0001, + "loss": 6.3185, + "loss/crossentropy": 2.225432515144348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3928917348384857, + "step": 552 + }, + { + "epoch": 0.01108, + "grad_norm": 4.28125, + "grad_norm_var": 0.5455067952473959, + "learning_rate": 0.0001, + "loss": 5.9686, + "loss/crossentropy": 2.5253326892852783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.41547106206417084, + "step": 554 + }, + { + "epoch": 0.01112, + "grad_norm": 4.15625, + "grad_norm_var": 0.44996337890625, + "learning_rate": 0.0001, + "loss": 6.2158, + "loss/crossentropy": 2.488635540008545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4523312896490097, + "step": 556 + }, + { + "epoch": 0.01116, + "grad_norm": 4.03125, + "grad_norm_var": 0.46119384765625, + "learning_rate": 0.0001, + "loss": 6.3632, + "loss/crossentropy": 2.395568609237671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40695953369140625, + "step": 558 + }, + { + "epoch": 0.0112, + "grad_norm": 4.28125, + "grad_norm_var": 0.4615234375, + "learning_rate": 0.0001, + "loss": 5.998, + "loss/crossentropy": 2.383823275566101, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4482497274875641, + "step": 560 + }, + { + "epoch": 0.01124, + "grad_norm": 4.46875, + "grad_norm_var": 0.46280008951822915, + "learning_rate": 0.0001, + "loss": 5.968, + "loss/crossentropy": 2.0261693000793457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36933301389217377, + "step": 562 + }, + { + "epoch": 0.01128, + "grad_norm": 4.65625, + "grad_norm_var": 0.2221588134765625, + "learning_rate": 0.0001, + "loss": 6.1467, + "loss/crossentropy": 2.131769895553589, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.341948002576828, + "step": 564 + }, + { + "epoch": 0.01132, + "grad_norm": 4.8125, + "grad_norm_var": 0.0982574462890625, + "learning_rate": 0.0001, + "loss": 6.5288, + "loss/crossentropy": 2.3899158239364624, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.417646586894989, + "step": 566 + }, + { + "epoch": 0.01136, + "grad_norm": 4.21875, + "grad_norm_var": 0.07683817545572917, + "learning_rate": 0.0001, + "loss": 6.6198, + "loss/crossentropy": 2.3139528036117554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.39010919630527496, + "step": 568 + }, + { + "epoch": 0.0114, + "grad_norm": 4.3125, + "grad_norm_var": 0.06782124837239584, + "learning_rate": 0.0001, + "loss": 6.1209, + "loss/crossentropy": 1.966201364994049, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3554569333791733, + "step": 570 + }, + { + "epoch": 0.01144, + "grad_norm": 4.21875, + "grad_norm_var": 0.07773335774739583, + "learning_rate": 0.0001, + "loss": 6.1746, + "loss/crossentropy": 2.2325466871261597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4008040726184845, + "step": 572 + }, + { + "epoch": 0.01148, + "grad_norm": 4.96875, + "grad_norm_var": 0.09848531087239583, + "learning_rate": 0.0001, + "loss": 6.121, + "loss/crossentropy": 1.7670194506645203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3362845778465271, + "step": 574 + }, + { + "epoch": 0.01152, + "grad_norm": 4.125, + "grad_norm_var": 0.10274149576822916, + "learning_rate": 0.0001, + "loss": 6.3956, + "loss/crossentropy": 2.332284092903137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4190225303173065, + "step": 576 + }, + { + "epoch": 0.01156, + "grad_norm": 4.71875, + "grad_norm_var": 0.10549723307291667, + "learning_rate": 0.0001, + "loss": 6.4367, + "loss/crossentropy": 2.265386700630188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4109695851802826, + "step": 578 + }, + { + "epoch": 0.0116, + "grad_norm": 5.15625, + "grad_norm_var": 0.13811442057291667, + "learning_rate": 0.0001, + "loss": 6.4475, + "loss/crossentropy": 2.265889286994934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.42450472712516785, + "step": 580 + }, + { + "epoch": 0.01164, + "grad_norm": 4.5625, + "grad_norm_var": 0.163134765625, + "learning_rate": 0.0001, + "loss": 6.1483, + "loss/crossentropy": 1.867847204208374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3627365529537201, + "step": 582 + }, + { + "epoch": 0.01168, + "grad_norm": 3.796875, + "grad_norm_var": 0.23681538899739582, + "learning_rate": 0.0001, + "loss": 6.5909, + "loss/crossentropy": 2.5827555656433105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5194894820451736, + "step": 584 + }, + { + "epoch": 0.01172, + "grad_norm": 4.5625, + "grad_norm_var": 0.23188374837239584, + "learning_rate": 0.0001, + "loss": 6.6319, + "loss/crossentropy": 2.26031893491745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.412450835108757, + "step": 586 + }, + { + "epoch": 0.01176, + "grad_norm": 5.53125, + "grad_norm_var": 0.2718739827473958, + "learning_rate": 0.0001, + "loss": 6.5782, + "loss/crossentropy": 2.1885104179382324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.41636165976524353, + "step": 588 + }, + { + "epoch": 0.0118, + "grad_norm": 4.53125, + "grad_norm_var": 0.26240132649739584, + "learning_rate": 0.0001, + "loss": 6.5247, + "loss/crossentropy": 2.682767391204834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4279082715511322, + "step": 590 + }, + { + "epoch": 0.01184, + "grad_norm": 4.3125, + "grad_norm_var": 0.24807027180989583, + "learning_rate": 0.0001, + "loss": 6.5862, + "loss/crossentropy": 2.185304641723633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4266812950372696, + "step": 592 + }, + { + "epoch": 0.01188, + "grad_norm": 4.15625, + "grad_norm_var": 0.2589752197265625, + "learning_rate": 0.0001, + "loss": 6.2494, + "loss/crossentropy": 2.383716344833374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38570962846279144, + "step": 594 + }, + { + "epoch": 0.01192, + "grad_norm": 4.3125, + "grad_norm_var": 0.2337066650390625, + "learning_rate": 0.0001, + "loss": 6.3066, + "loss/crossentropy": 2.2963398694992065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.43451687693595886, + "step": 596 + }, + { + "epoch": 0.01196, + "grad_norm": 4.96875, + "grad_norm_var": 0.2128570556640625, + "learning_rate": 0.0001, + "loss": 6.3368, + "loss/crossentropy": 2.2728757858276367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4280686676502228, + "step": 598 + }, + { + "epoch": 0.012, + "grad_norm": 4.34375, + "grad_norm_var": 0.141650390625, + "learning_rate": 0.0001, + "loss": 6.1147, + "loss/crossentropy": 2.3486615419387817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37224848568439484, + "step": 600 + }, + { + "epoch": 0.01204, + "grad_norm": 3.765625, + "grad_norm_var": 0.17512613932291668, + "learning_rate": 0.0001, + "loss": 6.4017, + "loss/crossentropy": 2.3265292644500732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4079990088939667, + "step": 602 + }, + { + "epoch": 0.01208, + "grad_norm": 3.84375, + "grad_norm_var": 0.08847554524739583, + "learning_rate": 0.0001, + "loss": 6.0861, + "loss/crossentropy": 2.45253849029541, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.42630523443222046, + "step": 604 + }, + { + "epoch": 0.01212, + "grad_norm": 4.03125, + "grad_norm_var": 0.08964742024739583, + "learning_rate": 0.0001, + "loss": 6.4101, + "loss/crossentropy": 2.4417446851730347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40342913568019867, + "step": 606 + }, + { + "epoch": 0.01216, + "grad_norm": 4.25, + "grad_norm_var": 0.08886617024739583, + "learning_rate": 0.0001, + "loss": 6.2513, + "loss/crossentropy": 2.1483529210090637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3846609443426132, + "step": 608 + }, + { + "epoch": 0.0122, + "grad_norm": 4.1875, + "grad_norm_var": 0.09990132649739583, + "learning_rate": 0.0001, + "loss": 6.5479, + "loss/crossentropy": 2.481536865234375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.440000057220459, + "step": 610 + }, + { + "epoch": 0.01224, + "grad_norm": 4.15625, + "grad_norm_var": 0.09464518229166667, + "learning_rate": 0.0001, + "loss": 6.4986, + "loss/crossentropy": 2.4472655057907104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.41242682933807373, + "step": 612 + }, + { + "epoch": 0.01228, + "grad_norm": 4.09375, + "grad_norm_var": 0.07828369140625, + "learning_rate": 0.0001, + "loss": 6.4348, + "loss/crossentropy": 2.3511135578155518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38262398540973663, + "step": 614 + }, + { + "epoch": 0.01232, + "grad_norm": 3.71875, + "grad_norm_var": 0.085791015625, + "learning_rate": 0.0001, + "loss": 6.4821, + "loss/crossentropy": 2.5090683698654175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.44584622979164124, + "step": 616 + }, + { + "epoch": 0.01236, + "grad_norm": 4.75, + "grad_norm_var": 0.1117340087890625, + "learning_rate": 0.0001, + "loss": 6.0395, + "loss/crossentropy": 2.166012942790985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40271493792533875, + "step": 618 + }, + { + "epoch": 0.0124, + "grad_norm": 5.09375, + "grad_norm_var": 0.15511067708333334, + "learning_rate": 0.0001, + "loss": 6.4644, + "loss/crossentropy": 2.583309531211853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.43380285799503326, + "step": 620 + }, + { + "epoch": 0.01244, + "grad_norm": 3.984375, + "grad_norm_var": 0.19877827962239583, + "learning_rate": 0.0001, + "loss": 6.3289, + "loss/crossentropy": 2.125720262527466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40389589965343475, + "step": 622 + }, + { + "epoch": 0.01248, + "grad_norm": 6.34375, + "grad_norm_var": 0.4984283447265625, + "learning_rate": 0.0001, + "loss": 5.9651, + "loss/crossentropy": 1.7034094333648682, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31466029584407806, + "step": 624 + }, + { + "epoch": 0.01252, + "grad_norm": 4.40625, + "grad_norm_var": 0.4901041666666667, + "learning_rate": 0.0001, + "loss": 5.9451, + "loss/crossentropy": 2.163281202316284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38622182607650757, + "step": 626 + }, + { + "epoch": 0.01256, + "grad_norm": 3.875, + "grad_norm_var": 0.4982818603515625, + "learning_rate": 0.0001, + "loss": 5.8931, + "loss/crossentropy": 1.7754453420639038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33256995677948, + "step": 628 + }, + { + "epoch": 0.0126, + "grad_norm": 4.09375, + "grad_norm_var": 0.4894683837890625, + "learning_rate": 0.0001, + "loss": 6.8606, + "loss/crossentropy": 2.273309350013733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5319447964429855, + "step": 630 + }, + { + "epoch": 0.01264, + "grad_norm": 3.828125, + "grad_norm_var": 0.480126953125, + "learning_rate": 0.0001, + "loss": 6.2103, + "loss/crossentropy": 2.397401988506317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4255771040916443, + "step": 632 + }, + { + "epoch": 0.01268, + "grad_norm": 5.28125, + "grad_norm_var": 0.521875, + "learning_rate": 0.0001, + "loss": 6.4179, + "loss/crossentropy": 2.3898611068725586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.46314406394958496, + "step": 634 + }, + { + "epoch": 0.01272, + "grad_norm": 7.40625, + "grad_norm_var": 1.0609212239583334, + "learning_rate": 0.0001, + "loss": 6.5634, + "loss/crossentropy": 2.3740471601486206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.48123428225517273, + "step": 636 + }, + { + "epoch": 0.01276, + "grad_norm": 3.90625, + "grad_norm_var": 1.0519846598307292, + "learning_rate": 0.0001, + "loss": 6.1136, + "loss/crossentropy": 2.236217498779297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.39564159512519836, + "step": 638 + }, + { + "epoch": 0.0128, + "grad_norm": 4.21875, + "grad_norm_var": 0.7972157796223959, + "learning_rate": 0.0001, + "loss": 6.3134, + "loss/crossentropy": 2.4049174785614014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.41312308609485626, + "step": 640 + }, + { + "epoch": 0.01284, + "grad_norm": 4.0, + "grad_norm_var": 0.80142822265625, + "learning_rate": 0.0001, + "loss": 6.2726, + "loss/crossentropy": 2.173800766468048, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3983110189437866, + "step": 642 + }, + { + "epoch": 0.01288, + "grad_norm": 4.65625, + "grad_norm_var": 0.80084228515625, + "learning_rate": 0.0001, + "loss": 6.2779, + "loss/crossentropy": 2.2124537229537964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38358184695243835, + "step": 644 + }, + { + "epoch": 0.01292, + "grad_norm": 3.90625, + "grad_norm_var": 0.8355377197265625, + "learning_rate": 0.0001, + "loss": 6.1209, + "loss/crossentropy": 2.4939264059066772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3805152475833893, + "step": 646 + }, + { + "epoch": 0.01296, + "grad_norm": 4.25, + "grad_norm_var": 0.8250935872395834, + "learning_rate": 0.0001, + "loss": 6.0714, + "loss/crossentropy": 2.4526472091674805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3958089202642441, + "step": 648 + }, + { + "epoch": 0.013, + "grad_norm": 4.25, + "grad_norm_var": 0.7562978108723958, + "learning_rate": 0.0001, + "loss": 6.3648, + "loss/crossentropy": 2.5171029567718506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.446771502494812, + "step": 650 + }, + { + "epoch": 0.01304, + "grad_norm": 4.125, + "grad_norm_var": 0.1185943603515625, + "learning_rate": 0.0001, + "loss": 6.1656, + "loss/crossentropy": 2.270598888397217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38787929713726044, + "step": 652 + }, + { + "epoch": 0.01308, + "grad_norm": 4.15625, + "grad_norm_var": 0.11404520670572917, + "learning_rate": 0.0001, + "loss": 5.7739, + "loss/crossentropy": 1.8847617506980896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3258303850889206, + "step": 654 + }, + { + "epoch": 0.01312, + "grad_norm": 3.9375, + "grad_norm_var": 0.11286519368489584, + "learning_rate": 0.0001, + "loss": 6.042, + "loss/crossentropy": 2.2471452951431274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38874460756778717, + "step": 656 + }, + { + "epoch": 0.01316, + "grad_norm": 4.125, + "grad_norm_var": 0.10369364420572917, + "learning_rate": 0.0001, + "loss": 6.4383, + "loss/crossentropy": 2.3776252269744873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40553848445415497, + "step": 658 + }, + { + "epoch": 0.0132, + "grad_norm": 3.84375, + "grad_norm_var": 0.06461588541666667, + "learning_rate": 0.0001, + "loss": 5.5389, + "loss/crossentropy": 2.291012167930603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3630271404981613, + "step": 660 + }, + { + "epoch": 0.01324, + "grad_norm": 3.90625, + "grad_norm_var": 0.06116536458333333, + "learning_rate": 0.0001, + "loss": 6.298, + "loss/crossentropy": 2.2029112577438354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37700483202934265, + "step": 662 + }, + { + "epoch": 0.01328, + "grad_norm": 3.984375, + "grad_norm_var": 0.0598541259765625, + "learning_rate": 0.0001, + "loss": 6.4093, + "loss/crossentropy": 2.571411967277527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5071892440319061, + "step": 664 + }, + { + "epoch": 0.01332, + "grad_norm": 3.484375, + "grad_norm_var": 0.045947265625, + "learning_rate": 0.0001, + "loss": 5.6839, + "loss/crossentropy": 2.148792862892151, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3591457009315491, + "step": 666 + }, + { + "epoch": 0.01336, + "grad_norm": 4.09375, + "grad_norm_var": 0.04755452473958333, + "learning_rate": 0.0001, + "loss": 6.4444, + "loss/crossentropy": 2.5091140270233154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3882133811712265, + "step": 668 + }, + { + "epoch": 0.0134, + "grad_norm": 4.09375, + "grad_norm_var": 0.049332682291666666, + "learning_rate": 0.0001, + "loss": 6.15, + "loss/crossentropy": 2.4669524431228638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3941466957330704, + "step": 670 + }, + { + "epoch": 0.01344, + "grad_norm": 3.9375, + "grad_norm_var": 0.06620686848958333, + "learning_rate": 0.0001, + "loss": 6.5358, + "loss/crossentropy": 2.4111422300338745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3973373472690582, + "step": 672 + }, + { + "epoch": 0.01348, + "grad_norm": 4.1875, + "grad_norm_var": 0.06495768229166667, + "learning_rate": 0.0001, + "loss": 5.765, + "loss/crossentropy": 2.1109864115715027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36189010739326477, + "step": 674 + }, + { + "epoch": 0.01352, + "grad_norm": 3.578125, + "grad_norm_var": 0.0744140625, + "learning_rate": 0.0001, + "loss": 6.0289, + "loss/crossentropy": 2.069494664669037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3562029302120209, + "step": 676 + }, + { + "epoch": 0.01356, + "grad_norm": 4.0625, + "grad_norm_var": 0.07224934895833333, + "learning_rate": 0.0001, + "loss": 6.4526, + "loss/crossentropy": 2.1924527883529663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38948506116867065, + "step": 678 + }, + { + "epoch": 0.0136, + "grad_norm": 3.6875, + "grad_norm_var": 0.082421875, + "learning_rate": 0.0001, + "loss": 5.7311, + "loss/crossentropy": 2.1603400707244873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40414859354496, + "step": 680 + }, + { + "epoch": 0.01364, + "grad_norm": 3.71875, + "grad_norm_var": 0.07177632649739583, + "learning_rate": 0.0001, + "loss": 6.2459, + "loss/crossentropy": 2.515262722969055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4363710880279541, + "step": 682 + }, + { + "epoch": 0.01368, + "grad_norm": 4.09375, + "grad_norm_var": 0.07869364420572916, + "learning_rate": 0.0001, + "loss": 6.1174, + "loss/crossentropy": 2.3615161180496216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35220713913440704, + "step": 684 + }, + { + "epoch": 0.01372, + "grad_norm": 6.03125, + "grad_norm_var": 0.3293690999348958, + "learning_rate": 0.0001, + "loss": 6.1941, + "loss/crossentropy": 1.920493245124817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35293935239315033, + "step": 686 + }, + { + "epoch": 0.01376, + "grad_norm": 4.03125, + "grad_norm_var": 0.31579488118489585, + "learning_rate": 0.0001, + "loss": 6.4064, + "loss/crossentropy": 2.493665337562561, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4243907481431961, + "step": 688 + }, + { + "epoch": 0.0138, + "grad_norm": 4.03125, + "grad_norm_var": 0.3451324462890625, + "learning_rate": 0.0001, + "loss": 6.1519, + "loss/crossentropy": 2.1182271242141724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.39624081552028656, + "step": 690 + }, + { + "epoch": 0.01384, + "grad_norm": 4.09375, + "grad_norm_var": 0.32034098307291664, + "learning_rate": 0.0001, + "loss": 6.2457, + "loss/crossentropy": 2.146227180957794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3491668850183487, + "step": 692 + }, + { + "epoch": 0.01388, + "grad_norm": 3.734375, + "grad_norm_var": 0.3376261393229167, + "learning_rate": 0.0001, + "loss": 5.7331, + "loss/crossentropy": 1.8458876609802246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34608474373817444, + "step": 694 + }, + { + "epoch": 0.01392, + "grad_norm": 4.03125, + "grad_norm_var": 0.3294911702473958, + "learning_rate": 0.0001, + "loss": 6.3661, + "loss/crossentropy": 2.270371675491333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33937984704971313, + "step": 696 + }, + { + "epoch": 0.01396, + "grad_norm": 3.875, + "grad_norm_var": 0.32066141764322914, + "learning_rate": 0.0001, + "loss": 5.8264, + "loss/crossentropy": 2.040702223777771, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3821101486682892, + "step": 698 + }, + { + "epoch": 0.014, + "grad_norm": 4.09375, + "grad_norm_var": 0.3197428385416667, + "learning_rate": 0.0001, + "loss": 6.1216, + "loss/crossentropy": 2.1711822152137756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35383065044879913, + "step": 700 + }, + { + "epoch": 0.01404, + "grad_norm": 3.890625, + "grad_norm_var": 0.0597076416015625, + "learning_rate": 0.0001, + "loss": 5.8384, + "loss/crossentropy": 2.3292651176452637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37580642104148865, + "step": 702 + }, + { + "epoch": 0.01408, + "grad_norm": 4.34375, + "grad_norm_var": 0.06575520833333333, + "learning_rate": 0.0001, + "loss": 6.0789, + "loss/crossentropy": 2.243735432624817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3997037708759308, + "step": 704 + }, + { + "epoch": 0.01412, + "grad_norm": 3.921875, + "grad_norm_var": 0.0524566650390625, + "learning_rate": 0.0001, + "loss": 5.9987, + "loss/crossentropy": 1.9908145666122437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33391132950782776, + "step": 706 + }, + { + "epoch": 0.01416, + "grad_norm": 4.375, + "grad_norm_var": 0.06243082682291667, + "learning_rate": 0.0001, + "loss": 5.536, + "loss/crossentropy": 2.280096471309662, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3899015784263611, + "step": 708 + }, + { + "epoch": 0.0142, + "grad_norm": 3.6875, + "grad_norm_var": 0.12727762858072916, + "learning_rate": 0.0001, + "loss": 5.9373, + "loss/crossentropy": 2.0714810490608215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3797626197338104, + "step": 710 + }, + { + "epoch": 0.01424, + "grad_norm": 3.875, + "grad_norm_var": 0.14480692545572918, + "learning_rate": 0.0001, + "loss": 6.2466, + "loss/crossentropy": 2.250674605369568, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3948116898536682, + "step": 712 + }, + { + "epoch": 0.01428, + "grad_norm": 3.65625, + "grad_norm_var": 0.15719401041666667, + "learning_rate": 0.0001, + "loss": 5.8891, + "loss/crossentropy": 2.0895228385925293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34471337497234344, + "step": 714 + }, + { + "epoch": 0.01432, + "grad_norm": 4.71875, + "grad_norm_var": 0.1826812744140625, + "learning_rate": 0.0001, + "loss": 6.3748, + "loss/crossentropy": 2.337170124053955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4226381927728653, + "step": 716 + }, + { + "epoch": 0.01436, + "grad_norm": 4.03125, + "grad_norm_var": 0.174462890625, + "learning_rate": 0.0001, + "loss": 6.3996, + "loss/crossentropy": 2.337436556816101, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3738469183444977, + "step": 718 + }, + { + "epoch": 0.0144, + "grad_norm": 4.15625, + "grad_norm_var": 0.1669921875, + "learning_rate": 0.0001, + "loss": 6.0278, + "loss/crossentropy": 1.9506489634513855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37431904673576355, + "step": 720 + }, + { + "epoch": 0.01444, + "grad_norm": 3.71875, + "grad_norm_var": 0.17283528645833332, + "learning_rate": 0.0001, + "loss": 5.8083, + "loss/crossentropy": 2.253044009208679, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36500048637390137, + "step": 722 + }, + { + "epoch": 0.01448, + "grad_norm": 5.15625, + "grad_norm_var": 0.23413798014322917, + "learning_rate": 0.0001, + "loss": 6.2307, + "loss/crossentropy": 1.9980219006538391, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34674490988254547, + "step": 724 + }, + { + "epoch": 0.01452, + "grad_norm": 3.609375, + "grad_norm_var": 0.1903472900390625, + "learning_rate": 0.0001, + "loss": 5.8283, + "loss/crossentropy": 1.894662618637085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34921175241470337, + "step": 726 + }, + { + "epoch": 0.01456, + "grad_norm": 3.609375, + "grad_norm_var": 0.18310546875, + "learning_rate": 0.0001, + "loss": 6.1117, + "loss/crossentropy": 2.304685115814209, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36858032643795013, + "step": 728 + }, + { + "epoch": 0.0146, + "grad_norm": 3.6875, + "grad_norm_var": 0.17696940104166667, + "learning_rate": 0.0001, + "loss": 6.1041, + "loss/crossentropy": 1.9020891189575195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3854057639837265, + "step": 730 + }, + { + "epoch": 0.01464, + "grad_norm": 3.4375, + "grad_norm_var": 0.15485026041666666, + "learning_rate": 0.0001, + "loss": 5.5327, + "loss/crossentropy": 1.707019329071045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2961048036813736, + "step": 732 + }, + { + "epoch": 0.01468, + "grad_norm": 3.828125, + "grad_norm_var": 0.15344136555989582, + "learning_rate": 0.0001, + "loss": 6.0543, + "loss/crossentropy": 2.423463463783264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3739102631807327, + "step": 734 + }, + { + "epoch": 0.01472, + "grad_norm": 4.0625, + "grad_norm_var": 0.15335286458333333, + "learning_rate": 0.0001, + "loss": 6.2855, + "loss/crossentropy": 2.0490055680274963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4027387350797653, + "step": 736 + }, + { + "epoch": 0.01476, + "grad_norm": 3.984375, + "grad_norm_var": 0.15038960774739582, + "learning_rate": 0.0001, + "loss": 6.1236, + "loss/crossentropy": 2.3712635040283203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3602859079837799, + "step": 738 + }, + { + "epoch": 0.0148, + "grad_norm": 3.75, + "grad_norm_var": 0.058649698893229164, + "learning_rate": 0.0001, + "loss": 6.2938, + "loss/crossentropy": 2.306379556655884, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38698625564575195, + "step": 740 + }, + { + "epoch": 0.01484, + "grad_norm": 3.984375, + "grad_norm_var": 0.055939737955729166, + "learning_rate": 0.0001, + "loss": 6.3983, + "loss/crossentropy": 2.6846178770065308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3881431221961975, + "step": 742 + }, + { + "epoch": 0.01488, + "grad_norm": 3.9375, + "grad_norm_var": 0.058690388997395836, + "learning_rate": 0.0001, + "loss": 5.7984, + "loss/crossentropy": 2.0555977821350098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36078818142414093, + "step": 744 + }, + { + "epoch": 0.01492, + "grad_norm": 4.09375, + "grad_norm_var": 0.059325154622395834, + "learning_rate": 0.0001, + "loss": 5.7834, + "loss/crossentropy": 2.0597304701805115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34516778588294983, + "step": 746 + }, + { + "epoch": 0.01496, + "grad_norm": 3.5625, + "grad_norm_var": 0.049479166666666664, + "learning_rate": 0.0001, + "loss": 5.8911, + "loss/crossentropy": 1.9607329964637756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33818933367729187, + "step": 748 + }, + { + "epoch": 0.015, + "grad_norm": 3.546875, + "grad_norm_var": 0.057291666666666664, + "learning_rate": 0.0001, + "loss": 5.8484, + "loss/crossentropy": 1.7854246497154236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3463115990161896, + "step": 750 + }, + { + "epoch": 0.01504, + "grad_norm": 3.703125, + "grad_norm_var": 0.0590240478515625, + "learning_rate": 0.0001, + "loss": 5.7788, + "loss/crossentropy": 1.845078468322754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3714388310909271, + "step": 752 + }, + { + "epoch": 0.01508, + "grad_norm": 3.9375, + "grad_norm_var": 0.05771077473958333, + "learning_rate": 0.0001, + "loss": 6.1677, + "loss/crossentropy": 2.3951027393341064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3723580837249756, + "step": 754 + }, + { + "epoch": 0.01512, + "grad_norm": 3.6875, + "grad_norm_var": 0.032486979166666666, + "learning_rate": 0.0001, + "loss": 6.219, + "loss/crossentropy": 2.500870108604431, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.42450079321861267, + "step": 756 + }, + { + "epoch": 0.01516, + "grad_norm": 3.75, + "grad_norm_var": 0.030060831705729166, + "learning_rate": 0.0001, + "loss": 5.886, + "loss/crossentropy": 2.1584274768829346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3798908591270447, + "step": 758 + }, + { + "epoch": 0.0152, + "grad_norm": 3.5625, + "grad_norm_var": 0.028251139322916667, + "learning_rate": 0.0001, + "loss": 5.7937, + "loss/crossentropy": 2.3126983642578125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33131279051303864, + "step": 760 + }, + { + "epoch": 0.01524, + "grad_norm": 3.5, + "grad_norm_var": 0.020246378580729165, + "learning_rate": 0.0001, + "loss": 5.9182, + "loss/crossentropy": 2.349764347076416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3910932093858719, + "step": 762 + }, + { + "epoch": 0.01528, + "grad_norm": 3.640625, + "grad_norm_var": 0.0224273681640625, + "learning_rate": 0.0001, + "loss": 6.0472, + "loss/crossentropy": 2.2232795357704163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3483322113752365, + "step": 764 + }, + { + "epoch": 0.01532, + "grad_norm": 3.65625, + "grad_norm_var": 0.0207427978515625, + "learning_rate": 0.0001, + "loss": 6.0312, + "loss/crossentropy": 2.2273412942886353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38545119762420654, + "step": 766 + }, + { + "epoch": 0.01536, + "grad_norm": 4.125, + "grad_norm_var": 0.03355712890625, + "learning_rate": 0.0001, + "loss": 6.0523, + "loss/crossentropy": 2.5879149436950684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38191574811935425, + "step": 768 + }, + { + "epoch": 0.0154, + "grad_norm": 3.953125, + "grad_norm_var": 0.034032185872395836, + "learning_rate": 0.0001, + "loss": 6.027, + "loss/crossentropy": 2.3305420875549316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3704134076833725, + "step": 770 + }, + { + "epoch": 0.01544, + "grad_norm": 3.59375, + "grad_norm_var": 0.03241780598958333, + "learning_rate": 0.0001, + "loss": 6.121, + "loss/crossentropy": 2.0433666706085205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34387587010860443, + "step": 772 + }, + { + "epoch": 0.01548, + "grad_norm": 3.609375, + "grad_norm_var": 0.03337300618489583, + "learning_rate": 0.0001, + "loss": 5.5837, + "loss/crossentropy": 2.1127337217330933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3484109789133072, + "step": 774 + }, + { + "epoch": 0.01552, + "grad_norm": 3.859375, + "grad_norm_var": 0.029002888997395834, + "learning_rate": 0.0001, + "loss": 6.0028, + "loss/crossentropy": 2.1637459993362427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3757011145353317, + "step": 776 + }, + { + "epoch": 0.01556, + "grad_norm": 3.734375, + "grad_norm_var": 0.024446614583333335, + "learning_rate": 0.0001, + "loss": 5.9904, + "loss/crossentropy": 2.4118471145629883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3797076344490051, + "step": 778 + }, + { + "epoch": 0.0156, + "grad_norm": 3.828125, + "grad_norm_var": 0.022191365559895832, + "learning_rate": 0.0001, + "loss": 6.2649, + "loss/crossentropy": 1.9410768151283264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3254295587539673, + "step": 780 + }, + { + "epoch": 0.01564, + "grad_norm": 3.609375, + "grad_norm_var": 0.023558553059895834, + "learning_rate": 0.0001, + "loss": 5.9008, + "loss/crossentropy": 2.1669737100601196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3583529591560364, + "step": 782 + }, + { + "epoch": 0.01568, + "grad_norm": 3.5625, + "grad_norm_var": 0.017606608072916665, + "learning_rate": 0.0001, + "loss": 5.9868, + "loss/crossentropy": 2.217113733291626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3503105044364929, + "step": 784 + }, + { + "epoch": 0.01572, + "grad_norm": 3.84375, + "grad_norm_var": 0.017154947916666666, + "learning_rate": 0.0001, + "loss": 6.0695, + "loss/crossentropy": 2.588438868522644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3941201716661453, + "step": 786 + }, + { + "epoch": 0.01576, + "grad_norm": 3.546875, + "grad_norm_var": 0.018310546875, + "learning_rate": 0.0001, + "loss": 5.9436, + "loss/crossentropy": 2.3925808668136597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3755808621644974, + "step": 788 + }, + { + "epoch": 0.0158, + "grad_norm": 3.625, + "grad_norm_var": 0.018684895833333333, + "learning_rate": 0.0001, + "loss": 5.7254, + "loss/crossentropy": 1.9568504691123962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3865346759557724, + "step": 790 + }, + { + "epoch": 0.01584, + "grad_norm": 3.96875, + "grad_norm_var": 0.024104817708333334, + "learning_rate": 0.0001, + "loss": 6.0174, + "loss/crossentropy": 2.336462616920471, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36937348544597626, + "step": 792 + }, + { + "epoch": 0.01588, + "grad_norm": 3.421875, + "grad_norm_var": 0.03623046875, + "learning_rate": 0.0001, + "loss": 5.7742, + "loss/crossentropy": 2.1867082715034485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35803911089897156, + "step": 794 + }, + { + "epoch": 0.01592, + "grad_norm": 3.765625, + "grad_norm_var": 0.03877665201822917, + "learning_rate": 0.0001, + "loss": 6.2825, + "loss/crossentropy": 2.070562243461609, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.42781224846839905, + "step": 796 + }, + { + "epoch": 0.01596, + "grad_norm": 4.09375, + "grad_norm_var": 0.044384765625, + "learning_rate": 0.0001, + "loss": 6.401, + "loss/crossentropy": 2.160820960998535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34495553374290466, + "step": 798 + }, + { + "epoch": 0.016, + "grad_norm": 4.0, + "grad_norm_var": 0.045426432291666666, + "learning_rate": 0.0001, + "loss": 5.9628, + "loss/crossentropy": 2.3424230813980103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4417698383331299, + "step": 800 + }, + { + "epoch": 0.01604, + "grad_norm": 4.59375, + "grad_norm_var": 0.08385009765625, + "learning_rate": 0.0001, + "loss": 6.1984, + "loss/crossentropy": 2.090175747871399, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35883304476737976, + "step": 802 + }, + { + "epoch": 0.01608, + "grad_norm": 3.90625, + "grad_norm_var": 0.0759429931640625, + "learning_rate": 0.0001, + "loss": 6.2044, + "loss/crossentropy": 2.460660457611084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36723683774471283, + "step": 804 + }, + { + "epoch": 0.01612, + "grad_norm": 3.78125, + "grad_norm_var": 0.0783203125, + "learning_rate": 0.0001, + "loss": 5.8788, + "loss/crossentropy": 2.2680885791778564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3925183415412903, + "step": 806 + }, + { + "epoch": 0.01616, + "grad_norm": 3.796875, + "grad_norm_var": 0.10422261555989583, + "learning_rate": 0.0001, + "loss": 6.1179, + "loss/crossentropy": 2.272566020488739, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3425859659910202, + "step": 808 + }, + { + "epoch": 0.0162, + "grad_norm": 3.546875, + "grad_norm_var": 0.10061442057291667, + "learning_rate": 0.0001, + "loss": 5.8933, + "loss/crossentropy": 2.2417107820510864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3634066879749298, + "step": 810 + }, + { + "epoch": 0.01624, + "grad_norm": 4.09375, + "grad_norm_var": 0.10075581868489583, + "learning_rate": 0.0001, + "loss": 5.9907, + "loss/crossentropy": 2.2117987275123596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3669978231191635, + "step": 812 + }, + { + "epoch": 0.01628, + "grad_norm": 4.53125, + "grad_norm_var": 0.12078348795572917, + "learning_rate": 0.0001, + "loss": 6.1767, + "loss/crossentropy": 2.3471380472183228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.39207911491394043, + "step": 814 + }, + { + "epoch": 0.01632, + "grad_norm": 4.1875, + "grad_norm_var": 0.12200113932291666, + "learning_rate": 0.0001, + "loss": 6.005, + "loss/crossentropy": 2.1516740322113037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3394138962030411, + "step": 816 + }, + { + "epoch": 0.01636, + "grad_norm": 3.765625, + "grad_norm_var": 0.10528055826822917, + "learning_rate": 0.0001, + "loss": 6.0827, + "loss/crossentropy": 2.5085272789001465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.39268872141838074, + "step": 818 + }, + { + "epoch": 0.0164, + "grad_norm": 3.515625, + "grad_norm_var": 0.1198883056640625, + "learning_rate": 0.0001, + "loss": 6.0363, + "loss/crossentropy": 2.3051916360855103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3798936903476715, + "step": 820 + }, + { + "epoch": 0.01644, + "grad_norm": 3.5, + "grad_norm_var": 0.14094136555989584, + "learning_rate": 0.0001, + "loss": 5.4403, + "loss/crossentropy": 2.1685640811920166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3666132390499115, + "step": 822 + }, + { + "epoch": 0.01648, + "grad_norm": 3.53125, + "grad_norm_var": 0.109521484375, + "learning_rate": 0.0001, + "loss": 5.6981, + "loss/crossentropy": 2.3374987840652466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32505205273628235, + "step": 824 + }, + { + "epoch": 0.01652, + "grad_norm": 5.4375, + "grad_norm_var": 0.280859375, + "learning_rate": 0.0001, + "loss": 6.1428, + "loss/crossentropy": 2.6427528858184814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4296632409095764, + "step": 826 + }, + { + "epoch": 0.01656, + "grad_norm": 4.0, + "grad_norm_var": 0.28609619140625, + "learning_rate": 0.0001, + "loss": 5.8304, + "loss/crossentropy": 2.1534847617149353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37605202198028564, + "step": 828 + }, + { + "epoch": 0.0166, + "grad_norm": 4.28125, + "grad_norm_var": 0.2637278238932292, + "learning_rate": 0.0001, + "loss": 6.2115, + "loss/crossentropy": 1.9642478227615356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.43740569055080414, + "step": 830 + }, + { + "epoch": 0.01664, + "grad_norm": 4.59375, + "grad_norm_var": 3.3863433837890624, + "learning_rate": 0.0001, + "loss": 6.5306, + "loss/crossentropy": 2.283148407936096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3901669532060623, + "step": 832 + }, + { + "epoch": 0.01668, + "grad_norm": 3.46875, + "grad_norm_var": 3.382255045572917, + "learning_rate": 0.0001, + "loss": 6.0831, + "loss/crossentropy": 2.418351888656616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4011431038379669, + "step": 834 + }, + { + "epoch": 0.01672, + "grad_norm": 4.03125, + "grad_norm_var": 3.322565714518229, + "learning_rate": 0.0001, + "loss": 6.1852, + "loss/crossentropy": 2.40928852558136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40596309304237366, + "step": 836 + }, + { + "epoch": 0.01676, + "grad_norm": 3.34375, + "grad_norm_var": 3.3059234619140625, + "learning_rate": 0.0001, + "loss": 5.791, + "loss/crossentropy": 2.4211392402648926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3640473484992981, + "step": 838 + }, + { + "epoch": 0.0168, + "grad_norm": 3.484375, + "grad_norm_var": 3.2890625, + "learning_rate": 0.0001, + "loss": 5.6553, + "loss/crossentropy": 2.047215461730957, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3346950262784958, + "step": 840 + }, + { + "epoch": 0.01684, + "grad_norm": 3.578125, + "grad_norm_var": 3.28623046875, + "learning_rate": 0.0001, + "loss": 5.7093, + "loss/crossentropy": 2.020021378993988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3442958742380142, + "step": 842 + }, + { + "epoch": 0.01688, + "grad_norm": 3.96875, + "grad_norm_var": 3.27164306640625, + "learning_rate": 0.0001, + "loss": 6.1484, + "loss/crossentropy": 2.1585127115249634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3768642693758011, + "step": 844 + }, + { + "epoch": 0.01692, + "grad_norm": 3.84375, + "grad_norm_var": 3.298802693684896, + "learning_rate": 0.0001, + "loss": 5.8512, + "loss/crossentropy": 2.2717286348342896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3644135594367981, + "step": 846 + }, + { + "epoch": 0.01696, + "grad_norm": 3.828125, + "grad_norm_var": 0.06539713541666667, + "learning_rate": 0.0001, + "loss": 5.9203, + "loss/crossentropy": 2.2683321237564087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3795373737812042, + "step": 848 + }, + { + "epoch": 0.017, + "grad_norm": 3.1875, + "grad_norm_var": 0.07066141764322917, + "learning_rate": 0.0001, + "loss": 5.7235, + "loss/crossentropy": 2.1189464330673218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35512739419937134, + "step": 850 + }, + { + "epoch": 0.01704, + "grad_norm": 3.65625, + "grad_norm_var": 0.060384114583333336, + "learning_rate": 0.0001, + "loss": 5.6536, + "loss/crossentropy": 2.260777235031128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35152101516723633, + "step": 852 + }, + { + "epoch": 0.01708, + "grad_norm": 3.671875, + "grad_norm_var": 0.06897379557291666, + "learning_rate": 0.0001, + "loss": 5.93, + "loss/crossentropy": 2.323577642440796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32252687215805054, + "step": 854 + }, + { + "epoch": 0.01712, + "grad_norm": 3.71875, + "grad_norm_var": 0.07024637858072917, + "learning_rate": 0.0001, + "loss": 6.1006, + "loss/crossentropy": 2.5965300798416138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.43394728004932404, + "step": 856 + }, + { + "epoch": 0.01716, + "grad_norm": 3.140625, + "grad_norm_var": 0.08170572916666667, + "learning_rate": 0.0001, + "loss": 5.8612, + "loss/crossentropy": 2.078580856323242, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3437638282775879, + "step": 858 + }, + { + "epoch": 0.0172, + "grad_norm": 3.359375, + "grad_norm_var": 0.08163960774739583, + "learning_rate": 0.0001, + "loss": 5.8394, + "loss/crossentropy": 2.425456404685974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3522993326187134, + "step": 860 + }, + { + "epoch": 0.01724, + "grad_norm": 3.515625, + "grad_norm_var": 0.08478190104166666, + "learning_rate": 0.0001, + "loss": 6.0154, + "loss/crossentropy": 2.2830835580825806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37188920378685, + "step": 862 + }, + { + "epoch": 0.01728, + "grad_norm": 3.8125, + "grad_norm_var": 0.06896870930989583, + "learning_rate": 0.0001, + "loss": 5.9086, + "loss/crossentropy": 2.090674340724945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32989686727523804, + "step": 864 + }, + { + "epoch": 0.01732, + "grad_norm": 3.8125, + "grad_norm_var": 0.0698394775390625, + "learning_rate": 0.0001, + "loss": 5.9617, + "loss/crossentropy": 2.304458498954773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37342821061611176, + "step": 866 + }, + { + "epoch": 0.01736, + "grad_norm": 3.421875, + "grad_norm_var": 0.0715484619140625, + "learning_rate": 0.0001, + "loss": 5.8593, + "loss/crossentropy": 2.6545844078063965, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3627774566411972, + "step": 868 + }, + { + "epoch": 0.0174, + "grad_norm": 3.3125, + "grad_norm_var": 0.059789021809895836, + "learning_rate": 0.0001, + "loss": 5.6956, + "loss/crossentropy": 1.9977945685386658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.318715900182724, + "step": 870 + }, + { + "epoch": 0.01744, + "grad_norm": 3.203125, + "grad_norm_var": 0.08033854166666667, + "learning_rate": 0.0001, + "loss": 5.7408, + "loss/crossentropy": 1.9226595759391785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3187277615070343, + "step": 872 + }, + { + "epoch": 0.01748, + "grad_norm": 3.6875, + "grad_norm_var": 0.0694732666015625, + "learning_rate": 0.0001, + "loss": 5.9863, + "loss/crossentropy": 2.323302686214447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36724327504634857, + "step": 874 + }, + { + "epoch": 0.01752, + "grad_norm": 3.5625, + "grad_norm_var": 0.07043355305989583, + "learning_rate": 0.0001, + "loss": 5.9913, + "loss/crossentropy": 2.254343032836914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3444042354822159, + "step": 876 + }, + { + "epoch": 0.01756, + "grad_norm": 3.296875, + "grad_norm_var": 0.0774078369140625, + "learning_rate": 0.0001, + "loss": 5.5038, + "loss/crossentropy": 2.0819836854934692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3477388769388199, + "step": 878 + }, + { + "epoch": 0.0176, + "grad_norm": 3.703125, + "grad_norm_var": 0.10198160807291666, + "learning_rate": 0.0001, + "loss": 5.9947, + "loss/crossentropy": 2.377693295478821, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36597058176994324, + "step": 880 + }, + { + "epoch": 0.01764, + "grad_norm": 4.21875, + "grad_norm_var": 0.11789449055989583, + "learning_rate": 0.0001, + "loss": 6.3011, + "loss/crossentropy": 2.5598798990249634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4082639515399933, + "step": 882 + }, + { + "epoch": 0.01768, + "grad_norm": 3.6875, + "grad_norm_var": 0.1294830322265625, + "learning_rate": 0.0001, + "loss": 5.9966, + "loss/crossentropy": 2.2847843170166016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3531750440597534, + "step": 884 + }, + { + "epoch": 0.01772, + "grad_norm": 3.390625, + "grad_norm_var": 0.12878316243489582, + "learning_rate": 0.0001, + "loss": 5.6685, + "loss/crossentropy": 1.8283140063285828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.336555078625679, + "step": 886 + }, + { + "epoch": 0.01776, + "grad_norm": 3.65625, + "grad_norm_var": 0.09971415201822917, + "learning_rate": 0.0001, + "loss": 5.9507, + "loss/crossentropy": 2.1001436710357666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3380406051874161, + "step": 888 + }, + { + "epoch": 0.0178, + "grad_norm": 4.21875, + "grad_norm_var": 0.11876627604166666, + "learning_rate": 0.0001, + "loss": 5.7552, + "loss/crossentropy": 2.0079030990600586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35362809896469116, + "step": 890 + }, + { + "epoch": 0.01784, + "grad_norm": 4.46875, + "grad_norm_var": 0.15650634765625, + "learning_rate": 0.0001, + "loss": 5.7416, + "loss/crossentropy": 2.176286220550537, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34369874000549316, + "step": 892 + }, + { + "epoch": 0.01788, + "grad_norm": 3.984375, + "grad_norm_var": 0.13087565104166668, + "learning_rate": 0.0001, + "loss": 5.9236, + "loss/crossentropy": 2.17675244808197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3495863378047943, + "step": 894 + }, + { + "epoch": 0.01792, + "grad_norm": 4.0, + "grad_norm_var": 0.12189127604166666, + "learning_rate": 0.0001, + "loss": 5.9109, + "loss/crossentropy": 2.312318801879883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40353919565677643, + "step": 896 + }, + { + "epoch": 0.01796, + "grad_norm": 3.375, + "grad_norm_var": 0.14010009765625, + "learning_rate": 0.0001, + "loss": 6.071, + "loss/crossentropy": 2.28923499584198, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3770768642425537, + "step": 898 + }, + { + "epoch": 0.018, + "grad_norm": 3.546875, + "grad_norm_var": 0.1447662353515625, + "learning_rate": 0.0001, + "loss": 6.0275, + "loss/crossentropy": 2.2060720920562744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35431359708309174, + "step": 900 + }, + { + "epoch": 0.01804, + "grad_norm": 3.296875, + "grad_norm_var": 0.17552083333333332, + "learning_rate": 0.0001, + "loss": 5.4052, + "loss/crossentropy": 2.0325206518173218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3037416934967041, + "step": 902 + }, + { + "epoch": 0.01808, + "grad_norm": 3.546875, + "grad_norm_var": 0.17635091145833334, + "learning_rate": 0.0001, + "loss": 5.9148, + "loss/crossentropy": 2.1943042278289795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3509814143180847, + "step": 904 + }, + { + "epoch": 0.01812, + "grad_norm": 3.4375, + "grad_norm_var": 0.1696197509765625, + "learning_rate": 0.0001, + "loss": 5.563, + "loss/crossentropy": 1.9589214324951172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31621459126472473, + "step": 906 + }, + { + "epoch": 0.01816, + "grad_norm": 3.375, + "grad_norm_var": 0.12841389973958334, + "learning_rate": 0.0001, + "loss": 5.6511, + "loss/crossentropy": 2.329489588737488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3837278485298157, + "step": 908 + }, + { + "epoch": 0.0182, + "grad_norm": 3.5625, + "grad_norm_var": 0.09648030598958333, + "learning_rate": 0.0001, + "loss": 5.8082, + "loss/crossentropy": 2.1757726669311523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3659580200910568, + "step": 910 + }, + { + "epoch": 0.01824, + "grad_norm": 3.75, + "grad_norm_var": 0.08772786458333333, + "learning_rate": 0.0001, + "loss": 5.7372, + "loss/crossentropy": 2.1498661041259766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3474857211112976, + "step": 912 + }, + { + "epoch": 0.01828, + "grad_norm": 15.8125, + "grad_norm_var": 9.504325358072917, + "learning_rate": 0.0001, + "loss": 5.9297, + "loss/crossentropy": 2.4722740650177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.505307987332344, + "step": 914 + }, + { + "epoch": 0.01832, + "grad_norm": 9.0, + "grad_norm_var": 10.75227762858073, + "learning_rate": 0.0001, + "loss": 5.6296, + "loss/crossentropy": 1.855428695678711, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31347331404685974, + "step": 916 + }, + { + "epoch": 0.01836, + "grad_norm": 3.75, + "grad_norm_var": 10.50523173014323, + "learning_rate": 0.0001, + "loss": 5.9769, + "loss/crossentropy": 2.326256275177002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3996751010417938, + "step": 918 + }, + { + "epoch": 0.0184, + "grad_norm": 3.546875, + "grad_norm_var": 10.518257649739583, + "learning_rate": 0.0001, + "loss": 5.8444, + "loss/crossentropy": 2.3712844848632812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37993232905864716, + "step": 920 + }, + { + "epoch": 0.01844, + "grad_norm": 3.5, + "grad_norm_var": 10.657861328125, + "learning_rate": 0.0001, + "loss": 5.5577, + "loss/crossentropy": 2.0161430835723877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3590858578681946, + "step": 922 + }, + { + "epoch": 0.01848, + "grad_norm": 5.40625, + "grad_norm_var": 10.520926920572917, + "learning_rate": 0.0001, + "loss": 5.6675, + "loss/crossentropy": 2.2401121258735657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33380126953125, + "step": 924 + }, + { + "epoch": 0.01852, + "grad_norm": 3.328125, + "grad_norm_var": 10.598356119791667, + "learning_rate": 0.0001, + "loss": 5.9331, + "loss/crossentropy": 2.2354423999786377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34832654893398285, + "step": 926 + }, + { + "epoch": 0.01856, + "grad_norm": 3.46875, + "grad_norm_var": 10.647850545247396, + "learning_rate": 0.0001, + "loss": 5.5212, + "loss/crossentropy": 2.158566176891327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3664778769016266, + "step": 928 + }, + { + "epoch": 0.0186, + "grad_norm": 3.703125, + "grad_norm_var": 2.1230377197265624, + "learning_rate": 0.0001, + "loss": 5.9146, + "loss/crossentropy": 2.270231008529663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35586032271385193, + "step": 930 + }, + { + "epoch": 0.01864, + "grad_norm": 3.609375, + "grad_norm_var": 0.3744303385416667, + "learning_rate": 0.0001, + "loss": 6.0013, + "loss/crossentropy": 2.233540892601013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3789799362421036, + "step": 932 + }, + { + "epoch": 0.01868, + "grad_norm": 3.65625, + "grad_norm_var": 0.27898763020833334, + "learning_rate": 0.0001, + "loss": 5.5019, + "loss/crossentropy": 1.9381731152534485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28776855766773224, + "step": 934 + }, + { + "epoch": 0.01872, + "grad_norm": 3.34375, + "grad_norm_var": 0.28227437337239586, + "learning_rate": 0.0001, + "loss": 5.6759, + "loss/crossentropy": 2.4225244522094727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.355392187833786, + "step": 936 + }, + { + "epoch": 0.01876, + "grad_norm": 3.15625, + "grad_norm_var": 0.28084208170572916, + "learning_rate": 0.0001, + "loss": 5.911, + "loss/crossentropy": 2.58090603351593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3640855699777603, + "step": 938 + }, + { + "epoch": 0.0188, + "grad_norm": 3.5625, + "grad_norm_var": 0.03673502604166667, + "learning_rate": 0.0001, + "loss": 6.0623, + "loss/crossentropy": 2.436452269554138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3797999918460846, + "step": 940 + }, + { + "epoch": 0.01884, + "grad_norm": 3.3125, + "grad_norm_var": 0.03680013020833333, + "learning_rate": 0.0001, + "loss": 5.7428, + "loss/crossentropy": 2.0378769636154175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3278265744447708, + "step": 942 + }, + { + "epoch": 0.01888, + "grad_norm": 3.546875, + "grad_norm_var": 0.03860270182291667, + "learning_rate": 0.0001, + "loss": 5.6211, + "loss/crossentropy": 2.1212962865829468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34586282074451447, + "step": 944 + }, + { + "epoch": 0.01892, + "grad_norm": 3.515625, + "grad_norm_var": 0.040087890625, + "learning_rate": 0.0001, + "loss": 5.6695, + "loss/crossentropy": 2.1884353160858154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40608666837215424, + "step": 946 + }, + { + "epoch": 0.01896, + "grad_norm": 3.671875, + "grad_norm_var": 0.046858723958333334, + "learning_rate": 0.0001, + "loss": 5.6684, + "loss/crossentropy": 2.2093260288238525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3519093841314316, + "step": 948 + }, + { + "epoch": 0.019, + "grad_norm": 3.328125, + "grad_norm_var": 0.03243815104166667, + "learning_rate": 0.0001, + "loss": 6.0842, + "loss/crossentropy": 2.4246588945388794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37247334420681, + "step": 950 + }, + { + "epoch": 0.01904, + "grad_norm": 3.671875, + "grad_norm_var": 0.031022135416666666, + "learning_rate": 0.0001, + "loss": 5.6116, + "loss/crossentropy": 1.932490050792694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32985979318618774, + "step": 952 + }, + { + "epoch": 0.01908, + "grad_norm": 3.90625, + "grad_norm_var": 0.7084920247395833, + "learning_rate": 0.0001, + "loss": 5.7393, + "loss/crossentropy": 2.4439035654067993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38164034485816956, + "step": 954 + }, + { + "epoch": 0.01912, + "grad_norm": 3.296875, + "grad_norm_var": 0.72437744140625, + "learning_rate": 0.0001, + "loss": 5.6255, + "loss/crossentropy": 1.8876591920852661, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3267661929130554, + "step": 956 + }, + { + "epoch": 0.01916, + "grad_norm": 3.578125, + "grad_norm_var": 0.6990193684895833, + "learning_rate": 0.0001, + "loss": 5.7367, + "loss/crossentropy": 2.284990072250366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34810060262680054, + "step": 958 + }, + { + "epoch": 0.0192, + "grad_norm": 3.53125, + "grad_norm_var": 0.6961008707682291, + "learning_rate": 0.0001, + "loss": 5.888, + "loss/crossentropy": 2.333263397216797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.42767176032066345, + "step": 960 + }, + { + "epoch": 0.01924, + "grad_norm": 3.296875, + "grad_norm_var": 0.7323527018229167, + "learning_rate": 0.0001, + "loss": 5.6021, + "loss/crossentropy": 2.2526148557662964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3284706473350525, + "step": 962 + }, + { + "epoch": 0.01928, + "grad_norm": 4.25, + "grad_norm_var": 0.7586252848307292, + "learning_rate": 0.0001, + "loss": 5.5479, + "loss/crossentropy": 2.1782984137535095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35957905650138855, + "step": 964 + }, + { + "epoch": 0.01932, + "grad_norm": 4.28125, + "grad_norm_var": 0.75205078125, + "learning_rate": 0.0001, + "loss": 6.2783, + "loss/crossentropy": 2.292098045349121, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35862940549850464, + "step": 966 + }, + { + "epoch": 0.01936, + "grad_norm": 3.546875, + "grad_norm_var": 0.7503214518229167, + "learning_rate": 0.0001, + "loss": 5.9252, + "loss/crossentropy": 2.102781653404236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3527261018753052, + "step": 968 + }, + { + "epoch": 0.0194, + "grad_norm": 3.453125, + "grad_norm_var": 0.11111653645833333, + "learning_rate": 0.0001, + "loss": 5.8891, + "loss/crossentropy": 2.223380208015442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3608042299747467, + "step": 970 + }, + { + "epoch": 0.01944, + "grad_norm": 3.4375, + "grad_norm_var": 0.10501200358072917, + "learning_rate": 0.0001, + "loss": 5.3348, + "loss/crossentropy": 2.0684096813201904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31996411085128784, + "step": 972 + }, + { + "epoch": 0.01948, + "grad_norm": 4.15625, + "grad_norm_var": 0.126708984375, + "learning_rate": 0.0001, + "loss": 5.6642, + "loss/crossentropy": 2.2011090517044067, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34404022991657257, + "step": 974 + }, + { + "epoch": 0.01952, + "grad_norm": 3.140625, + "grad_norm_var": 0.14937235514322916, + "learning_rate": 0.0001, + "loss": 5.5033, + "loss/crossentropy": 2.027641534805298, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31718479096889496, + "step": 976 + }, + { + "epoch": 0.01956, + "grad_norm": 3.484375, + "grad_norm_var": 0.13909403483072916, + "learning_rate": 0.0001, + "loss": 5.7294, + "loss/crossentropy": 2.311842203140259, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3904002010822296, + "step": 978 + }, + { + "epoch": 0.0196, + "grad_norm": 3.859375, + "grad_norm_var": 0.11236572265625, + "learning_rate": 0.0001, + "loss": 5.4402, + "loss/crossentropy": 2.3605271577835083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38789358735084534, + "step": 980 + }, + { + "epoch": 0.01964, + "grad_norm": 3.765625, + "grad_norm_var": 0.095166015625, + "learning_rate": 0.0001, + "loss": 6.1094, + "loss/crossentropy": 2.1687097549438477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3534909188747406, + "step": 982 + }, + { + "epoch": 0.01968, + "grad_norm": 3.6875, + "grad_norm_var": 0.0962554931640625, + "learning_rate": 0.0001, + "loss": 5.6656, + "loss/crossentropy": 2.194393038749695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34674669802188873, + "step": 984 + }, + { + "epoch": 0.01972, + "grad_norm": 3.359375, + "grad_norm_var": 0.09602762858072916, + "learning_rate": 0.0001, + "loss": 5.5513, + "loss/crossentropy": 2.1355903148651123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3180558532476425, + "step": 986 + }, + { + "epoch": 0.01976, + "grad_norm": 3.390625, + "grad_norm_var": 0.08559468587239584, + "learning_rate": 0.0001, + "loss": 5.9431, + "loss/crossentropy": 2.2688111066818237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.365144744515419, + "step": 988 + }, + { + "epoch": 0.0198, + "grad_norm": 3.65625, + "grad_norm_var": 0.06303609212239583, + "learning_rate": 0.0001, + "loss": 5.5117, + "loss/crossentropy": 2.423216700553894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34281550347805023, + "step": 990 + }, + { + "epoch": 0.01984, + "grad_norm": 3.375, + "grad_norm_var": 0.0419586181640625, + "learning_rate": 0.0001, + "loss": 5.4698, + "loss/crossentropy": 1.9360128045082092, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3343205749988556, + "step": 992 + }, + { + "epoch": 0.01988, + "grad_norm": 3.421875, + "grad_norm_var": 0.04468994140625, + "learning_rate": 0.0001, + "loss": 5.8128, + "loss/crossentropy": 2.181576132774353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35567884147167206, + "step": 994 + }, + { + "epoch": 0.01992, + "grad_norm": 3.578125, + "grad_norm_var": 0.037398274739583334, + "learning_rate": 0.0001, + "loss": 5.9295, + "loss/crossentropy": 2.166663408279419, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34105008840560913, + "step": 996 + }, + { + "epoch": 0.01996, + "grad_norm": 3.203125, + "grad_norm_var": 0.027408854166666666, + "learning_rate": 0.0001, + "loss": 5.5579, + "loss/crossentropy": 2.285332202911377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35812389850616455, + "step": 998 + }, + { + "epoch": 0.02, + "grad_norm": 4.0, + "grad_norm_var": 35.396484375, + "learning_rate": 0.0001, + "loss": 6.3986, + "loss/crossentropy": 2.088365077972412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3717931807041168, + "step": 1000 + }, + { + "epoch": 0.02004, + "grad_norm": 4.25, + "grad_norm_var": 35.223714192708336, + "learning_rate": 0.0001, + "loss": 6.1327, + "loss/crossentropy": 2.4051828384399414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.420933797955513, + "step": 1002 + }, + { + "epoch": 0.02008, + "grad_norm": 3.265625, + "grad_norm_var": 35.27108968098958, + "learning_rate": 0.0001, + "loss": 5.6789, + "loss/crossentropy": 2.3092572689056396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37571050226688385, + "step": 1004 + }, + { + "epoch": 0.02012, + "grad_norm": 3.640625, + "grad_norm_var": 35.29562072753906, + "learning_rate": 0.0001, + "loss": 5.6972, + "loss/crossentropy": 2.147248387336731, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29974566400051117, + "step": 1006 + }, + { + "epoch": 0.02016, + "grad_norm": 3.984375, + "grad_norm_var": 35.141299438476565, + "learning_rate": 0.0001, + "loss": 5.988, + "loss/crossentropy": 2.3385868668556213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4254964739084244, + "step": 1008 + }, + { + "epoch": 0.0202, + "grad_norm": 3.625, + "grad_norm_var": 35.15125223795573, + "learning_rate": 0.0001, + "loss": 5.7553, + "loss/crossentropy": 2.142681658267975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3764440715312958, + "step": 1010 + }, + { + "epoch": 0.02024, + "grad_norm": 3.34375, + "grad_norm_var": 35.18043619791667, + "learning_rate": 0.0001, + "loss": 5.5947, + "loss/crossentropy": 2.241790771484375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3385400176048279, + "step": 1012 + }, + { + "epoch": 0.02028, + "grad_norm": 3.859375, + "grad_norm_var": 34.836360677083334, + "learning_rate": 0.0001, + "loss": 6.3228, + "loss/crossentropy": 2.1563867330551147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3954617381095886, + "step": 1014 + }, + { + "epoch": 0.02032, + "grad_norm": 3.484375, + "grad_norm_var": 0.09921468098958333, + "learning_rate": 0.0001, + "loss": 5.4613, + "loss/crossentropy": 1.9462800025939941, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32294341921806335, + "step": 1016 + }, + { + "epoch": 0.02036, + "grad_norm": 3.53125, + "grad_norm_var": 0.08026936848958334, + "learning_rate": 0.0001, + "loss": 5.4993, + "loss/crossentropy": 1.83676278591156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3142092078924179, + "step": 1018 + }, + { + "epoch": 0.0204, + "grad_norm": 3.84375, + "grad_norm_var": 0.09038798014322917, + "learning_rate": 0.0001, + "loss": 5.8174, + "loss/crossentropy": 1.951962649822235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3889008015394211, + "step": 1020 + }, + { + "epoch": 0.02044, + "grad_norm": 3.578125, + "grad_norm_var": 0.09976806640625, + "learning_rate": 0.0001, + "loss": 5.7173, + "loss/crossentropy": 2.299771785736084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3214885741472244, + "step": 1022 + }, + { + "epoch": 0.02048, + "grad_norm": 3.1875, + "grad_norm_var": 0.09135640462239583, + "learning_rate": 0.0001, + "loss": 5.2291, + "loss/crossentropy": 1.9117569327354431, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32827669382095337, + "step": 1024 + }, + { + "epoch": 0.02052, + "grad_norm": 3.1875, + "grad_norm_var": 0.096533203125, + "learning_rate": 0.0001, + "loss": 5.7974, + "loss/crossentropy": 2.484488010406494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34361426532268524, + "step": 1026 + }, + { + "epoch": 0.02056, + "grad_norm": 3.59375, + "grad_norm_var": 0.09973551432291666, + "learning_rate": 0.0001, + "loss": 5.7044, + "loss/crossentropy": 2.3155311346054077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3734404444694519, + "step": 1028 + }, + { + "epoch": 0.0206, + "grad_norm": 3.125, + "grad_norm_var": 0.04983317057291667, + "learning_rate": 0.0001, + "loss": 5.4202, + "loss/crossentropy": 2.081188380718231, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3250262886285782, + "step": 1030 + }, + { + "epoch": 0.02064, + "grad_norm": 3.09375, + "grad_norm_var": 0.0574127197265625, + "learning_rate": 0.0001, + "loss": 5.5885, + "loss/crossentropy": 2.044768512248993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34609031677246094, + "step": 1032 + }, + { + "epoch": 0.02068, + "grad_norm": 4.0, + "grad_norm_var": 0.0875640869140625, + "learning_rate": 0.0001, + "loss": 6.078, + "loss/crossentropy": 2.0666560530662537, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.365878626704216, + "step": 1034 + }, + { + "epoch": 0.02072, + "grad_norm": 3.5625, + "grad_norm_var": 0.08336588541666666, + "learning_rate": 0.0001, + "loss": 5.9891, + "loss/crossentropy": 2.2933902740478516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3754771202802658, + "step": 1036 + }, + { + "epoch": 0.02076, + "grad_norm": 3.28125, + "grad_norm_var": 0.08640848795572917, + "learning_rate": 0.0001, + "loss": 5.8105, + "loss/crossentropy": 2.28829288482666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3676798492670059, + "step": 1038 + }, + { + "epoch": 0.0208, + "grad_norm": 3.5, + "grad_norm_var": 0.08378499348958333, + "learning_rate": 0.0001, + "loss": 5.9305, + "loss/crossentropy": 2.5891193151474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40094244480133057, + "step": 1040 + }, + { + "epoch": 0.02084, + "grad_norm": 3.234375, + "grad_norm_var": 0.08056233723958334, + "learning_rate": 0.0001, + "loss": 5.8579, + "loss/crossentropy": 2.238967180252075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3226759731769562, + "step": 1042 + }, + { + "epoch": 0.02088, + "grad_norm": 3.296875, + "grad_norm_var": 0.07796223958333333, + "learning_rate": 0.0001, + "loss": 5.2959, + "loss/crossentropy": 2.0116711258888245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3511478453874588, + "step": 1044 + }, + { + "epoch": 0.02092, + "grad_norm": 3.515625, + "grad_norm_var": 0.07344462076822916, + "learning_rate": 0.0001, + "loss": 5.4885, + "loss/crossentropy": 2.4924051761627197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3375450670719147, + "step": 1046 + }, + { + "epoch": 0.02096, + "grad_norm": 3.234375, + "grad_norm_var": 0.059235636393229166, + "learning_rate": 0.0001, + "loss": 5.6838, + "loss/crossentropy": 2.138728439807892, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34410610795021057, + "step": 1048 + }, + { + "epoch": 0.021, + "grad_norm": 3.515625, + "grad_norm_var": 0.046122233072916664, + "learning_rate": 0.0001, + "loss": 5.7371, + "loss/crossentropy": 2.3748635053634644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3580169975757599, + "step": 1050 + }, + { + "epoch": 0.02104, + "grad_norm": 3.125, + "grad_norm_var": 0.044310506184895834, + "learning_rate": 0.0001, + "loss": 5.427, + "loss/crossentropy": 2.061552882194519, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31069953739643097, + "step": 1052 + }, + { + "epoch": 0.02108, + "grad_norm": 3.515625, + "grad_norm_var": 0.03769124348958333, + "learning_rate": 0.0001, + "loss": 5.3469, + "loss/crossentropy": 2.299555718898773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31592319905757904, + "step": 1054 + }, + { + "epoch": 0.02112, + "grad_norm": 3.53125, + "grad_norm_var": 0.04571940104166667, + "learning_rate": 0.0001, + "loss": 6.1254, + "loss/crossentropy": 2.4866377115249634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37435297667980194, + "step": 1056 + }, + { + "epoch": 0.02116, + "grad_norm": 3.203125, + "grad_norm_var": 0.047684733072916666, + "learning_rate": 0.0001, + "loss": 5.5676, + "loss/crossentropy": 1.8185054063796997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2784232199192047, + "step": 1058 + }, + { + "epoch": 0.0212, + "grad_norm": 3.3125, + "grad_norm_var": 0.04840087890625, + "learning_rate": 0.0001, + "loss": 5.6184, + "loss/crossentropy": 2.215538501739502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3471361994743347, + "step": 1060 + }, + { + "epoch": 0.02124, + "grad_norm": 3.078125, + "grad_norm_var": 0.057616170247395834, + "learning_rate": 0.0001, + "loss": 5.7615, + "loss/crossentropy": 2.6912894248962402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35911867022514343, + "step": 1062 + }, + { + "epoch": 0.02128, + "grad_norm": 3.125, + "grad_norm_var": 0.06670633951822917, + "learning_rate": 0.0001, + "loss": 5.3436, + "loss/crossentropy": 1.9757090210914612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29868973791599274, + "step": 1064 + }, + { + "epoch": 0.02132, + "grad_norm": 3.21875, + "grad_norm_var": 0.052294921875, + "learning_rate": 0.0001, + "loss": 5.5334, + "loss/crossentropy": 2.3396666049957275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3561312407255173, + "step": 1066 + }, + { + "epoch": 0.02136, + "grad_norm": 3.359375, + "grad_norm_var": 0.0477935791015625, + "learning_rate": 0.0001, + "loss": 5.7564, + "loss/crossentropy": 2.3498982191085815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3842166066169739, + "step": 1068 + }, + { + "epoch": 0.0214, + "grad_norm": 3.234375, + "grad_norm_var": 0.03717041015625, + "learning_rate": 0.0001, + "loss": 5.8936, + "loss/crossentropy": 2.037585139274597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3429017663002014, + "step": 1070 + }, + { + "epoch": 0.02144, + "grad_norm": 3.484375, + "grad_norm_var": 0.0216949462890625, + "learning_rate": 0.0001, + "loss": 5.6938, + "loss/crossentropy": 2.4804376363754272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35338981449604034, + "step": 1072 + }, + { + "epoch": 0.02148, + "grad_norm": 3.375, + "grad_norm_var": 0.027448527018229165, + "learning_rate": 0.0001, + "loss": 5.8146, + "loss/crossentropy": 2.5210201740264893, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3769296407699585, + "step": 1074 + }, + { + "epoch": 0.02152, + "grad_norm": 3.40625, + "grad_norm_var": 0.03369852701822917, + "learning_rate": 0.0001, + "loss": 5.7854, + "loss/crossentropy": 2.1258187294006348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3353133201599121, + "step": 1076 + }, + { + "epoch": 0.02156, + "grad_norm": 3.328125, + "grad_norm_var": 0.026676432291666666, + "learning_rate": 0.0001, + "loss": 5.5436, + "loss/crossentropy": 2.2107938528060913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34667155146598816, + "step": 1078 + }, + { + "epoch": 0.0216, + "grad_norm": 3.171875, + "grad_norm_var": 0.0194244384765625, + "learning_rate": 0.0001, + "loss": 5.7639, + "loss/crossentropy": 1.9614633321762085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2958581745624542, + "step": 1080 + }, + { + "epoch": 0.02164, + "grad_norm": 3.515625, + "grad_norm_var": 0.0194976806640625, + "learning_rate": 0.0001, + "loss": 5.8219, + "loss/crossentropy": 2.1403249502182007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3220343589782715, + "step": 1082 + }, + { + "epoch": 0.02168, + "grad_norm": 3.28125, + "grad_norm_var": 0.022493489583333335, + "learning_rate": 0.0001, + "loss": 5.6037, + "loss/crossentropy": 1.6533048152923584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2814648747444153, + "step": 1084 + }, + { + "epoch": 0.02172, + "grad_norm": 3.59375, + "grad_norm_var": 0.0232086181640625, + "learning_rate": 0.0001, + "loss": 5.7731, + "loss/crossentropy": 2.69880211353302, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3835880607366562, + "step": 1086 + }, + { + "epoch": 0.02176, + "grad_norm": 3.5, + "grad_norm_var": 0.0240631103515625, + "learning_rate": 0.0001, + "loss": 5.8119, + "loss/crossentropy": 2.214504837989807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31686101853847504, + "step": 1088 + }, + { + "epoch": 0.0218, + "grad_norm": 3.765625, + "grad_norm_var": 0.027164713541666666, + "learning_rate": 0.0001, + "loss": 6.0376, + "loss/crossentropy": 2.2377456426620483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3682183176279068, + "step": 1090 + }, + { + "epoch": 0.02184, + "grad_norm": 3.359375, + "grad_norm_var": 0.027799479166666665, + "learning_rate": 0.0001, + "loss": 5.7387, + "loss/crossentropy": 2.0977545976638794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32313986122608185, + "step": 1092 + }, + { + "epoch": 0.02188, + "grad_norm": 3.125, + "grad_norm_var": 0.0371490478515625, + "learning_rate": 0.0001, + "loss": 5.6407, + "loss/crossentropy": 2.1717870235443115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3619600385427475, + "step": 1094 + }, + { + "epoch": 0.02192, + "grad_norm": 3.546875, + "grad_norm_var": 0.03674723307291667, + "learning_rate": 0.0001, + "loss": 5.554, + "loss/crossentropy": 2.2805471420288086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3328556418418884, + "step": 1096 + }, + { + "epoch": 0.02196, + "grad_norm": 3.71875, + "grad_norm_var": 0.04625244140625, + "learning_rate": 0.0001, + "loss": 5.8009, + "loss/crossentropy": 2.034846782684326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3610256612300873, + "step": 1098 + }, + { + "epoch": 0.022, + "grad_norm": 3.359375, + "grad_norm_var": 0.04612528483072917, + "learning_rate": 0.0001, + "loss": 5.6121, + "loss/crossentropy": 2.0208348631858826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3311958611011505, + "step": 1100 + }, + { + "epoch": 0.02204, + "grad_norm": 3.59375, + "grad_norm_var": 0.04737040201822917, + "learning_rate": 0.0001, + "loss": 5.7135, + "loss/crossentropy": 2.1020554900169373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3638540059328079, + "step": 1102 + }, + { + "epoch": 0.02208, + "grad_norm": 3.46875, + "grad_norm_var": 0.046019490559895834, + "learning_rate": 0.0001, + "loss": 5.5099, + "loss/crossentropy": 2.28346848487854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3434429168701172, + "step": 1104 + }, + { + "epoch": 0.02212, + "grad_norm": 3.21875, + "grad_norm_var": 0.1152740478515625, + "learning_rate": 0.0001, + "loss": 5.6866, + "loss/crossentropy": 2.103623867034912, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34000229835510254, + "step": 1106 + }, + { + "epoch": 0.02216, + "grad_norm": 4.1875, + "grad_norm_var": 0.14846089680989583, + "learning_rate": 0.0001, + "loss": 5.5196, + "loss/crossentropy": 2.205894947052002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3432965874671936, + "step": 1108 + }, + { + "epoch": 0.0222, + "grad_norm": 3.109375, + "grad_norm_var": 0.16467183430989582, + "learning_rate": 0.0001, + "loss": 5.6166, + "loss/crossentropy": 2.395035982131958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3834904432296753, + "step": 1110 + }, + { + "epoch": 0.02224, + "grad_norm": 3.296875, + "grad_norm_var": 0.16505533854166668, + "learning_rate": 0.0001, + "loss": 5.9042, + "loss/crossentropy": 2.3755353689193726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33256760239601135, + "step": 1112 + }, + { + "epoch": 0.02228, + "grad_norm": 3.28125, + "grad_norm_var": 0.16402079264322916, + "learning_rate": 0.0001, + "loss": 6.0135, + "loss/crossentropy": 2.6754449605941772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3524845540523529, + "step": 1114 + }, + { + "epoch": 0.02232, + "grad_norm": 3.515625, + "grad_norm_var": 0.15730692545572916, + "learning_rate": 0.0001, + "loss": 5.6448, + "loss/crossentropy": 2.2398552894592285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.343609020113945, + "step": 1116 + }, + { + "epoch": 0.02236, + "grad_norm": 3.3125, + "grad_norm_var": 0.16770833333333332, + "learning_rate": 0.0001, + "loss": 5.3075, + "loss/crossentropy": 2.399322271347046, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34514427185058594, + "step": 1118 + }, + { + "epoch": 0.0224, + "grad_norm": 3.3125, + "grad_norm_var": 0.17229410807291667, + "learning_rate": 0.0001, + "loss": 5.8886, + "loss/crossentropy": 2.4002050161361694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35784730315208435, + "step": 1120 + }, + { + "epoch": 0.02244, + "grad_norm": 3.21875, + "grad_norm_var": 0.10064188639322917, + "learning_rate": 0.0001, + "loss": 5.6239, + "loss/crossentropy": 2.293683171272278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37052902579307556, + "step": 1122 + }, + { + "epoch": 0.02248, + "grad_norm": 3.234375, + "grad_norm_var": 0.0532379150390625, + "learning_rate": 0.0001, + "loss": 5.8244, + "loss/crossentropy": 2.2177391052246094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3482564836740494, + "step": 1124 + }, + { + "epoch": 0.02252, + "grad_norm": 4.1875, + "grad_norm_var": 0.0637603759765625, + "learning_rate": 0.0001, + "loss": 5.8959, + "loss/crossentropy": 2.288211703300476, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3401540517807007, + "step": 1126 + }, + { + "epoch": 0.02256, + "grad_norm": 3.1875, + "grad_norm_var": 0.0933258056640625, + "learning_rate": 0.0001, + "loss": 5.5054, + "loss/crossentropy": 2.1786144971847534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.331302285194397, + "step": 1128 + }, + { + "epoch": 0.0226, + "grad_norm": 3.5, + "grad_norm_var": 0.09163004557291667, + "learning_rate": 0.0001, + "loss": 5.7604, + "loss/crossentropy": 2.0390175580978394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32953669130802155, + "step": 1130 + }, + { + "epoch": 0.02264, + "grad_norm": 3.5, + "grad_norm_var": 0.0921295166015625, + "learning_rate": 0.0001, + "loss": 5.6952, + "loss/crossentropy": 2.188543677330017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3585694134235382, + "step": 1132 + }, + { + "epoch": 0.02268, + "grad_norm": 3.296875, + "grad_norm_var": 0.08284098307291667, + "learning_rate": 0.0001, + "loss": 5.732, + "loss/crossentropy": 2.0731694102287292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3121813088655472, + "step": 1134 + }, + { + "epoch": 0.02272, + "grad_norm": 3.390625, + "grad_norm_var": 0.0794097900390625, + "learning_rate": 0.0001, + "loss": 5.6306, + "loss/crossentropy": 2.144552707672119, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3303966820240021, + "step": 1136 + }, + { + "epoch": 0.02276, + "grad_norm": 3.890625, + "grad_norm_var": 0.09900614420572916, + "learning_rate": 0.0001, + "loss": 5.3794, + "loss/crossentropy": 2.1617428064346313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3420899361371994, + "step": 1138 + }, + { + "epoch": 0.0228, + "grad_norm": 3.109375, + "grad_norm_var": 0.12813212076822916, + "learning_rate": 0.0001, + "loss": 5.2834, + "loss/crossentropy": 1.8098865747451782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28376901149749756, + "step": 1140 + }, + { + "epoch": 0.02284, + "grad_norm": 2.984375, + "grad_norm_var": 0.09795633951822917, + "learning_rate": 0.0001, + "loss": 5.3911, + "loss/crossentropy": 2.133797824382782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3587050139904022, + "step": 1142 + }, + { + "epoch": 0.02288, + "grad_norm": 4.96875, + "grad_norm_var": 0.24221903483072918, + "learning_rate": 0.0001, + "loss": 5.8787, + "loss/crossentropy": 2.378090739250183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4244185537099838, + "step": 1144 + }, + { + "epoch": 0.02292, + "grad_norm": 3.03125, + "grad_norm_var": 0.2736887613932292, + "learning_rate": 0.0001, + "loss": 5.6692, + "loss/crossentropy": 2.4442414045333862, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3569464683532715, + "step": 1146 + }, + { + "epoch": 0.02296, + "grad_norm": 3.671875, + "grad_norm_var": 0.29189453125, + "learning_rate": 0.0001, + "loss": 5.5468, + "loss/crossentropy": 2.6446973085403442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36859095096588135, + "step": 1148 + }, + { + "epoch": 0.023, + "grad_norm": 4.4375, + "grad_norm_var": 0.3506988525390625, + "learning_rate": 0.0001, + "loss": 5.9727, + "loss/crossentropy": 2.4100207090377808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4082309305667877, + "step": 1150 + }, + { + "epoch": 0.02304, + "grad_norm": 3.25, + "grad_norm_var": 0.3552317301432292, + "learning_rate": 0.0001, + "loss": 5.3537, + "loss/crossentropy": 2.1472485661506653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3299275189638138, + "step": 1152 + }, + { + "epoch": 0.02308, + "grad_norm": 3.21875, + "grad_norm_var": 0.3376373291015625, + "learning_rate": 0.0001, + "loss": 5.4393, + "loss/crossentropy": 2.1891872882843018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.356732040643692, + "step": 1154 + }, + { + "epoch": 0.02312, + "grad_norm": 3.71875, + "grad_norm_var": 0.30201416015625, + "learning_rate": 0.0001, + "loss": 6.0049, + "loss/crossentropy": 2.1432933807373047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.360423281788826, + "step": 1156 + }, + { + "epoch": 0.02316, + "grad_norm": 3.1875, + "grad_norm_var": 0.2923248291015625, + "learning_rate": 0.0001, + "loss": 5.5541, + "loss/crossentropy": 2.176819324493408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.340317040681839, + "step": 1158 + }, + { + "epoch": 0.0232, + "grad_norm": 2.96875, + "grad_norm_var": 0.17407938639322917, + "learning_rate": 0.0001, + "loss": 5.2312, + "loss/crossentropy": 2.325207471847534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3309635668992996, + "step": 1160 + }, + { + "epoch": 0.02324, + "grad_norm": 3.515625, + "grad_norm_var": 0.14537353515625, + "learning_rate": 0.0001, + "loss": 5.7322, + "loss/crossentropy": 2.2536743879318237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31422293186187744, + "step": 1162 + }, + { + "epoch": 0.02328, + "grad_norm": 3.328125, + "grad_norm_var": 0.12657877604166667, + "learning_rate": 0.0001, + "loss": 5.4354, + "loss/crossentropy": 2.180476427078247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2956361174583435, + "step": 1164 + }, + { + "epoch": 0.02332, + "grad_norm": 3.15625, + "grad_norm_var": 0.0527008056640625, + "learning_rate": 0.0001, + "loss": 5.6067, + "loss/crossentropy": 1.995088815689087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2936599552631378, + "step": 1166 + }, + { + "epoch": 0.02336, + "grad_norm": 3.1875, + "grad_norm_var": 0.05378316243489583, + "learning_rate": 0.0001, + "loss": 5.7304, + "loss/crossentropy": 2.2555994987487793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3630402684211731, + "step": 1168 + }, + { + "epoch": 0.0234, + "grad_norm": 3.296875, + "grad_norm_var": 0.05856119791666667, + "learning_rate": 0.0001, + "loss": 5.1613, + "loss/crossentropy": 2.0441415905952454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2960120141506195, + "step": 1170 + }, + { + "epoch": 0.02344, + "grad_norm": 3.25, + "grad_norm_var": 0.0204498291015625, + "learning_rate": 0.0001, + "loss": 5.2238, + "loss/crossentropy": 1.8090497255325317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29750876128673553, + "step": 1172 + }, + { + "epoch": 0.02348, + "grad_norm": 3.234375, + "grad_norm_var": 0.0281158447265625, + "learning_rate": 0.0001, + "loss": 5.2506, + "loss/crossentropy": 2.2780312299728394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3376633822917938, + "step": 1174 + }, + { + "epoch": 0.02352, + "grad_norm": 3.4375, + "grad_norm_var": 0.02506103515625, + "learning_rate": 0.0001, + "loss": 5.53, + "loss/crossentropy": 1.8047232627868652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29132279753685, + "step": 1176 + }, + { + "epoch": 0.02356, + "grad_norm": 3.265625, + "grad_norm_var": 0.020417277018229166, + "learning_rate": 0.0001, + "loss": 5.4149, + "loss/crossentropy": 2.203469753265381, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3124798536300659, + "step": 1178 + }, + { + "epoch": 0.0236, + "grad_norm": 3.34375, + "grad_norm_var": 0.019775390625, + "learning_rate": 0.0001, + "loss": 5.4481, + "loss/crossentropy": 2.078102231025696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.331977054476738, + "step": 1180 + }, + { + "epoch": 0.02364, + "grad_norm": 3.765625, + "grad_norm_var": 0.0368316650390625, + "learning_rate": 0.0001, + "loss": 5.93, + "loss/crossentropy": 2.2701854705810547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3202142268419266, + "step": 1182 + }, + { + "epoch": 0.02368, + "grad_norm": 3.671875, + "grad_norm_var": 0.047240193684895834, + "learning_rate": 0.0001, + "loss": 5.6414, + "loss/crossentropy": 1.930423617362976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3192301094532013, + "step": 1184 + }, + { + "epoch": 0.02372, + "grad_norm": 3.546875, + "grad_norm_var": 0.06542867024739583, + "learning_rate": 0.0001, + "loss": 6.0688, + "loss/crossentropy": 2.291581869125366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3759836256504059, + "step": 1186 + }, + { + "epoch": 0.02376, + "grad_norm": 3.375, + "grad_norm_var": 0.06529541015625, + "learning_rate": 0.0001, + "loss": 5.6689, + "loss/crossentropy": 1.9887789487838745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31285202503204346, + "step": 1188 + }, + { + "epoch": 0.0238, + "grad_norm": 3.109375, + "grad_norm_var": 0.054671223958333334, + "learning_rate": 0.0001, + "loss": 5.7457, + "loss/crossentropy": 2.290129065513611, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3461494445800781, + "step": 1190 + }, + { + "epoch": 0.02384, + "grad_norm": 3.6875, + "grad_norm_var": 0.0604888916015625, + "learning_rate": 0.0001, + "loss": 6.288, + "loss/crossentropy": 2.3252567052841187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3817393332719803, + "step": 1192 + }, + { + "epoch": 0.02388, + "grad_norm": 3.21875, + "grad_norm_var": 0.06230367024739583, + "learning_rate": 0.0001, + "loss": 5.4498, + "loss/crossentropy": 1.736217737197876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28928153216838837, + "step": 1194 + }, + { + "epoch": 0.02392, + "grad_norm": 3.09375, + "grad_norm_var": 0.06886393229166667, + "learning_rate": 0.0001, + "loss": 5.6234, + "loss/crossentropy": 2.112093210220337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3196643739938736, + "step": 1196 + }, + { + "epoch": 0.02396, + "grad_norm": 3.15625, + "grad_norm_var": 0.07258707682291667, + "learning_rate": 0.0001, + "loss": 5.3552, + "loss/crossentropy": 2.3334370851516724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3539857119321823, + "step": 1198 + }, + { + "epoch": 0.024, + "grad_norm": 3.484375, + "grad_norm_var": 0.09695536295572917, + "learning_rate": 0.0001, + "loss": 5.3293, + "loss/crossentropy": 1.7393567562103271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3211805671453476, + "step": 1200 + }, + { + "epoch": 0.02404, + "grad_norm": 3.4375, + "grad_norm_var": 0.07935791015625, + "learning_rate": 0.0001, + "loss": 5.6292, + "loss/crossentropy": 2.314788579940796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37599092721939087, + "step": 1202 + }, + { + "epoch": 0.02408, + "grad_norm": 3.1875, + "grad_norm_var": 0.076318359375, + "learning_rate": 0.0001, + "loss": 5.435, + "loss/crossentropy": 2.2820088863372803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33955833315849304, + "step": 1204 + }, + { + "epoch": 0.02412, + "grad_norm": 3.46875, + "grad_norm_var": 0.07183837890625, + "learning_rate": 0.0001, + "loss": 5.87, + "loss/crossentropy": 2.012476146221161, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3048281967639923, + "step": 1206 + }, + { + "epoch": 0.02416, + "grad_norm": 3.1875, + "grad_norm_var": 0.07356669108072916, + "learning_rate": 0.0001, + "loss": 5.6751, + "loss/crossentropy": 2.315858483314514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32628118991851807, + "step": 1208 + }, + { + "epoch": 0.0242, + "grad_norm": 3.359375, + "grad_norm_var": 0.14773763020833333, + "learning_rate": 0.0001, + "loss": 5.6813, + "loss/crossentropy": 2.3260581493377686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.352965384721756, + "step": 1210 + }, + { + "epoch": 0.02424, + "grad_norm": 4.0625, + "grad_norm_var": 0.1748687744140625, + "learning_rate": 0.0001, + "loss": 5.689, + "loss/crossentropy": 2.199007749557495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3089260905981064, + "step": 1212 + }, + { + "epoch": 0.02428, + "grad_norm": 3.703125, + "grad_norm_var": 1.4942047119140625, + "learning_rate": 0.0001, + "loss": 5.798, + "loss/crossentropy": 2.30281138420105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3629491478204727, + "step": 1214 + }, + { + "epoch": 0.02432, + "grad_norm": 3.1875, + "grad_norm_var": 1.5125640869140624, + "learning_rate": 0.0001, + "loss": 5.4927, + "loss/crossentropy": 2.238295316696167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3436104357242584, + "step": 1216 + }, + { + "epoch": 0.02436, + "grad_norm": 3.9375, + "grad_norm_var": 1.501488240559896, + "learning_rate": 0.0001, + "loss": 5.5763, + "loss/crossentropy": 2.52456271648407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40382860600948334, + "step": 1218 + }, + { + "epoch": 0.0244, + "grad_norm": 3.46875, + "grad_norm_var": 1.47197265625, + "learning_rate": 0.0001, + "loss": 5.8796, + "loss/crossentropy": 2.4516665935516357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.378818541765213, + "step": 1220 + }, + { + "epoch": 0.02444, + "grad_norm": 3.046875, + "grad_norm_var": 1.485480753580729, + "learning_rate": 0.0001, + "loss": 5.5438, + "loss/crossentropy": 2.593857169151306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3743816912174225, + "step": 1222 + }, + { + "epoch": 0.02448, + "grad_norm": 3.375, + "grad_norm_var": 1.4396799723307292, + "learning_rate": 0.0001, + "loss": 5.2807, + "loss/crossentropy": 1.873874843120575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2743247449398041, + "step": 1224 + }, + { + "epoch": 0.02452, + "grad_norm": 3.265625, + "grad_norm_var": 1.4642862955729166, + "learning_rate": 0.0001, + "loss": 5.6455, + "loss/crossentropy": 2.1173152923583984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.350399985909462, + "step": 1226 + }, + { + "epoch": 0.02456, + "grad_norm": 3.5, + "grad_norm_var": 1.4721018473307292, + "learning_rate": 0.0001, + "loss": 5.7318, + "loss/crossentropy": 2.3650271892547607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3241463750600815, + "step": 1228 + }, + { + "epoch": 0.0246, + "grad_norm": 3.1875, + "grad_norm_var": 0.08748270670572916, + "learning_rate": 0.0001, + "loss": 5.5229, + "loss/crossentropy": 2.180622935295105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31293927133083344, + "step": 1230 + }, + { + "epoch": 0.02464, + "grad_norm": 3.546875, + "grad_norm_var": 0.08329976399739583, + "learning_rate": 0.0001, + "loss": 5.7443, + "loss/crossentropy": 2.230265259742737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3405953049659729, + "step": 1232 + }, + { + "epoch": 0.02468, + "grad_norm": 3.6875, + "grad_norm_var": 0.07164713541666666, + "learning_rate": 0.0001, + "loss": 5.5518, + "loss/crossentropy": 2.050285518169403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32737791538238525, + "step": 1234 + }, + { + "epoch": 0.02472, + "grad_norm": 3.140625, + "grad_norm_var": 0.07604166666666666, + "learning_rate": 0.0001, + "loss": 5.4412, + "loss/crossentropy": 2.2357693910598755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34366659820079803, + "step": 1236 + }, + { + "epoch": 0.02476, + "grad_norm": 3.09375, + "grad_norm_var": 0.09081624348958334, + "learning_rate": 0.0001, + "loss": 5.6066, + "loss/crossentropy": 2.308778762817383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33825138211250305, + "step": 1238 + }, + { + "epoch": 0.0248, + "grad_norm": 3.390625, + "grad_norm_var": 0.0979888916015625, + "learning_rate": 0.0001, + "loss": 5.419, + "loss/crossentropy": 2.016503393650055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30135589838027954, + "step": 1240 + }, + { + "epoch": 0.02484, + "grad_norm": 3.359375, + "grad_norm_var": 0.0870025634765625, + "learning_rate": 0.0001, + "loss": 5.2494, + "loss/crossentropy": 1.8450073599815369, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34388674795627594, + "step": 1242 + }, + { + "epoch": 0.02488, + "grad_norm": 3.515625, + "grad_norm_var": 0.05263671875, + "learning_rate": 0.0001, + "loss": 5.7092, + "loss/crossentropy": 2.486730694770813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35796378552913666, + "step": 1244 + }, + { + "epoch": 0.02492, + "grad_norm": 2.859375, + "grad_norm_var": 0.0688385009765625, + "learning_rate": 0.0001, + "loss": 5.2565, + "loss/crossentropy": 2.010675370693207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31364670395851135, + "step": 1246 + }, + { + "epoch": 0.02496, + "grad_norm": 3.15625, + "grad_norm_var": 0.07418619791666667, + "learning_rate": 0.0001, + "loss": 5.366, + "loss/crossentropy": 2.128747880458832, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3106560483574867, + "step": 1248 + }, + { + "epoch": 0.025, + "grad_norm": 3.359375, + "grad_norm_var": 0.06523335774739583, + "learning_rate": 0.0001, + "loss": 5.8005, + "loss/crossentropy": 2.4563735723495483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33769866824150085, + "step": 1250 + }, + { + "epoch": 0.02504, + "grad_norm": 3.15625, + "grad_norm_var": 0.06453450520833333, + "learning_rate": 0.0001, + "loss": 5.372, + "loss/crossentropy": 2.3785592317581177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36167748272418976, + "step": 1252 + }, + { + "epoch": 0.02508, + "grad_norm": 3.21875, + "grad_norm_var": 0.029069010416666666, + "learning_rate": 0.0001, + "loss": 5.5506, + "loss/crossentropy": 2.2642308473587036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3315223157405853, + "step": 1254 + }, + { + "epoch": 0.02512, + "grad_norm": 3.46875, + "grad_norm_var": 0.028416951497395832, + "learning_rate": 0.0001, + "loss": 5.4608, + "loss/crossentropy": 2.2246991395950317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33083613216876984, + "step": 1256 + }, + { + "epoch": 0.02516, + "grad_norm": 3.1875, + "grad_norm_var": 0.0290435791015625, + "learning_rate": 0.0001, + "loss": 5.762, + "loss/crossentropy": 2.129785656929016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32672610878944397, + "step": 1258 + }, + { + "epoch": 0.0252, + "grad_norm": 3.125, + "grad_norm_var": 0.0243804931640625, + "learning_rate": 0.0001, + "loss": 5.7297, + "loss/crossentropy": 2.0835453271865845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3504233658313751, + "step": 1260 + }, + { + "epoch": 0.02524, + "grad_norm": 3.703125, + "grad_norm_var": 0.024637858072916668, + "learning_rate": 0.0001, + "loss": 5.6707, + "loss/crossentropy": 2.443893313407898, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3636191487312317, + "step": 1262 + }, + { + "epoch": 0.02528, + "grad_norm": 3.21875, + "grad_norm_var": 0.02197265625, + "learning_rate": 0.0001, + "loss": 5.6107, + "loss/crossentropy": 2.367433190345764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35734108090400696, + "step": 1264 + }, + { + "epoch": 0.02532, + "grad_norm": 3.1875, + "grad_norm_var": 0.02662353515625, + "learning_rate": 0.0001, + "loss": 5.049, + "loss/crossentropy": 1.8102024793624878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28676700592041016, + "step": 1266 + }, + { + "epoch": 0.02536, + "grad_norm": 3.0, + "grad_norm_var": 0.03738606770833333, + "learning_rate": 0.0001, + "loss": 5.3916, + "loss/crossentropy": 2.54524827003479, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3380637466907501, + "step": 1268 + }, + { + "epoch": 0.0254, + "grad_norm": 2.984375, + "grad_norm_var": 0.040913899739583336, + "learning_rate": 0.0001, + "loss": 5.6667, + "loss/crossentropy": 2.4164276123046875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36949749290943146, + "step": 1270 + }, + { + "epoch": 0.02544, + "grad_norm": 3.515625, + "grad_norm_var": 0.043745930989583334, + "learning_rate": 0.0001, + "loss": 5.4235, + "loss/crossentropy": 2.4335602521896362, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3608807325363159, + "step": 1272 + }, + { + "epoch": 0.02548, + "grad_norm": 3.1875, + "grad_norm_var": 0.0422760009765625, + "learning_rate": 0.0001, + "loss": 5.5858, + "loss/crossentropy": 2.2711308002471924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3631722033023834, + "step": 1274 + }, + { + "epoch": 0.02552, + "grad_norm": 3.140625, + "grad_norm_var": 0.0430816650390625, + "learning_rate": 0.0001, + "loss": 5.181, + "loss/crossentropy": 2.378043293952942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35628968477249146, + "step": 1276 + }, + { + "epoch": 0.02556, + "grad_norm": 3.1875, + "grad_norm_var": 0.02431640625, + "learning_rate": 0.0001, + "loss": 5.5721, + "loss/crossentropy": 1.8950039148330688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3105602264404297, + "step": 1278 + }, + { + "epoch": 0.0256, + "grad_norm": 2.96875, + "grad_norm_var": 0.026659138997395835, + "learning_rate": 0.0001, + "loss": 5.4649, + "loss/crossentropy": 1.8309656977653503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27458132803440094, + "step": 1280 + }, + { + "epoch": 0.02564, + "grad_norm": 3.40625, + "grad_norm_var": 0.034195963541666666, + "learning_rate": 0.0001, + "loss": 5.993, + "loss/crossentropy": 2.3949296474456787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37693680822849274, + "step": 1282 + }, + { + "epoch": 0.02568, + "grad_norm": 3.140625, + "grad_norm_var": 0.026725260416666667, + "learning_rate": 0.0001, + "loss": 5.6157, + "loss/crossentropy": 2.497879147529602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36076007783412933, + "step": 1284 + }, + { + "epoch": 0.02572, + "grad_norm": 2.953125, + "grad_norm_var": 0.027339680989583334, + "learning_rate": 0.0001, + "loss": 5.354, + "loss/crossentropy": 2.108432352542877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3115152269601822, + "step": 1286 + }, + { + "epoch": 0.02576, + "grad_norm": 3.171875, + "grad_norm_var": 0.0201568603515625, + "learning_rate": 0.0001, + "loss": 5.5424, + "loss/crossentropy": 2.079313635826111, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31575673818588257, + "step": 1288 + }, + { + "epoch": 0.0258, + "grad_norm": 3.15625, + "grad_norm_var": 0.020099894205729166, + "learning_rate": 0.0001, + "loss": 5.2594, + "loss/crossentropy": 2.3390332460403442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3293873071670532, + "step": 1290 + }, + { + "epoch": 0.02584, + "grad_norm": 3.15625, + "grad_norm_var": 0.019612630208333332, + "learning_rate": 0.0001, + "loss": 5.2895, + "loss/crossentropy": 2.180980920791626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30600421130657196, + "step": 1292 + }, + { + "epoch": 0.02588, + "grad_norm": 2.9375, + "grad_norm_var": 0.022858683268229166, + "learning_rate": 0.0001, + "loss": 5.2784, + "loss/crossentropy": 2.0647836327552795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3114248663187027, + "step": 1294 + }, + { + "epoch": 0.02592, + "grad_norm": 2.84375, + "grad_norm_var": 0.02958984375, + "learning_rate": 0.0001, + "loss": 5.5063, + "loss/crossentropy": 1.931971788406372, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32757866382598877, + "step": 1296 + }, + { + "epoch": 0.02596, + "grad_norm": 3.09375, + "grad_norm_var": 0.0202301025390625, + "learning_rate": 0.0001, + "loss": 5.6389, + "loss/crossentropy": 2.1180718541145325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3272206783294678, + "step": 1298 + }, + { + "epoch": 0.026, + "grad_norm": 3.4375, + "grad_norm_var": 0.024616495768229166, + "learning_rate": 0.0001, + "loss": 5.5069, + "loss/crossentropy": 1.8535473346710205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3100028932094574, + "step": 1300 + }, + { + "epoch": 0.02604, + "grad_norm": 4.5, + "grad_norm_var": 0.16031901041666666, + "learning_rate": 0.0001, + "loss": 5.574, + "loss/crossentropy": 1.9625197052955627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3074956685304642, + "step": 1302 + }, + { + "epoch": 0.02608, + "grad_norm": 3.25, + "grad_norm_var": 0.1673736572265625, + "learning_rate": 0.0001, + "loss": 5.3764, + "loss/crossentropy": 2.248521149158478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33484284579753876, + "step": 1304 + }, + { + "epoch": 0.02612, + "grad_norm": 2.921875, + "grad_norm_var": 0.1750640869140625, + "learning_rate": 0.0001, + "loss": 5.8318, + "loss/crossentropy": 2.6374051570892334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3604440838098526, + "step": 1306 + }, + { + "epoch": 0.02616, + "grad_norm": 4.0625, + "grad_norm_var": 0.22333984375, + "learning_rate": 0.0001, + "loss": 5.5151, + "loss/crossentropy": 2.3213003873825073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3659953773021698, + "step": 1308 + }, + { + "epoch": 0.0262, + "grad_norm": 3.734375, + "grad_norm_var": 0.21988525390625, + "learning_rate": 0.0001, + "loss": 5.9334, + "loss/crossentropy": 2.3527311086654663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3638792932033539, + "step": 1310 + }, + { + "epoch": 0.02624, + "grad_norm": 3.140625, + "grad_norm_var": 0.20551656087239584, + "learning_rate": 0.0001, + "loss": 5.4442, + "loss/crossentropy": 1.6319801807403564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29070258140563965, + "step": 1312 + }, + { + "epoch": 0.02628, + "grad_norm": 3.125, + "grad_norm_var": 0.215966796875, + "learning_rate": 0.0001, + "loss": 5.1864, + "loss/crossentropy": 1.986265480518341, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30515219271183014, + "step": 1314 + }, + { + "epoch": 0.02632, + "grad_norm": 2.828125, + "grad_norm_var": 0.23336181640625, + "learning_rate": 0.0001, + "loss": 5.5808, + "loss/crossentropy": 2.237283766269684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32876546680927277, + "step": 1316 + }, + { + "epoch": 0.02636, + "grad_norm": 3.859375, + "grad_norm_var": 0.13362223307291668, + "learning_rate": 0.0001, + "loss": 5.4868, + "loss/crossentropy": 2.215467691421509, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34449851512908936, + "step": 1318 + }, + { + "epoch": 0.0264, + "grad_norm": 3.03125, + "grad_norm_var": 0.12567952473958333, + "learning_rate": 0.0001, + "loss": 5.7427, + "loss/crossentropy": 2.264952063560486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.387825608253479, + "step": 1320 + }, + { + "epoch": 0.02644, + "grad_norm": 3.125, + "grad_norm_var": 0.12009989420572917, + "learning_rate": 0.0001, + "loss": 5.5614, + "loss/crossentropy": 2.0678945779800415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3228907287120819, + "step": 1322 + }, + { + "epoch": 0.02648, + "grad_norm": 4.40625, + "grad_norm_var": 0.212841796875, + "learning_rate": 0.0001, + "loss": 5.6259, + "loss/crossentropy": 2.1414765119552612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31144315004348755, + "step": 1324 + }, + { + "epoch": 0.02652, + "grad_norm": 4.03125, + "grad_norm_var": 0.23813374837239584, + "learning_rate": 0.0001, + "loss": 5.8487, + "loss/crossentropy": 2.3898890018463135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3721010088920593, + "step": 1326 + }, + { + "epoch": 0.02656, + "grad_norm": 3.421875, + "grad_norm_var": 0.23483784993489584, + "learning_rate": 0.0001, + "loss": 5.5417, + "loss/crossentropy": 2.404030442237854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36486634612083435, + "step": 1328 + }, + { + "epoch": 0.0266, + "grad_norm": 3.15625, + "grad_norm_var": 0.22004801432291668, + "learning_rate": 0.0001, + "loss": 5.7535, + "loss/crossentropy": 2.1436617970466614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3091920465230942, + "step": 1330 + }, + { + "epoch": 0.02664, + "grad_norm": 3.1875, + "grad_norm_var": 0.2074615478515625, + "learning_rate": 0.0001, + "loss": 5.2832, + "loss/crossentropy": 2.106055796146393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3344803601503372, + "step": 1332 + }, + { + "epoch": 0.02668, + "grad_norm": 3.640625, + "grad_norm_var": 0.19583333333333333, + "learning_rate": 0.0001, + "loss": 6.0091, + "loss/crossentropy": 2.465435266494751, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34129244089126587, + "step": 1334 + }, + { + "epoch": 0.02672, + "grad_norm": 3.21875, + "grad_norm_var": 0.18728841145833333, + "learning_rate": 0.0001, + "loss": 5.5569, + "loss/crossentropy": 1.9109253883361816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30202071368694305, + "step": 1336 + }, + { + "epoch": 0.02676, + "grad_norm": 3.0, + "grad_norm_var": 0.1890045166015625, + "learning_rate": 0.0001, + "loss": 5.7093, + "loss/crossentropy": 2.267784833908081, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32984335720539093, + "step": 1338 + }, + { + "epoch": 0.0268, + "grad_norm": 3.0625, + "grad_norm_var": 0.07266337076822917, + "learning_rate": 0.0001, + "loss": 5.4432, + "loss/crossentropy": 2.6131194829940796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3797858655452728, + "step": 1340 + }, + { + "epoch": 0.02684, + "grad_norm": 3.171875, + "grad_norm_var": 0.031891886393229166, + "learning_rate": 0.0001, + "loss": 5.7456, + "loss/crossentropy": 2.214663505554199, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.340796560049057, + "step": 1342 + }, + { + "epoch": 0.02688, + "grad_norm": 3.0, + "grad_norm_var": 0.03245340983072917, + "learning_rate": 0.0001, + "loss": 5.4205, + "loss/crossentropy": 2.0236783027648926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3101559728384018, + "step": 1344 + }, + { + "epoch": 0.02692, + "grad_norm": 3.078125, + "grad_norm_var": 0.030565388997395835, + "learning_rate": 0.0001, + "loss": 5.2671, + "loss/crossentropy": 2.3260135650634766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.339004784822464, + "step": 1346 + }, + { + "epoch": 0.02696, + "grad_norm": 2.96875, + "grad_norm_var": 0.0317047119140625, + "learning_rate": 0.0001, + "loss": 5.6529, + "loss/crossentropy": 2.177807927131653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3171197772026062, + "step": 1348 + }, + { + "epoch": 0.027, + "grad_norm": 3.375, + "grad_norm_var": 0.0202056884765625, + "learning_rate": 0.0001, + "loss": 5.4682, + "loss/crossentropy": 2.351730227470398, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34749266505241394, + "step": 1350 + }, + { + "epoch": 0.02704, + "grad_norm": 2.953125, + "grad_norm_var": 0.017867024739583334, + "learning_rate": 0.0001, + "loss": 5.3692, + "loss/crossentropy": 2.2959564924240112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3270048499107361, + "step": 1352 + }, + { + "epoch": 0.02708, + "grad_norm": 3.03125, + "grad_norm_var": 0.016890462239583334, + "learning_rate": 0.0001, + "loss": 5.5243, + "loss/crossentropy": 2.399070382118225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32505376636981964, + "step": 1354 + }, + { + "epoch": 0.02712, + "grad_norm": 3.046875, + "grad_norm_var": 0.01812744140625, + "learning_rate": 0.0001, + "loss": 5.3306, + "loss/crossentropy": 1.9084061980247498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3073619455099106, + "step": 1356 + }, + { + "epoch": 0.02716, + "grad_norm": 3.078125, + "grad_norm_var": 0.01441650390625, + "learning_rate": 0.0001, + "loss": 5.3942, + "loss/crossentropy": 2.1204254627227783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2885961979627609, + "step": 1358 + }, + { + "epoch": 0.0272, + "grad_norm": 3.28125, + "grad_norm_var": 0.014842732747395834, + "learning_rate": 0.0001, + "loss": 5.7313, + "loss/crossentropy": 2.0167239904403687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3026747703552246, + "step": 1360 + }, + { + "epoch": 0.02724, + "grad_norm": 3.171875, + "grad_norm_var": 0.014742024739583333, + "learning_rate": 0.0001, + "loss": 5.3987, + "loss/crossentropy": 1.9588357210159302, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2976878881454468, + "step": 1362 + }, + { + "epoch": 0.02728, + "grad_norm": 2.796875, + "grad_norm_var": 0.019449869791666668, + "learning_rate": 0.0001, + "loss": 5.1658, + "loss/crossentropy": 1.8169561624526978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2874959260225296, + "step": 1364 + }, + { + "epoch": 0.02732, + "grad_norm": 3.1875, + "grad_norm_var": 0.014354451497395834, + "learning_rate": 0.0001, + "loss": 5.5128, + "loss/crossentropy": 2.258527398109436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34180621802806854, + "step": 1366 + }, + { + "epoch": 0.02736, + "grad_norm": 3.09375, + "grad_norm_var": 0.017284138997395834, + "learning_rate": 0.0001, + "loss": 5.4374, + "loss/crossentropy": 2.1959571838378906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3124794214963913, + "step": 1368 + }, + { + "epoch": 0.0274, + "grad_norm": 2.875, + "grad_norm_var": 0.0198150634765625, + "learning_rate": 0.0001, + "loss": 5.2299, + "loss/crossentropy": 2.1830934286117554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3260872811079025, + "step": 1370 + }, + { + "epoch": 0.02744, + "grad_norm": 3.203125, + "grad_norm_var": 0.02017822265625, + "learning_rate": 0.0001, + "loss": 5.6831, + "loss/crossentropy": 2.411653518676758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3575899302959442, + "step": 1372 + }, + { + "epoch": 0.02748, + "grad_norm": 3.140625, + "grad_norm_var": 0.021512858072916665, + "learning_rate": 0.0001, + "loss": 5.3919, + "loss/crossentropy": 2.1585222482681274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30734311044216156, + "step": 1374 + }, + { + "epoch": 0.02752, + "grad_norm": 2.90625, + "grad_norm_var": 0.023824055989583332, + "learning_rate": 0.0001, + "loss": 5.37, + "loss/crossentropy": 2.3621217012405396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3373589664697647, + "step": 1376 + }, + { + "epoch": 0.02756, + "grad_norm": 3.109375, + "grad_norm_var": 0.023160807291666665, + "learning_rate": 0.0001, + "loss": 5.4712, + "loss/crossentropy": 2.1203317046165466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28425413370132446, + "step": 1378 + }, + { + "epoch": 0.0276, + "grad_norm": 3.296875, + "grad_norm_var": 0.0223297119140625, + "learning_rate": 0.0001, + "loss": 5.5438, + "loss/crossentropy": 2.2568705081939697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.296497106552124, + "step": 1380 + }, + { + "epoch": 0.02764, + "grad_norm": 3.5625, + "grad_norm_var": 0.03665364583333333, + "learning_rate": 0.0001, + "loss": 5.5613, + "loss/crossentropy": 2.260026216506958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34534430503845215, + "step": 1382 + }, + { + "epoch": 0.02768, + "grad_norm": 5.5625, + "grad_norm_var": 0.4100494384765625, + "learning_rate": 0.0001, + "loss": 5.6217, + "loss/crossentropy": 1.9400787949562073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32853779196739197, + "step": 1384 + }, + { + "epoch": 0.02772, + "grad_norm": 3.390625, + "grad_norm_var": 0.39205322265625, + "learning_rate": 0.0001, + "loss": 5.2009, + "loss/crossentropy": 2.168904423713684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3421178460121155, + "step": 1386 + }, + { + "epoch": 0.02776, + "grad_norm": 2.90625, + "grad_norm_var": 0.405419921875, + "learning_rate": 0.0001, + "loss": 5.3992, + "loss/crossentropy": 2.3474777936935425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31982940435409546, + "step": 1388 + }, + { + "epoch": 0.0278, + "grad_norm": 3.140625, + "grad_norm_var": 0.40615234375, + "learning_rate": 0.0001, + "loss": 5.5791, + "loss/crossentropy": 2.3416868448257446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3400905281305313, + "step": 1390 + }, + { + "epoch": 0.02784, + "grad_norm": 2.8125, + "grad_norm_var": 0.4003570556640625, + "learning_rate": 0.0001, + "loss": 5.4413, + "loss/crossentropy": 2.299672842025757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3143058717250824, + "step": 1392 + }, + { + "epoch": 0.02788, + "grad_norm": 3.15625, + "grad_norm_var": 0.3945058186848958, + "learning_rate": 0.0001, + "loss": 5.6231, + "loss/crossentropy": 2.3058812618255615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33790935575962067, + "step": 1394 + }, + { + "epoch": 0.02792, + "grad_norm": 3.03125, + "grad_norm_var": 0.3947987874348958, + "learning_rate": 0.0001, + "loss": 5.324, + "loss/crossentropy": 2.2137999534606934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33156970143318176, + "step": 1396 + }, + { + "epoch": 0.02796, + "grad_norm": 4.1875, + "grad_norm_var": 0.4427571614583333, + "learning_rate": 0.0001, + "loss": 5.4567, + "loss/crossentropy": 2.04184353351593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32263143360614777, + "step": 1398 + }, + { + "epoch": 0.028, + "grad_norm": 3.265625, + "grad_norm_var": 0.09589436848958334, + "learning_rate": 0.0001, + "loss": 5.339, + "loss/crossentropy": 2.0377472639083862, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3136487454175949, + "step": 1400 + }, + { + "epoch": 0.02804, + "grad_norm": 3.171875, + "grad_norm_var": 0.09228413899739583, + "learning_rate": 0.0001, + "loss": 5.5838, + "loss/crossentropy": 2.5366055965423584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3476262539625168, + "step": 1402 + }, + { + "epoch": 0.02808, + "grad_norm": 3.1875, + "grad_norm_var": 0.09063212076822917, + "learning_rate": 0.0001, + "loss": 5.2447, + "loss/crossentropy": 2.088012456893921, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31111815571784973, + "step": 1404 + }, + { + "epoch": 0.02812, + "grad_norm": 3.53125, + "grad_norm_var": 0.10530598958333333, + "learning_rate": 0.0001, + "loss": 5.7316, + "loss/crossentropy": 2.1750329732894897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3138856291770935, + "step": 1406 + }, + { + "epoch": 0.02816, + "grad_norm": 3.625, + "grad_norm_var": 0.09811909993489583, + "learning_rate": 0.0001, + "loss": 5.4708, + "loss/crossentropy": 1.977031648159027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3179774433374405, + "step": 1408 + }, + { + "epoch": 0.0282, + "grad_norm": 3.296875, + "grad_norm_var": 0.12561848958333333, + "learning_rate": 0.0001, + "loss": 5.3815, + "loss/crossentropy": 2.0594210028648376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32200056314468384, + "step": 1410 + }, + { + "epoch": 0.02824, + "grad_norm": 3.59375, + "grad_norm_var": 0.11237691243489584, + "learning_rate": 0.0001, + "loss": 5.443, + "loss/crossentropy": 2.3887473344802856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3303599953651428, + "step": 1412 + }, + { + "epoch": 0.02828, + "grad_norm": 3.453125, + "grad_norm_var": 0.06500244140625, + "learning_rate": 0.0001, + "loss": 5.5507, + "loss/crossentropy": 2.188898801803589, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3378629684448242, + "step": 1414 + }, + { + "epoch": 0.02832, + "grad_norm": 3.171875, + "grad_norm_var": 0.06982014973958334, + "learning_rate": 0.0001, + "loss": 5.458, + "loss/crossentropy": 1.981561303138733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2814937084913254, + "step": 1416 + }, + { + "epoch": 0.02836, + "grad_norm": 3.1875, + "grad_norm_var": 0.0756011962890625, + "learning_rate": 0.0001, + "loss": 5.271, + "loss/crossentropy": 2.2141716480255127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3429889380931854, + "step": 1418 + }, + { + "epoch": 0.0284, + "grad_norm": 3.1875, + "grad_norm_var": 0.0818267822265625, + "learning_rate": 0.0001, + "loss": 5.048, + "loss/crossentropy": 2.0720977783203125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30601558089256287, + "step": 1420 + }, + { + "epoch": 0.02844, + "grad_norm": 3.21875, + "grad_norm_var": 0.07434488932291666, + "learning_rate": 0.0001, + "loss": 5.6407, + "loss/crossentropy": 2.0516344904899597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3182393014431, + "step": 1422 + }, + { + "epoch": 0.02848, + "grad_norm": 3.28125, + "grad_norm_var": 0.070166015625, + "learning_rate": 0.0001, + "loss": 5.6532, + "loss/crossentropy": 2.161284327507019, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3383013904094696, + "step": 1424 + }, + { + "epoch": 0.02852, + "grad_norm": 3.1875, + "grad_norm_var": 0.03163960774739583, + "learning_rate": 0.0001, + "loss": 5.1594, + "loss/crossentropy": 1.9955796599388123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3169983923435211, + "step": 1426 + }, + { + "epoch": 0.02856, + "grad_norm": 2.984375, + "grad_norm_var": 0.023421223958333334, + "learning_rate": 0.0001, + "loss": 5.3951, + "loss/crossentropy": 2.1046979427337646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30594320595264435, + "step": 1428 + }, + { + "epoch": 0.0286, + "grad_norm": 2.90625, + "grad_norm_var": 0.020231119791666665, + "learning_rate": 0.0001, + "loss": 5.364, + "loss/crossentropy": 1.9611601829528809, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3039780706167221, + "step": 1430 + }, + { + "epoch": 0.02864, + "grad_norm": 2.9375, + "grad_norm_var": 0.021610514322916666, + "learning_rate": 0.0001, + "loss": 5.8823, + "loss/crossentropy": 2.256209373474121, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3315662145614624, + "step": 1432 + }, + { + "epoch": 0.02868, + "grad_norm": 2.9375, + "grad_norm_var": 0.030594889322916666, + "learning_rate": 0.0001, + "loss": 5.2821, + "loss/crossentropy": 2.150561034679413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33046241104602814, + "step": 1434 + }, + { + "epoch": 0.02872, + "grad_norm": 3.078125, + "grad_norm_var": 0.024898274739583334, + "learning_rate": 0.0001, + "loss": 5.3609, + "loss/crossentropy": 2.279554605484009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3621693551540375, + "step": 1436 + }, + { + "epoch": 0.02876, + "grad_norm": 3.015625, + "grad_norm_var": 0.0259918212890625, + "learning_rate": 0.0001, + "loss": 5.4323, + "loss/crossentropy": 2.0183790922164917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33361808955669403, + "step": 1438 + }, + { + "epoch": 0.0288, + "grad_norm": 4.15625, + "grad_norm_var": 0.7352203369140625, + "learning_rate": 0.0001, + "loss": 5.4781, + "loss/crossentropy": 1.964252531528473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31638333201408386, + "step": 1440 + }, + { + "epoch": 0.02884, + "grad_norm": 3.046875, + "grad_norm_var": 0.73902587890625, + "learning_rate": 0.0001, + "loss": 5.6907, + "loss/crossentropy": 1.9271634817123413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3408525586128235, + "step": 1442 + }, + { + "epoch": 0.02888, + "grad_norm": 2.953125, + "grad_norm_var": 0.7411417643229167, + "learning_rate": 0.0001, + "loss": 5.4869, + "loss/crossentropy": 2.4400887489318848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33455249667167664, + "step": 1444 + }, + { + "epoch": 0.02892, + "grad_norm": 3.078125, + "grad_norm_var": 0.7270904541015625, + "learning_rate": 0.0001, + "loss": 5.6179, + "loss/crossentropy": 2.465711832046509, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3528379648923874, + "step": 1446 + }, + { + "epoch": 0.02896, + "grad_norm": 3.328125, + "grad_norm_var": 0.7147288004557292, + "learning_rate": 0.0001, + "loss": 5.5125, + "loss/crossentropy": 2.269619941711426, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32898105680942535, + "step": 1448 + }, + { + "epoch": 0.029, + "grad_norm": 2.984375, + "grad_norm_var": 0.68287353515625, + "learning_rate": 0.0001, + "loss": 5.5775, + "loss/crossentropy": 2.0418076515197754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3170912265777588, + "step": 1450 + }, + { + "epoch": 0.02904, + "grad_norm": 3.15625, + "grad_norm_var": 0.6841756184895833, + "learning_rate": 0.0001, + "loss": 5.5964, + "loss/crossentropy": 2.291188359260559, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35694004595279694, + "step": 1452 + }, + { + "epoch": 0.02908, + "grad_norm": 2.75, + "grad_norm_var": 0.7253163655598959, + "learning_rate": 0.0001, + "loss": 5.1608, + "loss/crossentropy": 2.497802972793579, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3271156847476959, + "step": 1454 + }, + { + "epoch": 0.02912, + "grad_norm": 2.984375, + "grad_norm_var": 0.0334625244140625, + "learning_rate": 0.0001, + "loss": 5.1527, + "loss/crossentropy": 2.161794900894165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32509492337703705, + "step": 1456 + }, + { + "epoch": 0.02916, + "grad_norm": 3.3125, + "grad_norm_var": 0.03470052083333333, + "learning_rate": 0.0001, + "loss": 5.7939, + "loss/crossentropy": 2.632015347480774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36461199820041656, + "step": 1458 + }, + { + "epoch": 0.0292, + "grad_norm": 3.03125, + "grad_norm_var": 0.03372395833333333, + "learning_rate": 0.0001, + "loss": 5.6233, + "loss/crossentropy": 2.2478749752044678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3311367332935333, + "step": 1460 + }, + { + "epoch": 0.02924, + "grad_norm": 4.03125, + "grad_norm_var": 0.09263916015625, + "learning_rate": 0.0001, + "loss": 5.2528, + "loss/crossentropy": 1.8451723456382751, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31802159547805786, + "step": 1462 + }, + { + "epoch": 0.02928, + "grad_norm": 3.71875, + "grad_norm_var": 0.11772359212239583, + "learning_rate": 0.0001, + "loss": 5.6469, + "loss/crossentropy": 2.6684207916259766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35483625531196594, + "step": 1464 + }, + { + "epoch": 0.02932, + "grad_norm": 2.921875, + "grad_norm_var": 0.11543680826822916, + "learning_rate": 0.0001, + "loss": 5.2866, + "loss/crossentropy": 2.497614622116089, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3422156721353531, + "step": 1466 + }, + { + "epoch": 0.02936, + "grad_norm": 4.25, + "grad_norm_var": 0.19032796223958334, + "learning_rate": 0.0001, + "loss": 5.7424, + "loss/crossentropy": 2.750740170478821, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.377558171749115, + "step": 1468 + }, + { + "epoch": 0.0294, + "grad_norm": 2.8125, + "grad_norm_var": 0.17550455729166667, + "learning_rate": 0.0001, + "loss": 5.1741, + "loss/crossentropy": 1.9961607456207275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3166217654943466, + "step": 1470 + }, + { + "epoch": 0.02944, + "grad_norm": 3.359375, + "grad_norm_var": 0.1655914306640625, + "learning_rate": 0.0001, + "loss": 5.3114, + "loss/crossentropy": 2.0874768495559692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2912164479494095, + "step": 1472 + }, + { + "epoch": 0.02948, + "grad_norm": 2.953125, + "grad_norm_var": 0.174072265625, + "learning_rate": 0.0001, + "loss": 5.2102, + "loss/crossentropy": 2.112182080745697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29954925179481506, + "step": 1474 + }, + { + "epoch": 0.02952, + "grad_norm": 3.6875, + "grad_norm_var": 0.4074371337890625, + "learning_rate": 0.0001, + "loss": 5.7839, + "loss/crossentropy": 2.1319644451141357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3002544492483139, + "step": 1476 + }, + { + "epoch": 0.02956, + "grad_norm": 3.0625, + "grad_norm_var": 0.37743733723958334, + "learning_rate": 0.0001, + "loss": 5.8305, + "loss/crossentropy": 2.3029643297195435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3262677788734436, + "step": 1478 + }, + { + "epoch": 0.0296, + "grad_norm": 3.15625, + "grad_norm_var": 0.3826243082682292, + "learning_rate": 0.0001, + "loss": 5.6803, + "loss/crossentropy": 2.8598941564559937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38816460967063904, + "step": 1480 + }, + { + "epoch": 0.02964, + "grad_norm": 2.984375, + "grad_norm_var": 0.37844950358072915, + "learning_rate": 0.0001, + "loss": 5.2177, + "loss/crossentropy": 2.134063720703125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3128468096256256, + "step": 1482 + }, + { + "epoch": 0.02968, + "grad_norm": 3.125, + "grad_norm_var": 0.31961263020833336, + "learning_rate": 0.0001, + "loss": 5.5234, + "loss/crossentropy": 2.481287717819214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3209179639816284, + "step": 1484 + }, + { + "epoch": 0.02972, + "grad_norm": 3.3125, + "grad_norm_var": 0.30549723307291665, + "learning_rate": 0.0001, + "loss": 5.3571, + "loss/crossentropy": 2.0571895837783813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3121480941772461, + "step": 1486 + }, + { + "epoch": 0.02976, + "grad_norm": 3.203125, + "grad_norm_var": 0.30614827473958334, + "learning_rate": 0.0001, + "loss": 5.2725, + "loss/crossentropy": 2.1930073499679565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3031492233276367, + "step": 1488 + }, + { + "epoch": 0.0298, + "grad_norm": 3.046875, + "grad_norm_var": 0.30426025390625, + "learning_rate": 0.0001, + "loss": 5.3256, + "loss/crossentropy": 2.403902530670166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32206277549266815, + "step": 1490 + }, + { + "epoch": 0.02984, + "grad_norm": 2.9375, + "grad_norm_var": 0.0538482666015625, + "learning_rate": 0.0001, + "loss": 5.2859, + "loss/crossentropy": 2.131115198135376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31651052832603455, + "step": 1492 + }, + { + "epoch": 0.02988, + "grad_norm": 2.828125, + "grad_norm_var": 0.05826416015625, + "learning_rate": 0.0001, + "loss": 5.1232, + "loss/crossentropy": 2.024750769138336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3142934888601303, + "step": 1494 + }, + { + "epoch": 0.02992, + "grad_norm": 2.984375, + "grad_norm_var": 0.06279195149739583, + "learning_rate": 0.0001, + "loss": 5.3804, + "loss/crossentropy": 1.9606707692146301, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28610387444496155, + "step": 1496 + }, + { + "epoch": 0.02996, + "grad_norm": 3.15625, + "grad_norm_var": 0.062474568684895836, + "learning_rate": 0.0001, + "loss": 5.5731, + "loss/crossentropy": 2.31516432762146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32875190675258636, + "step": 1498 + }, + { + "epoch": 0.03, + "grad_norm": 3.359375, + "grad_norm_var": 0.0711822509765625, + "learning_rate": 0.0001, + "loss": 5.7626, + "loss/crossentropy": 2.295942783355713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3379499167203903, + "step": 1500 + }, + { + "epoch": 0.03004, + "grad_norm": 3.21875, + "grad_norm_var": 0.06961263020833333, + "learning_rate": 0.0001, + "loss": 5.5997, + "loss/crossentropy": 2.1859577894210815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32143887877464294, + "step": 1502 + }, + { + "epoch": 0.03008, + "grad_norm": 3.28125, + "grad_norm_var": 0.07023824055989583, + "learning_rate": 0.0001, + "loss": 5.4159, + "loss/crossentropy": 2.1852502822875977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3582807630300522, + "step": 1504 + }, + { + "epoch": 0.03012, + "grad_norm": 2.75, + "grad_norm_var": 0.07810770670572917, + "learning_rate": 0.0001, + "loss": 5.3011, + "loss/crossentropy": 2.174897611141205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33067959547042847, + "step": 1506 + }, + { + "epoch": 0.03016, + "grad_norm": 2.96875, + "grad_norm_var": 0.04057515462239583, + "learning_rate": 0.0001, + "loss": 5.2775, + "loss/crossentropy": 2.24343740940094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34563779830932617, + "step": 1508 + }, + { + "epoch": 0.0302, + "grad_norm": 2.90625, + "grad_norm_var": 0.043473307291666666, + "learning_rate": 0.0001, + "loss": 5.317, + "loss/crossentropy": 1.9828822612762451, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2887475937604904, + "step": 1510 + }, + { + "epoch": 0.03024, + "grad_norm": 3.0, + "grad_norm_var": 0.03655192057291667, + "learning_rate": 0.0001, + "loss": 5.3172, + "loss/crossentropy": 2.2110393047332764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32290786504745483, + "step": 1512 + }, + { + "epoch": 0.03028, + "grad_norm": 2.90625, + "grad_norm_var": 0.04527587890625, + "learning_rate": 0.0001, + "loss": 5.2598, + "loss/crossentropy": 2.3797603845596313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3262799382209778, + "step": 1514 + }, + { + "epoch": 0.03032, + "grad_norm": 3.1875, + "grad_norm_var": 0.0281158447265625, + "learning_rate": 0.0001, + "loss": 5.4107, + "loss/crossentropy": 2.0183085799217224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2915680408477783, + "step": 1516 + }, + { + "epoch": 0.03036, + "grad_norm": 3.78125, + "grad_norm_var": 0.9888631184895833, + "learning_rate": 0.0001, + "loss": 5.4632, + "loss/crossentropy": 1.875212013721466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30001460015773773, + "step": 1518 + }, + { + "epoch": 0.0304, + "grad_norm": 3.0625, + "grad_norm_var": 0.9916300455729167, + "learning_rate": 0.0001, + "loss": 5.4406, + "loss/crossentropy": 2.1000564098358154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.354188472032547, + "step": 1520 + }, + { + "epoch": 0.03044, + "grad_norm": 2.84375, + "grad_norm_var": 0.980126953125, + "learning_rate": 0.0001, + "loss": 5.4837, + "loss/crossentropy": 2.071643114089966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.333335280418396, + "step": 1522 + }, + { + "epoch": 0.03048, + "grad_norm": 3.203125, + "grad_norm_var": 0.9749664306640625, + "learning_rate": 0.0001, + "loss": 5.2716, + "loss/crossentropy": 2.4253947734832764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31365686655044556, + "step": 1524 + }, + { + "epoch": 0.03052, + "grad_norm": 3.0625, + "grad_norm_var": 0.9605133056640625, + "learning_rate": 0.0001, + "loss": 5.1601, + "loss/crossentropy": 1.967090904712677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30677899718284607, + "step": 1526 + }, + { + "epoch": 0.03056, + "grad_norm": 3.15625, + "grad_norm_var": 0.954443359375, + "learning_rate": 0.0001, + "loss": 5.0971, + "loss/crossentropy": 2.112701952457428, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3029911667108536, + "step": 1528 + }, + { + "epoch": 0.0306, + "grad_norm": 4.5625, + "grad_norm_var": 1.0084269205729166, + "learning_rate": 0.0001, + "loss": 5.6836, + "loss/crossentropy": 2.5657063722610474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3484763503074646, + "step": 1530 + }, + { + "epoch": 0.03064, + "grad_norm": 3.234375, + "grad_norm_var": 1.110497029622396, + "learning_rate": 0.0001, + "loss": 5.3888, + "loss/crossentropy": 2.1214585304260254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3243858218193054, + "step": 1532 + }, + { + "epoch": 0.03068, + "grad_norm": 3.234375, + "grad_norm_var": 0.35461832682291666, + "learning_rate": 0.0001, + "loss": 5.4662, + "loss/crossentropy": 2.427902936935425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3428474962711334, + "step": 1534 + }, + { + "epoch": 0.03072, + "grad_norm": 3.078125, + "grad_norm_var": 0.36741434733072914, + "learning_rate": 0.0001, + "loss": 5.2956, + "loss/crossentropy": 1.975690484046936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3136949688196182, + "step": 1536 + }, + { + "epoch": 0.03076, + "grad_norm": 2.734375, + "grad_norm_var": 0.38342692057291666, + "learning_rate": 0.0001, + "loss": 5.1233, + "loss/crossentropy": 2.295845150947571, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31703390181064606, + "step": 1538 + }, + { + "epoch": 0.0308, + "grad_norm": 3.171875, + "grad_norm_var": 0.37892252604166665, + "learning_rate": 0.0001, + "loss": 5.8286, + "loss/crossentropy": 2.117497444152832, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32601243257522583, + "step": 1540 + }, + { + "epoch": 0.03084, + "grad_norm": 3.140625, + "grad_norm_var": 0.49339192708333335, + "learning_rate": 0.0001, + "loss": 5.4065, + "loss/crossentropy": 2.3824862241744995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3332519829273224, + "step": 1542 + }, + { + "epoch": 0.03088, + "grad_norm": 3.140625, + "grad_norm_var": 0.4940582275390625, + "learning_rate": 0.0001, + "loss": 5.0672, + "loss/crossentropy": 1.9037857055664062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3143462985754013, + "step": 1544 + }, + { + "epoch": 0.03092, + "grad_norm": 3.09375, + "grad_norm_var": 0.4064737955729167, + "learning_rate": 0.0001, + "loss": 5.2328, + "loss/crossentropy": 1.9191133379936218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29443541169166565, + "step": 1546 + }, + { + "epoch": 0.03096, + "grad_norm": 2.734375, + "grad_norm_var": 0.28172200520833335, + "learning_rate": 0.0001, + "loss": 5.3402, + "loss/crossentropy": 2.216462254524231, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31411711871623993, + "step": 1548 + }, + { + "epoch": 0.031, + "grad_norm": 2.96875, + "grad_norm_var": 0.227685546875, + "learning_rate": 0.0001, + "loss": 5.0455, + "loss/crossentropy": 1.8064754605293274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2667318135499954, + "step": 1550 + }, + { + "epoch": 0.03104, + "grad_norm": 3.453125, + "grad_norm_var": 0.46340738932291664, + "learning_rate": 0.0001, + "loss": 5.5672, + "loss/crossentropy": 2.488176465034485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3422655761241913, + "step": 1552 + }, + { + "epoch": 0.03108, + "grad_norm": 2.859375, + "grad_norm_var": 0.4529205322265625, + "learning_rate": 0.0001, + "loss": 5.4404, + "loss/crossentropy": 2.5670164823532104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3462950587272644, + "step": 1554 + }, + { + "epoch": 0.03112, + "grad_norm": 3.25, + "grad_norm_var": 0.46499735514322915, + "learning_rate": 0.0001, + "loss": 5.0295, + "loss/crossentropy": 2.0630581378936768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29241877794265747, + "step": 1556 + }, + { + "epoch": 0.03116, + "grad_norm": 2.84375, + "grad_norm_var": 0.30891011555989584, + "learning_rate": 0.0001, + "loss": 5.4751, + "loss/crossentropy": 2.685954213142395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37143297493457794, + "step": 1558 + }, + { + "epoch": 0.0312, + "grad_norm": 3.421875, + "grad_norm_var": 0.311669921875, + "learning_rate": 0.0001, + "loss": 5.6995, + "loss/crossentropy": 1.9786988496780396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40276341140270233, + "step": 1560 + }, + { + "epoch": 0.03124, + "grad_norm": 3.0, + "grad_norm_var": 0.31383056640625, + "learning_rate": 0.0001, + "loss": 5.6695, + "loss/crossentropy": 2.1484411358833313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3264364004135132, + "step": 1562 + }, + { + "epoch": 0.03128, + "grad_norm": 2.875, + "grad_norm_var": 0.3103424072265625, + "learning_rate": 0.0001, + "loss": 5.3015, + "loss/crossentropy": 2.1411852836608887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3156583160161972, + "step": 1564 + }, + { + "epoch": 0.03132, + "grad_norm": 2.90625, + "grad_norm_var": 0.31302083333333336, + "learning_rate": 0.0001, + "loss": 5.2911, + "loss/crossentropy": 2.1509006023406982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3066476732492447, + "step": 1566 + }, + { + "epoch": 0.03136, + "grad_norm": 2.90625, + "grad_norm_var": 0.0416412353515625, + "learning_rate": 0.0001, + "loss": 5.1904, + "loss/crossentropy": 1.7540676593780518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2705047130584717, + "step": 1568 + }, + { + "epoch": 0.0314, + "grad_norm": 3.046875, + "grad_norm_var": 0.0431640625, + "learning_rate": 0.0001, + "loss": 4.9637, + "loss/crossentropy": 2.2091184854507446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2920738309621811, + "step": 1570 + }, + { + "epoch": 0.03144, + "grad_norm": 2.984375, + "grad_norm_var": 0.045735677083333336, + "learning_rate": 0.0001, + "loss": 5.6048, + "loss/crossentropy": 1.8405091762542725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2981649935245514, + "step": 1572 + }, + { + "epoch": 0.03148, + "grad_norm": 2.6875, + "grad_norm_var": 0.0420806884765625, + "learning_rate": 0.0001, + "loss": 5.4258, + "loss/crossentropy": 2.2292457818984985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30654460191726685, + "step": 1574 + }, + { + "epoch": 0.03152, + "grad_norm": 4.21875, + "grad_norm_var": 0.12868550618489583, + "learning_rate": 0.0001, + "loss": 5.8439, + "loss/crossentropy": 2.653907895088196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35303865373134613, + "step": 1576 + }, + { + "epoch": 0.03156, + "grad_norm": 3.234375, + "grad_norm_var": 0.13888346354166667, + "learning_rate": 0.0001, + "loss": 5.6145, + "loss/crossentropy": 2.6799376010894775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37383997440338135, + "step": 1578 + }, + { + "epoch": 0.0316, + "grad_norm": 2.828125, + "grad_norm_var": 0.14431864420572918, + "learning_rate": 0.0001, + "loss": 5.3321, + "loss/crossentropy": 2.3438535928726196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30706922709941864, + "step": 1580 + }, + { + "epoch": 0.03164, + "grad_norm": 2.96875, + "grad_norm_var": 0.14254150390625, + "learning_rate": 0.0001, + "loss": 5.2139, + "loss/crossentropy": 2.1885964274406433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30707718431949615, + "step": 1582 + }, + { + "epoch": 0.03168, + "grad_norm": 2.984375, + "grad_norm_var": 0.14042561848958332, + "learning_rate": 0.0001, + "loss": 5.1661, + "loss/crossentropy": 2.0471584796905518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31723180413246155, + "step": 1584 + }, + { + "epoch": 0.03172, + "grad_norm": 3.0, + "grad_norm_var": 0.1363433837890625, + "learning_rate": 0.0001, + "loss": 5.227, + "loss/crossentropy": 2.135176420211792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29839280247688293, + "step": 1586 + }, + { + "epoch": 0.03176, + "grad_norm": 3.3125, + "grad_norm_var": 0.13482666015625, + "learning_rate": 0.0001, + "loss": 5.6412, + "loss/crossentropy": 2.4444775581359863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33574284613132477, + "step": 1588 + }, + { + "epoch": 0.0318, + "grad_norm": 3.140625, + "grad_norm_var": 0.12195536295572916, + "learning_rate": 0.0001, + "loss": 5.4289, + "loss/crossentropy": 2.1725653409957886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.310588076710701, + "step": 1590 + }, + { + "epoch": 0.03184, + "grad_norm": 2.875, + "grad_norm_var": 0.04572652180989583, + "learning_rate": 0.0001, + "loss": 5.3727, + "loss/crossentropy": 2.3610929250717163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32158929109573364, + "step": 1592 + }, + { + "epoch": 0.03188, + "grad_norm": 2.84375, + "grad_norm_var": 0.0502105712890625, + "learning_rate": 0.0001, + "loss": 4.8794, + "loss/crossentropy": 1.9271156787872314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28283432126045227, + "step": 1594 + }, + { + "epoch": 0.03192, + "grad_norm": 3.203125, + "grad_norm_var": 0.04372456868489583, + "learning_rate": 0.0001, + "loss": 5.3912, + "loss/crossentropy": 2.4196890592575073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3463610112667084, + "step": 1596 + }, + { + "epoch": 0.03196, + "grad_norm": 2.71875, + "grad_norm_var": 0.0500152587890625, + "learning_rate": 0.0001, + "loss": 5.1524, + "loss/crossentropy": 2.207236647605896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33829018473625183, + "step": 1598 + }, + { + "epoch": 0.032, + "grad_norm": 3.0625, + "grad_norm_var": 0.04101155598958333, + "learning_rate": 0.0001, + "loss": 5.4724, + "loss/crossentropy": 2.3757678270339966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3342677056789398, + "step": 1600 + }, + { + "epoch": 0.03204, + "grad_norm": 2.921875, + "grad_norm_var": 0.04512430826822917, + "learning_rate": 0.0001, + "loss": 5.1763, + "loss/crossentropy": 2.2605016231536865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3062159866094589, + "step": 1602 + }, + { + "epoch": 0.03208, + "grad_norm": 3.359375, + "grad_norm_var": 0.04794921875, + "learning_rate": 0.0001, + "loss": 5.7139, + "loss/crossentropy": 2.4536768198013306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36668023467063904, + "step": 1604 + }, + { + "epoch": 0.03212, + "grad_norm": 3.09375, + "grad_norm_var": 0.0523834228515625, + "learning_rate": 0.0001, + "loss": 5.137, + "loss/crossentropy": 1.9870036244392395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2974477708339691, + "step": 1606 + }, + { + "epoch": 0.03216, + "grad_norm": 3.015625, + "grad_norm_var": 0.039876302083333336, + "learning_rate": 0.0001, + "loss": 5.3926, + "loss/crossentropy": 2.1852506399154663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3327452540397644, + "step": 1608 + }, + { + "epoch": 0.0322, + "grad_norm": 3.140625, + "grad_norm_var": 0.03128255208333333, + "learning_rate": 0.0001, + "loss": 5.4964, + "loss/crossentropy": 2.2226197719573975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3110218793153763, + "step": 1610 + }, + { + "epoch": 0.03224, + "grad_norm": 3.09375, + "grad_norm_var": 0.029523722330729165, + "learning_rate": 0.0001, + "loss": 5.398, + "loss/crossentropy": 1.8255922198295593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28216174244880676, + "step": 1612 + }, + { + "epoch": 0.03228, + "grad_norm": 2.96875, + "grad_norm_var": 0.023200480143229167, + "learning_rate": 0.0001, + "loss": 5.2777, + "loss/crossentropy": 1.9663920998573303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3095496743917465, + "step": 1614 + }, + { + "epoch": 0.03232, + "grad_norm": 2.953125, + "grad_norm_var": 0.023908487955729165, + "learning_rate": 0.0001, + "loss": 5.1578, + "loss/crossentropy": 2.2089942693710327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3307983875274658, + "step": 1616 + }, + { + "epoch": 0.03236, + "grad_norm": 3.03125, + "grad_norm_var": 0.01793212890625, + "learning_rate": 0.0001, + "loss": 5.3331, + "loss/crossentropy": 2.261039137840271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.314799427986145, + "step": 1618 + }, + { + "epoch": 0.0324, + "grad_norm": 2.859375, + "grad_norm_var": 0.015555826822916667, + "learning_rate": 0.0001, + "loss": 5.1674, + "loss/crossentropy": 2.350824236869812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3515756279230118, + "step": 1620 + }, + { + "epoch": 0.03244, + "grad_norm": 2.9375, + "grad_norm_var": 0.010749308268229167, + "learning_rate": 0.0001, + "loss": 5.4853, + "loss/crossentropy": 2.2964736223220825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3059935122728348, + "step": 1622 + }, + { + "epoch": 0.03248, + "grad_norm": 2.859375, + "grad_norm_var": 0.016597493489583334, + "learning_rate": 0.0001, + "loss": 5.2503, + "loss/crossentropy": 2.36459481716156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33763329684734344, + "step": 1624 + }, + { + "epoch": 0.03252, + "grad_norm": 2.984375, + "grad_norm_var": 0.0126617431640625, + "learning_rate": 0.0001, + "loss": 5.2242, + "loss/crossentropy": 2.165920853614807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31318019330501556, + "step": 1626 + }, + { + "epoch": 0.03256, + "grad_norm": 2.859375, + "grad_norm_var": 0.010628255208333333, + "learning_rate": 0.0001, + "loss": 5.2544, + "loss/crossentropy": 2.326790690422058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3044355511665344, + "step": 1628 + }, + { + "epoch": 0.0326, + "grad_norm": 2.96875, + "grad_norm_var": 0.01129150390625, + "learning_rate": 0.0001, + "loss": 5.3369, + "loss/crossentropy": 1.8848688006401062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.306812584400177, + "step": 1630 + }, + { + "epoch": 0.03264, + "grad_norm": 2.953125, + "grad_norm_var": 0.0134185791015625, + "learning_rate": 0.0001, + "loss": 5.351, + "loss/crossentropy": 2.045863091945648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3091462701559067, + "step": 1632 + }, + { + "epoch": 0.03268, + "grad_norm": 2.78125, + "grad_norm_var": 0.01148681640625, + "learning_rate": 0.0001, + "loss": 4.9742, + "loss/crossentropy": 2.0707273483276367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3038959503173828, + "step": 1634 + }, + { + "epoch": 0.03272, + "grad_norm": 2.859375, + "grad_norm_var": 0.011237589518229167, + "learning_rate": 0.0001, + "loss": 5.2076, + "loss/crossentropy": 2.0786932706832886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29513464868068695, + "step": 1636 + }, + { + "epoch": 0.03276, + "grad_norm": 3.203125, + "grad_norm_var": 0.01900634765625, + "learning_rate": 0.0001, + "loss": 5.4237, + "loss/crossentropy": 2.1527108550071716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32102274894714355, + "step": 1638 + }, + { + "epoch": 0.0328, + "grad_norm": 3.15625, + "grad_norm_var": 0.021361287434895834, + "learning_rate": 0.0001, + "loss": 5.126, + "loss/crossentropy": 2.0383081436157227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3253529220819473, + "step": 1640 + }, + { + "epoch": 0.03284, + "grad_norm": 2.734375, + "grad_norm_var": 0.021809895833333332, + "learning_rate": 0.0001, + "loss": 5.0775, + "loss/crossentropy": 2.2801902294158936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32200203835964203, + "step": 1642 + }, + { + "epoch": 0.03288, + "grad_norm": 2.953125, + "grad_norm_var": 0.022184244791666665, + "learning_rate": 0.0001, + "loss": 5.3035, + "loss/crossentropy": 2.164717435836792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2997436225414276, + "step": 1644 + }, + { + "epoch": 0.03292, + "grad_norm": 3.1875, + "grad_norm_var": 0.025191243489583334, + "learning_rate": 0.0001, + "loss": 5.5166, + "loss/crossentropy": 2.389414072036743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32998231053352356, + "step": 1646 + }, + { + "epoch": 0.03296, + "grad_norm": 2.921875, + "grad_norm_var": 0.023949178059895833, + "learning_rate": 0.0001, + "loss": 5.3225, + "loss/crossentropy": 2.09418523311615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33829881250858307, + "step": 1648 + }, + { + "epoch": 0.033, + "grad_norm": 3.109375, + "grad_norm_var": 0.021800740559895834, + "learning_rate": 0.0001, + "loss": 5.261, + "loss/crossentropy": 2.324030637741089, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30337512493133545, + "step": 1650 + }, + { + "epoch": 0.03304, + "grad_norm": 3.109375, + "grad_norm_var": 0.020992024739583334, + "learning_rate": 0.0001, + "loss": 5.4162, + "loss/crossentropy": 1.8635556101799011, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27077023684978485, + "step": 1652 + }, + { + "epoch": 0.03308, + "grad_norm": 2.8125, + "grad_norm_var": 0.0215972900390625, + "learning_rate": 0.0001, + "loss": 5.2371, + "loss/crossentropy": 2.1776190996170044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30546560883522034, + "step": 1654 + }, + { + "epoch": 0.03312, + "grad_norm": 3.1875, + "grad_norm_var": 0.021240234375, + "learning_rate": 0.0001, + "loss": 5.1721, + "loss/crossentropy": 2.1003682613372803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3304767310619354, + "step": 1656 + }, + { + "epoch": 0.03316, + "grad_norm": 3.203125, + "grad_norm_var": 0.02584228515625, + "learning_rate": 0.0001, + "loss": 5.8029, + "loss/crossentropy": 2.4331823587417603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3653264045715332, + "step": 1658 + }, + { + "epoch": 0.0332, + "grad_norm": 3.125, + "grad_norm_var": 0.1084381103515625, + "learning_rate": 0.0001, + "loss": 5.7157, + "loss/crossentropy": 1.9939777851104736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28948865830898285, + "step": 1660 + }, + { + "epoch": 0.03324, + "grad_norm": 2.9375, + "grad_norm_var": 0.10896809895833333, + "learning_rate": 0.0001, + "loss": 4.933, + "loss/crossentropy": 1.9093859791755676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3007328063249588, + "step": 1662 + }, + { + "epoch": 0.03328, + "grad_norm": 2.734375, + "grad_norm_var": 0.11998697916666666, + "learning_rate": 0.0001, + "loss": 5.1861, + "loss/crossentropy": 2.2847355604171753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2975463569164276, + "step": 1664 + }, + { + "epoch": 0.03332, + "grad_norm": 3.046875, + "grad_norm_var": 0.120166015625, + "learning_rate": 0.0001, + "loss": 5.3623, + "loss/crossentropy": 2.0081310868263245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28596948087215424, + "step": 1666 + }, + { + "epoch": 0.03336, + "grad_norm": 3.125, + "grad_norm_var": 0.11974283854166666, + "learning_rate": 0.0001, + "loss": 5.2338, + "loss/crossentropy": 2.189584493637085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30838510394096375, + "step": 1668 + }, + { + "epoch": 0.0334, + "grad_norm": 2.796875, + "grad_norm_var": 0.12802327473958333, + "learning_rate": 0.0001, + "loss": 5.2749, + "loss/crossentropy": 2.204169988632202, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3091500401496887, + "step": 1670 + }, + { + "epoch": 0.03344, + "grad_norm": 2.75, + "grad_norm_var": 0.1338775634765625, + "learning_rate": 0.0001, + "loss": 5.2755, + "loss/crossentropy": 2.195966899394989, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30603383481502533, + "step": 1672 + }, + { + "epoch": 0.03348, + "grad_norm": 3.0, + "grad_norm_var": 0.1251953125, + "learning_rate": 0.0001, + "loss": 5.8768, + "loss/crossentropy": 2.5402153730392456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3657945841550827, + "step": 1674 + }, + { + "epoch": 0.03352, + "grad_norm": 3.09375, + "grad_norm_var": 0.0277252197265625, + "learning_rate": 0.0001, + "loss": 5.5387, + "loss/crossentropy": 2.1721729040145874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29539716243743896, + "step": 1676 + }, + { + "epoch": 0.03356, + "grad_norm": 2.953125, + "grad_norm_var": 0.027253214518229166, + "learning_rate": 0.0001, + "loss": 5.2355, + "loss/crossentropy": 2.0591543912887573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29050062596797943, + "step": 1678 + }, + { + "epoch": 0.0336, + "grad_norm": 2.765625, + "grad_norm_var": 0.025846354166666665, + "learning_rate": 0.0001, + "loss": 5.4895, + "loss/crossentropy": 2.1396639347076416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3261077404022217, + "step": 1680 + }, + { + "epoch": 0.03364, + "grad_norm": 3.015625, + "grad_norm_var": 0.024689737955729166, + "learning_rate": 0.0001, + "loss": 5.2094, + "loss/crossentropy": 1.9553123712539673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3204822689294815, + "step": 1682 + }, + { + "epoch": 0.03368, + "grad_norm": 2.78125, + "grad_norm_var": 0.023746744791666666, + "learning_rate": 0.0001, + "loss": 5.3417, + "loss/crossentropy": 2.4139883518218994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32602658867836, + "step": 1684 + }, + { + "epoch": 0.03372, + "grad_norm": 2.90625, + "grad_norm_var": 0.019624837239583335, + "learning_rate": 0.0001, + "loss": 5.3479, + "loss/crossentropy": 1.9849293231964111, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32610756158828735, + "step": 1686 + }, + { + "epoch": 0.03376, + "grad_norm": 2.953125, + "grad_norm_var": 0.008622233072916667, + "learning_rate": 0.0001, + "loss": 5.6218, + "loss/crossentropy": 2.698970675468445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3749641329050064, + "step": 1688 + }, + { + "epoch": 0.0338, + "grad_norm": 3.0625, + "grad_norm_var": 0.01285400390625, + "learning_rate": 0.0001, + "loss": 5.7617, + "loss/crossentropy": 2.715620517730713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35715436935424805, + "step": 1690 + }, + { + "epoch": 0.03384, + "grad_norm": 3.0, + "grad_norm_var": 0.01201171875, + "learning_rate": 0.0001, + "loss": 5.5073, + "loss/crossentropy": 2.7213666439056396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34585659205913544, + "step": 1692 + }, + { + "epoch": 0.03388, + "grad_norm": 2.84375, + "grad_norm_var": 0.016044108072916667, + "learning_rate": 0.0001, + "loss": 5.2674, + "loss/crossentropy": 2.277606725692749, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3140410780906677, + "step": 1694 + }, + { + "epoch": 0.03392, + "grad_norm": 3.765625, + "grad_norm_var": 0.05364176432291667, + "learning_rate": 0.0001, + "loss": 5.2633, + "loss/crossentropy": 2.332197904586792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30997559428215027, + "step": 1696 + }, + { + "epoch": 0.03396, + "grad_norm": 2.953125, + "grad_norm_var": 0.05464579264322917, + "learning_rate": 0.0001, + "loss": 5.4323, + "loss/crossentropy": 2.4413230419158936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3075388967990875, + "step": 1698 + }, + { + "epoch": 0.034, + "grad_norm": 2.859375, + "grad_norm_var": 0.0567535400390625, + "learning_rate": 0.0001, + "loss": 5.1099, + "loss/crossentropy": 2.2601696252822876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3050367534160614, + "step": 1700 + }, + { + "epoch": 0.03404, + "grad_norm": 2.640625, + "grad_norm_var": 0.06572977701822917, + "learning_rate": 0.0001, + "loss": 4.9925, + "loss/crossentropy": 2.2910414934158325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3248990923166275, + "step": 1702 + }, + { + "epoch": 0.03408, + "grad_norm": 2.890625, + "grad_norm_var": 0.06843159993489584, + "learning_rate": 0.0001, + "loss": 5.3814, + "loss/crossentropy": 1.9898682832717896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27573561668395996, + "step": 1704 + }, + { + "epoch": 0.03412, + "grad_norm": 2.96875, + "grad_norm_var": 0.06398824055989584, + "learning_rate": 0.0001, + "loss": 5.1506, + "loss/crossentropy": 2.1234602332115173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30724021792411804, + "step": 1706 + }, + { + "epoch": 0.03416, + "grad_norm": 3.390625, + "grad_norm_var": 0.0764801025390625, + "learning_rate": 0.0001, + "loss": 5.5433, + "loss/crossentropy": 2.34807026386261, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32687780261039734, + "step": 1708 + }, + { + "epoch": 0.0342, + "grad_norm": 2.71875, + "grad_norm_var": 0.07669169108072917, + "learning_rate": 0.0001, + "loss": 5.1249, + "loss/crossentropy": 2.17264860868454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3086177706718445, + "step": 1710 + }, + { + "epoch": 0.03424, + "grad_norm": 2.96875, + "grad_norm_var": 0.0303131103515625, + "learning_rate": 0.0001, + "loss": 5.3613, + "loss/crossentropy": 2.1094497442245483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29483039677143097, + "step": 1712 + }, + { + "epoch": 0.03428, + "grad_norm": 2.71875, + "grad_norm_var": 0.03284505208333333, + "learning_rate": 0.0001, + "loss": 5.256, + "loss/crossentropy": 2.2379074692726135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2799292802810669, + "step": 1714 + }, + { + "epoch": 0.03432, + "grad_norm": 3.078125, + "grad_norm_var": 0.21389567057291667, + "learning_rate": 0.0001, + "loss": 5.5834, + "loss/crossentropy": 2.4616905450820923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33721986413002014, + "step": 1716 + }, + { + "epoch": 0.03436, + "grad_norm": 2.953125, + "grad_norm_var": 0.2042877197265625, + "learning_rate": 0.0001, + "loss": 5.4415, + "loss/crossentropy": 2.383226990699768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3242805302143097, + "step": 1718 + }, + { + "epoch": 0.0344, + "grad_norm": 2.921875, + "grad_norm_var": 0.19931538899739584, + "learning_rate": 0.0001, + "loss": 5.5514, + "loss/crossentropy": 2.495948314666748, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3131762146949768, + "step": 1720 + }, + { + "epoch": 0.03444, + "grad_norm": 3.109375, + "grad_norm_var": 0.19524739583333334, + "learning_rate": 0.0001, + "loss": 5.6767, + "loss/crossentropy": 2.1921653747558594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.320631667971611, + "step": 1722 + }, + { + "epoch": 0.03448, + "grad_norm": 3.40625, + "grad_norm_var": 0.20221354166666666, + "learning_rate": 0.0001, + "loss": 5.2, + "loss/crossentropy": 2.0795475840568542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29602357745170593, + "step": 1724 + }, + { + "epoch": 0.03452, + "grad_norm": 2.75, + "grad_norm_var": 0.201171875, + "learning_rate": 0.0001, + "loss": 4.7489, + "loss/crossentropy": 1.911207377910614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.287412166595459, + "step": 1726 + }, + { + "epoch": 0.03456, + "grad_norm": 2.96875, + "grad_norm_var": 0.20129801432291666, + "learning_rate": 0.0001, + "loss": 5.155, + "loss/crossentropy": 2.0638214349746704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32240423560142517, + "step": 1728 + }, + { + "epoch": 0.0346, + "grad_norm": 2.75, + "grad_norm_var": 0.20066630045572917, + "learning_rate": 0.0001, + "loss": 5.3464, + "loss/crossentropy": 2.355573534965515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3161381185054779, + "step": 1730 + }, + { + "epoch": 0.03464, + "grad_norm": 2.8125, + "grad_norm_var": 0.04185282389322917, + "learning_rate": 0.0001, + "loss": 5.1039, + "loss/crossentropy": 2.2227123975753784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31909704208374023, + "step": 1732 + }, + { + "epoch": 0.03468, + "grad_norm": 2.765625, + "grad_norm_var": 0.03954671223958333, + "learning_rate": 0.0001, + "loss": 5.2249, + "loss/crossentropy": 2.203602910041809, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3238847255706787, + "step": 1734 + }, + { + "epoch": 0.03472, + "grad_norm": 3.34375, + "grad_norm_var": 0.07062174479166666, + "learning_rate": 0.0001, + "loss": 5.0623, + "loss/crossentropy": 2.2696332335472107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3174774497747421, + "step": 1736 + }, + { + "epoch": 0.03476, + "grad_norm": 2.78125, + "grad_norm_var": 0.07385660807291666, + "learning_rate": 0.0001, + "loss": 5.0469, + "loss/crossentropy": 1.8124465942382812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2645348161458969, + "step": 1738 + }, + { + "epoch": 0.0348, + "grad_norm": 3.015625, + "grad_norm_var": 0.0578765869140625, + "learning_rate": 0.0001, + "loss": 5.589, + "loss/crossentropy": 2.1951464414596558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31606370210647583, + "step": 1740 + }, + { + "epoch": 0.03484, + "grad_norm": 3.109375, + "grad_norm_var": 0.05624593098958333, + "learning_rate": 0.0001, + "loss": 5.4867, + "loss/crossentropy": 2.4413230419158936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33688417077064514, + "step": 1742 + }, + { + "epoch": 0.03488, + "grad_norm": 3.09375, + "grad_norm_var": 0.05705973307291667, + "learning_rate": 0.0001, + "loss": 5.7898, + "loss/crossentropy": 2.1357219219207764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3476516157388687, + "step": 1744 + }, + { + "epoch": 0.03492, + "grad_norm": 2.859375, + "grad_norm_var": 0.0551666259765625, + "learning_rate": 0.0001, + "loss": 5.4465, + "loss/crossentropy": 2.1557592153549194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3080083876848221, + "step": 1746 + }, + { + "epoch": 0.03496, + "grad_norm": 2.859375, + "grad_norm_var": 0.0512603759765625, + "learning_rate": 0.0001, + "loss": 5.4949, + "loss/crossentropy": 2.3549705743789673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33814217150211334, + "step": 1748 + }, + { + "epoch": 0.035, + "grad_norm": 3.109375, + "grad_norm_var": 0.04348958333333333, + "learning_rate": 0.0001, + "loss": 5.5881, + "loss/crossentropy": 2.382844924926758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31944599747657776, + "step": 1750 + }, + { + "epoch": 0.03504, + "grad_norm": 2.765625, + "grad_norm_var": 0.02138671875, + "learning_rate": 0.0001, + "loss": 5.2937, + "loss/crossentropy": 2.3312920331954956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3346693813800812, + "step": 1752 + }, + { + "epoch": 0.03508, + "grad_norm": 3.015625, + "grad_norm_var": 0.019677734375, + "learning_rate": 0.0001, + "loss": 5.1981, + "loss/crossentropy": 2.1921491026878357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31133508682250977, + "step": 1754 + }, + { + "epoch": 0.03512, + "grad_norm": 2.71875, + "grad_norm_var": 0.024494425455729166, + "learning_rate": 0.0001, + "loss": 4.8796, + "loss/crossentropy": 2.0229611992836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26832816004753113, + "step": 1756 + }, + { + "epoch": 0.03516, + "grad_norm": 2.75, + "grad_norm_var": 0.025764973958333333, + "learning_rate": 0.0001, + "loss": 5.0322, + "loss/crossentropy": 1.8138108849525452, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2576165944337845, + "step": 1758 + }, + { + "epoch": 0.0352, + "grad_norm": 2.75, + "grad_norm_var": 0.0232574462890625, + "learning_rate": 0.0001, + "loss": 5.1276, + "loss/crossentropy": 2.0019126534461975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28094957768917084, + "step": 1760 + }, + { + "epoch": 0.03524, + "grad_norm": 2.703125, + "grad_norm_var": 0.022554524739583335, + "learning_rate": 0.0001, + "loss": 5.1776, + "loss/crossentropy": 2.400240898132324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30936548113822937, + "step": 1762 + }, + { + "epoch": 0.03528, + "grad_norm": 3.328125, + "grad_norm_var": 0.03495992024739583, + "learning_rate": 0.0001, + "loss": 4.9401, + "loss/crossentropy": 2.019958734512329, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27741560339927673, + "step": 1764 + }, + { + "epoch": 0.03532, + "grad_norm": 2.875, + "grad_norm_var": 0.025032552083333333, + "learning_rate": 0.0001, + "loss": 5.0912, + "loss/crossentropy": 1.9099596738815308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28769225627183914, + "step": 1766 + }, + { + "epoch": 0.03536, + "grad_norm": 2.890625, + "grad_norm_var": 0.024344889322916667, + "learning_rate": 0.0001, + "loss": 5.42, + "loss/crossentropy": 2.2508283853530884, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2945093661546707, + "step": 1768 + }, + { + "epoch": 0.0354, + "grad_norm": 2.828125, + "grad_norm_var": 0.0222320556640625, + "learning_rate": 0.0001, + "loss": 5.2516, + "loss/crossentropy": 1.9332409501075745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26304441690444946, + "step": 1770 + }, + { + "epoch": 0.03544, + "grad_norm": 2.875, + "grad_norm_var": 0.022557576497395832, + "learning_rate": 0.0001, + "loss": 5.1896, + "loss/crossentropy": 2.143627643585205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28407415747642517, + "step": 1772 + }, + { + "epoch": 0.03548, + "grad_norm": 2.859375, + "grad_norm_var": 0.029683430989583332, + "learning_rate": 0.0001, + "loss": 5.4763, + "loss/crossentropy": 2.32085120677948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3176119029521942, + "step": 1774 + }, + { + "epoch": 0.03552, + "grad_norm": 2.828125, + "grad_norm_var": 0.028539021809895832, + "learning_rate": 0.0001, + "loss": 5.1717, + "loss/crossentropy": 1.7893801927566528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27407801151275635, + "step": 1776 + }, + { + "epoch": 0.03556, + "grad_norm": 2.78125, + "grad_norm_var": 0.027228800455729167, + "learning_rate": 0.0001, + "loss": 5.42, + "loss/crossentropy": 2.206292986869812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3071517199277878, + "step": 1778 + }, + { + "epoch": 0.0356, + "grad_norm": 2.84375, + "grad_norm_var": 0.013263956705729166, + "learning_rate": 0.0001, + "loss": 5.1879, + "loss/crossentropy": 2.1285043954849243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3164139539003372, + "step": 1780 + }, + { + "epoch": 0.03564, + "grad_norm": 3.15625, + "grad_norm_var": 0.018635050455729166, + "learning_rate": 0.0001, + "loss": 5.2826, + "loss/crossentropy": 2.16570383310318, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3052050769329071, + "step": 1782 + }, + { + "epoch": 0.03568, + "grad_norm": 2.734375, + "grad_norm_var": 0.019950358072916667, + "learning_rate": 0.0001, + "loss": 5.0921, + "loss/crossentropy": 1.9799941778182983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2655785381793976, + "step": 1784 + }, + { + "epoch": 0.03572, + "grad_norm": 2.765625, + "grad_norm_var": 0.0209625244140625, + "learning_rate": 0.0001, + "loss": 5.2468, + "loss/crossentropy": 1.9801498651504517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2824363112449646, + "step": 1786 + }, + { + "epoch": 0.03576, + "grad_norm": 2.828125, + "grad_norm_var": 0.020947265625, + "learning_rate": 0.0001, + "loss": 5.0131, + "loss/crossentropy": 1.5805786848068237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25234321504831314, + "step": 1788 + }, + { + "epoch": 0.0358, + "grad_norm": 3.15625, + "grad_norm_var": 0.021484375, + "learning_rate": 0.0001, + "loss": 5.296, + "loss/crossentropy": 2.2434048652648926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2896551638841629, + "step": 1790 + }, + { + "epoch": 0.03584, + "grad_norm": 2.78125, + "grad_norm_var": 0.022411092122395834, + "learning_rate": 0.0001, + "loss": 5.0179, + "loss/crossentropy": 1.9738762378692627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2959328889846802, + "step": 1792 + }, + { + "epoch": 0.03588, + "grad_norm": 3.0625, + "grad_norm_var": 0.0272857666015625, + "learning_rate": 0.0001, + "loss": 5.2317, + "loss/crossentropy": 2.222583770751953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29915711283683777, + "step": 1794 + }, + { + "epoch": 0.03592, + "grad_norm": 2.8125, + "grad_norm_var": 0.032942708333333334, + "learning_rate": 0.0001, + "loss": 5.4192, + "loss/crossentropy": 2.188909649848938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.294817179441452, + "step": 1796 + }, + { + "epoch": 0.03596, + "grad_norm": 3.015625, + "grad_norm_var": 0.030907185872395833, + "learning_rate": 0.0001, + "loss": 5.6303, + "loss/crossentropy": 2.4745373725891113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32838208973407745, + "step": 1798 + }, + { + "epoch": 0.036, + "grad_norm": 2.890625, + "grad_norm_var": 0.028595987955729166, + "learning_rate": 0.0001, + "loss": 5.3466, + "loss/crossentropy": 1.9314215779304504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29102426767349243, + "step": 1800 + }, + { + "epoch": 0.03604, + "grad_norm": 2.796875, + "grad_norm_var": 0.026276652018229166, + "learning_rate": 0.0001, + "loss": 4.9782, + "loss/crossentropy": 2.0099900364875793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30170293152332306, + "step": 1802 + }, + { + "epoch": 0.03608, + "grad_norm": 2.859375, + "grad_norm_var": 0.024388631184895832, + "learning_rate": 0.0001, + "loss": 5.2506, + "loss/crossentropy": 2.1273564100265503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3019937574863434, + "step": 1804 + }, + { + "epoch": 0.03612, + "grad_norm": 2.96875, + "grad_norm_var": 0.016341145833333334, + "learning_rate": 0.0001, + "loss": 5.1003, + "loss/crossentropy": 2.065160095691681, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2817380279302597, + "step": 1806 + }, + { + "epoch": 0.03616, + "grad_norm": 2.828125, + "grad_norm_var": 0.018561808268229167, + "learning_rate": 0.0001, + "loss": 5.1569, + "loss/crossentropy": 2.262821078300476, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33245067298412323, + "step": 1808 + }, + { + "epoch": 0.0362, + "grad_norm": 2.625, + "grad_norm_var": 0.020003255208333334, + "learning_rate": 0.0001, + "loss": 4.8323, + "loss/crossentropy": 2.164163827896118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26008155941963196, + "step": 1810 + }, + { + "epoch": 0.03624, + "grad_norm": 2.78125, + "grad_norm_var": 0.012398274739583333, + "learning_rate": 0.0001, + "loss": 5.2517, + "loss/crossentropy": 2.147629737854004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3073730617761612, + "step": 1812 + }, + { + "epoch": 0.03628, + "grad_norm": 2.75, + "grad_norm_var": 0.023900349934895832, + "learning_rate": 0.0001, + "loss": 5.1255, + "loss/crossentropy": 2.04233980178833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2981158718466759, + "step": 1814 + }, + { + "epoch": 0.03632, + "grad_norm": 3.0, + "grad_norm_var": 0.026439412434895834, + "learning_rate": 0.0001, + "loss": 4.9465, + "loss/crossentropy": 2.264409065246582, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32124973833560944, + "step": 1816 + }, + { + "epoch": 0.03636, + "grad_norm": 3.109375, + "grad_norm_var": 0.033610026041666664, + "learning_rate": 0.0001, + "loss": 5.4324, + "loss/crossentropy": 2.092079997062683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3318764269351959, + "step": 1818 + }, + { + "epoch": 0.0364, + "grad_norm": 2.828125, + "grad_norm_var": 0.03850504557291667, + "learning_rate": 0.0001, + "loss": 5.2766, + "loss/crossentropy": 2.1007314324378967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2930498272180557, + "step": 1820 + }, + { + "epoch": 0.03644, + "grad_norm": 2.84375, + "grad_norm_var": 0.03697001139322917, + "learning_rate": 0.0001, + "loss": 5.129, + "loss/crossentropy": 2.2377375960350037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30473431944847107, + "step": 1822 + }, + { + "epoch": 0.03648, + "grad_norm": 3.296875, + "grad_norm_var": 0.04684244791666667, + "learning_rate": 0.0001, + "loss": 5.4209, + "loss/crossentropy": 2.0965787172317505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30526305735111237, + "step": 1824 + }, + { + "epoch": 0.03652, + "grad_norm": 3.078125, + "grad_norm_var": 0.04755859375, + "learning_rate": 0.0001, + "loss": 5.0359, + "loss/crossentropy": 2.208711862564087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2727830111980438, + "step": 1826 + }, + { + "epoch": 0.03656, + "grad_norm": 2.796875, + "grad_norm_var": 0.050633748372395836, + "learning_rate": 0.0001, + "loss": 5.1692, + "loss/crossentropy": 2.1706738471984863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3054092824459076, + "step": 1828 + }, + { + "epoch": 0.0366, + "grad_norm": 2.796875, + "grad_norm_var": 0.037262980143229166, + "learning_rate": 0.0001, + "loss": 5.2311, + "loss/crossentropy": 2.0891621112823486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3111531287431717, + "step": 1830 + }, + { + "epoch": 0.03664, + "grad_norm": 2.859375, + "grad_norm_var": 0.03560791015625, + "learning_rate": 0.0001, + "loss": 5.0537, + "loss/crossentropy": 2.0721256732940674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30227208137512207, + "step": 1832 + }, + { + "epoch": 0.03668, + "grad_norm": 3.015625, + "grad_norm_var": 0.03023681640625, + "learning_rate": 0.0001, + "loss": 5.1267, + "loss/crossentropy": 2.2015734910964966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3185647875070572, + "step": 1834 + }, + { + "epoch": 0.03672, + "grad_norm": 3.03125, + "grad_norm_var": 0.026493326822916666, + "learning_rate": 0.0001, + "loss": 5.1973, + "loss/crossentropy": 2.1985132694244385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3032165467739105, + "step": 1836 + }, + { + "epoch": 0.03676, + "grad_norm": 2.890625, + "grad_norm_var": 0.026334635416666665, + "learning_rate": 0.0001, + "loss": 5.2041, + "loss/crossentropy": 2.170067548751831, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3295029550790787, + "step": 1838 + }, + { + "epoch": 0.0368, + "grad_norm": 2.921875, + "grad_norm_var": 0.014615885416666667, + "learning_rate": 0.0001, + "loss": 5.0049, + "loss/crossentropy": 2.113592267036438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2775915116071701, + "step": 1840 + }, + { + "epoch": 0.03684, + "grad_norm": 2.78125, + "grad_norm_var": 0.0117828369140625, + "learning_rate": 0.0001, + "loss": 5.4221, + "loss/crossentropy": 2.1905024647712708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33297547698020935, + "step": 1842 + }, + { + "epoch": 0.03688, + "grad_norm": 2.828125, + "grad_norm_var": 0.012919108072916666, + "learning_rate": 0.0001, + "loss": 5.1859, + "loss/crossentropy": 2.252369999885559, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29375749826431274, + "step": 1844 + }, + { + "epoch": 0.03692, + "grad_norm": 3.015625, + "grad_norm_var": 0.020182291666666668, + "learning_rate": 0.0001, + "loss": 4.8942, + "loss/crossentropy": 1.7526759505271912, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2707225978374481, + "step": 1846 + }, + { + "epoch": 0.03696, + "grad_norm": 3.21875, + "grad_norm_var": 0.028880818684895834, + "learning_rate": 0.0001, + "loss": 5.6529, + "loss/crossentropy": 2.592544913291931, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3520146906375885, + "step": 1848 + }, + { + "epoch": 0.037, + "grad_norm": 2.8125, + "grad_norm_var": 0.0278961181640625, + "learning_rate": 0.0001, + "loss": 4.9816, + "loss/crossentropy": 1.8699345588684082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2852860391139984, + "step": 1850 + }, + { + "epoch": 0.03704, + "grad_norm": 2.640625, + "grad_norm_var": 0.03209228515625, + "learning_rate": 0.0001, + "loss": 5.1326, + "loss/crossentropy": 2.2219313383102417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3210798054933548, + "step": 1852 + }, + { + "epoch": 0.03708, + "grad_norm": 2.796875, + "grad_norm_var": 0.0327056884765625, + "learning_rate": 0.0001, + "loss": 5.2301, + "loss/crossentropy": 1.9926818013191223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26928839832544327, + "step": 1854 + }, + { + "epoch": 0.03712, + "grad_norm": 2.828125, + "grad_norm_var": 0.032515462239583334, + "learning_rate": 0.0001, + "loss": 5.0372, + "loss/crossentropy": 2.019917130470276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28046920895576477, + "step": 1856 + }, + { + "epoch": 0.03716, + "grad_norm": 4.84375, + "grad_norm_var": 0.2762858072916667, + "learning_rate": 0.0001, + "loss": 5.6297, + "loss/crossentropy": 2.2585690021514893, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30598941445350647, + "step": 1858 + }, + { + "epoch": 0.0372, + "grad_norm": 2.984375, + "grad_norm_var": 0.27327067057291665, + "learning_rate": 0.0001, + "loss": 5.3508, + "loss/crossentropy": 2.298324942588806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3494870364665985, + "step": 1860 + }, + { + "epoch": 0.03724, + "grad_norm": 3.125, + "grad_norm_var": 0.25745035807291666, + "learning_rate": 0.0001, + "loss": 5.54, + "loss/crossentropy": 2.430496573448181, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3227449208498001, + "step": 1862 + }, + { + "epoch": 0.03728, + "grad_norm": 2.953125, + "grad_norm_var": 0.26183980305989585, + "learning_rate": 0.0001, + "loss": 5.1876, + "loss/crossentropy": 2.090576171875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2790074646472931, + "step": 1864 + }, + { + "epoch": 0.03732, + "grad_norm": 3.0625, + "grad_norm_var": 0.25806884765625, + "learning_rate": 0.0001, + "loss": 5.1799, + "loss/crossentropy": 2.2794109582901, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3135898858308792, + "step": 1866 + }, + { + "epoch": 0.03736, + "grad_norm": 2.640625, + "grad_norm_var": 0.2597076416015625, + "learning_rate": 0.0001, + "loss": 4.9333, + "loss/crossentropy": 2.2433481216430664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27558377385139465, + "step": 1868 + }, + { + "epoch": 0.0374, + "grad_norm": 2.59375, + "grad_norm_var": 0.26806233723958334, + "learning_rate": 0.0001, + "loss": 5.292, + "loss/crossentropy": 2.3111730813980103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3172578364610672, + "step": 1870 + }, + { + "epoch": 0.03744, + "grad_norm": 2.921875, + "grad_norm_var": 0.270654296875, + "learning_rate": 0.0001, + "loss": 5.2364, + "loss/crossentropy": 2.0028095841407776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35644619166851044, + "step": 1872 + }, + { + "epoch": 0.03748, + "grad_norm": 2.828125, + "grad_norm_var": 0.04501546223958333, + "learning_rate": 0.0001, + "loss": 5.2183, + "loss/crossentropy": 2.347644329071045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3119680881500244, + "step": 1874 + }, + { + "epoch": 0.03752, + "grad_norm": 2.65625, + "grad_norm_var": 0.03474833170572917, + "learning_rate": 0.0001, + "loss": 5.0787, + "loss/crossentropy": 2.118954062461853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28975334763526917, + "step": 1876 + }, + { + "epoch": 0.03756, + "grad_norm": 3.0625, + "grad_norm_var": 0.033568318684895834, + "learning_rate": 0.0001, + "loss": 5.6649, + "loss/crossentropy": 2.4376548528671265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3358190506696701, + "step": 1878 + }, + { + "epoch": 0.0376, + "grad_norm": 3.28125, + "grad_norm_var": 0.042724609375, + "learning_rate": 0.0001, + "loss": 5.4924, + "loss/crossentropy": 2.5907636880874634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3234570771455765, + "step": 1880 + }, + { + "epoch": 0.03764, + "grad_norm": 2.859375, + "grad_norm_var": 0.04352925618489583, + "learning_rate": 0.0001, + "loss": 5.0298, + "loss/crossentropy": 1.574956238269806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26118964701890945, + "step": 1882 + }, + { + "epoch": 0.03768, + "grad_norm": 3.203125, + "grad_norm_var": 0.04599609375, + "learning_rate": 0.0001, + "loss": 5.3339, + "loss/crossentropy": 2.3571736812591553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.308853879570961, + "step": 1884 + }, + { + "epoch": 0.03772, + "grad_norm": 2.546875, + "grad_norm_var": 0.03877665201822917, + "learning_rate": 0.0001, + "loss": 5.0807, + "loss/crossentropy": 2.1266958117485046, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.307782918214798, + "step": 1886 + }, + { + "epoch": 0.03776, + "grad_norm": 3.0, + "grad_norm_var": 0.03876546223958333, + "learning_rate": 0.0001, + "loss": 5.5402, + "loss/crossentropy": 2.3529077768325806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32287272810935974, + "step": 1888 + }, + { + "epoch": 0.0378, + "grad_norm": 3.015625, + "grad_norm_var": 0.03905843098958333, + "learning_rate": 0.0001, + "loss": 5.4555, + "loss/crossentropy": 2.277345299720764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32319171726703644, + "step": 1890 + }, + { + "epoch": 0.03784, + "grad_norm": 2.859375, + "grad_norm_var": 0.035008748372395836, + "learning_rate": 0.0001, + "loss": 5.01, + "loss/crossentropy": 2.102940857410431, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.292771652340889, + "step": 1892 + }, + { + "epoch": 0.03788, + "grad_norm": 2.640625, + "grad_norm_var": 0.03998921712239583, + "learning_rate": 0.0001, + "loss": 5.0784, + "loss/crossentropy": 1.9744033813476562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2844446450471878, + "step": 1894 + }, + { + "epoch": 0.03792, + "grad_norm": 2.625, + "grad_norm_var": 0.03250325520833333, + "learning_rate": 0.0001, + "loss": 4.9859, + "loss/crossentropy": 1.8222022652626038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28611525893211365, + "step": 1896 + }, + { + "epoch": 0.03796, + "grad_norm": 2.96875, + "grad_norm_var": 0.03135477701822917, + "learning_rate": 0.0001, + "loss": 5.0704, + "loss/crossentropy": 2.1963966488838196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2988738566637039, + "step": 1898 + }, + { + "epoch": 0.038, + "grad_norm": 3.046875, + "grad_norm_var": 0.025927734375, + "learning_rate": 0.0001, + "loss": 5.1155, + "loss/crossentropy": 1.9982789754867554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.312205046415329, + "step": 1900 + }, + { + "epoch": 0.03804, + "grad_norm": 2.75, + "grad_norm_var": 0.020612589518229165, + "learning_rate": 0.0001, + "loss": 5.2097, + "loss/crossentropy": 2.1999258995056152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29916079342365265, + "step": 1902 + }, + { + "epoch": 0.03808, + "grad_norm": 2.859375, + "grad_norm_var": 0.016999308268229166, + "learning_rate": 0.0001, + "loss": 5.2233, + "loss/crossentropy": 2.0725532174110413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3093830645084381, + "step": 1904 + }, + { + "epoch": 0.03812, + "grad_norm": 3.03125, + "grad_norm_var": 0.027079264322916668, + "learning_rate": 0.0001, + "loss": 5.7014, + "loss/crossentropy": 2.2504276037216187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.332836389541626, + "step": 1906 + }, + { + "epoch": 0.03816, + "grad_norm": 2.875, + "grad_norm_var": 0.026927693684895834, + "learning_rate": 0.0001, + "loss": 5.3413, + "loss/crossentropy": 2.1570577025413513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3210139572620392, + "step": 1908 + }, + { + "epoch": 0.0382, + "grad_norm": 3.59375, + "grad_norm_var": 0.05137430826822917, + "learning_rate": 0.0001, + "loss": 5.6215, + "loss/crossentropy": 2.0739041566848755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3091907352209091, + "step": 1910 + }, + { + "epoch": 0.03824, + "grad_norm": 2.703125, + "grad_norm_var": 0.0502838134765625, + "learning_rate": 0.0001, + "loss": 5.0772, + "loss/crossentropy": 2.0542168021202087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2791624963283539, + "step": 1912 + }, + { + "epoch": 0.03828, + "grad_norm": 2.65625, + "grad_norm_var": 0.0561431884765625, + "learning_rate": 0.0001, + "loss": 4.9386, + "loss/crossentropy": 1.9705287218093872, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25483617186546326, + "step": 1914 + }, + { + "epoch": 0.03832, + "grad_norm": 2.875, + "grad_norm_var": 0.05819905598958333, + "learning_rate": 0.0001, + "loss": 5.054, + "loss/crossentropy": 2.0234111547470093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3110152333974838, + "step": 1916 + }, + { + "epoch": 0.03836, + "grad_norm": 3.15625, + "grad_norm_var": 0.06004130045572917, + "learning_rate": 0.0001, + "loss": 5.2723, + "loss/crossentropy": 2.010735273361206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2943515181541443, + "step": 1918 + }, + { + "epoch": 0.0384, + "grad_norm": 2.734375, + "grad_norm_var": 0.06575419108072916, + "learning_rate": 0.0001, + "loss": 4.9471, + "loss/crossentropy": 2.1912686824798584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27914971113204956, + "step": 1920 + }, + { + "epoch": 0.03844, + "grad_norm": 2.421875, + "grad_norm_var": 0.0709869384765625, + "learning_rate": 0.0001, + "loss": 5.085, + "loss/crossentropy": 1.9889940023422241, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26166442036628723, + "step": 1922 + }, + { + "epoch": 0.03848, + "grad_norm": 2.96875, + "grad_norm_var": 0.07668863932291667, + "learning_rate": 0.0001, + "loss": 5.3534, + "loss/crossentropy": 2.154898941516876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31905338168144226, + "step": 1924 + }, + { + "epoch": 0.03852, + "grad_norm": 2.796875, + "grad_norm_var": 0.03997395833333333, + "learning_rate": 0.0001, + "loss": 5.0847, + "loss/crossentropy": 2.44269061088562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28736811876296997, + "step": 1926 + }, + { + "epoch": 0.03856, + "grad_norm": 2.6875, + "grad_norm_var": 0.04108784993489583, + "learning_rate": 0.0001, + "loss": 5.0585, + "loss/crossentropy": 1.6790328621864319, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32607489824295044, + "step": 1928 + }, + { + "epoch": 0.0386, + "grad_norm": 3.046875, + "grad_norm_var": 0.05115559895833333, + "learning_rate": 0.0001, + "loss": 5.336, + "loss/crossentropy": 2.0223641991615295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2871186435222626, + "step": 1930 + }, + { + "epoch": 0.03864, + "grad_norm": 2.8125, + "grad_norm_var": 0.05756734212239583, + "learning_rate": 0.0001, + "loss": 5.549, + "loss/crossentropy": 2.451051712036133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33973076939582825, + "step": 1932 + }, + { + "epoch": 0.03868, + "grad_norm": 2.765625, + "grad_norm_var": 0.052262369791666666, + "learning_rate": 0.0001, + "loss": 5.4403, + "loss/crossentropy": 2.2884862422943115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29729554057121277, + "step": 1934 + }, + { + "epoch": 0.03872, + "grad_norm": 2.890625, + "grad_norm_var": 0.04889322916666667, + "learning_rate": 0.0001, + "loss": 5.6203, + "loss/crossentropy": 2.113345444202423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36945630609989166, + "step": 1936 + }, + { + "epoch": 0.03876, + "grad_norm": 2.71875, + "grad_norm_var": 0.03752848307291667, + "learning_rate": 0.0001, + "loss": 5.1845, + "loss/crossentropy": 2.139409840106964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2899101823568344, + "step": 1938 + }, + { + "epoch": 0.0388, + "grad_norm": 2.953125, + "grad_norm_var": 0.03622639973958333, + "learning_rate": 0.0001, + "loss": 5.5397, + "loss/crossentropy": 2.1029305458068848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29830390214920044, + "step": 1940 + }, + { + "epoch": 0.03884, + "grad_norm": 2.734375, + "grad_norm_var": 0.03816630045572917, + "learning_rate": 0.0001, + "loss": 5.1383, + "loss/crossentropy": 1.7736502885818481, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27648696303367615, + "step": 1942 + }, + { + "epoch": 0.03888, + "grad_norm": 2.671875, + "grad_norm_var": 0.03942057291666667, + "learning_rate": 0.0001, + "loss": 5.0885, + "loss/crossentropy": 2.0281469225883484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30329641699790955, + "step": 1944 + }, + { + "epoch": 0.03892, + "grad_norm": 2.890625, + "grad_norm_var": 0.03528544108072917, + "learning_rate": 0.0001, + "loss": 5.3265, + "loss/crossentropy": 2.404891610145569, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3563212752342224, + "step": 1946 + }, + { + "epoch": 0.03896, + "grad_norm": 2.53125, + "grad_norm_var": 0.03572489420572917, + "learning_rate": 0.0001, + "loss": 5.0657, + "loss/crossentropy": 2.2187922596931458, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2999056503176689, + "step": 1948 + }, + { + "epoch": 0.039, + "grad_norm": 2.875, + "grad_norm_var": 0.03566792805989583, + "learning_rate": 0.0001, + "loss": 5.0499, + "loss/crossentropy": 2.3901994228363037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3288661539554596, + "step": 1950 + }, + { + "epoch": 0.03904, + "grad_norm": 3.15625, + "grad_norm_var": 0.04069722493489583, + "learning_rate": 0.0001, + "loss": 5.252, + "loss/crossentropy": 2.274617314338684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2928028404712677, + "step": 1952 + }, + { + "epoch": 0.03908, + "grad_norm": 2.859375, + "grad_norm_var": 0.03430582682291667, + "learning_rate": 0.0001, + "loss": 5.4463, + "loss/crossentropy": 2.2478950023651123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3027127981185913, + "step": 1954 + }, + { + "epoch": 0.03912, + "grad_norm": 2.78125, + "grad_norm_var": 0.030321248372395835, + "learning_rate": 0.0001, + "loss": 5.2707, + "loss/crossentropy": 2.0634876489639282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29374830424785614, + "step": 1956 + }, + { + "epoch": 0.03916, + "grad_norm": 2.90625, + "grad_norm_var": 0.0287994384765625, + "learning_rate": 0.0001, + "loss": 5.3441, + "loss/crossentropy": 2.171326994895935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2995557487010956, + "step": 1958 + }, + { + "epoch": 0.0392, + "grad_norm": 2.515625, + "grad_norm_var": 0.040266927083333334, + "learning_rate": 0.0001, + "loss": 4.9937, + "loss/crossentropy": 2.142563223838806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27376608550548553, + "step": 1960 + }, + { + "epoch": 0.03924, + "grad_norm": 5.34375, + "grad_norm_var": 0.42568359375, + "learning_rate": 0.0001, + "loss": 5.4145, + "loss/crossentropy": 2.3189245462417603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37109237909317017, + "step": 1962 + }, + { + "epoch": 0.03928, + "grad_norm": 3.203125, + "grad_norm_var": 0.40812886555989586, + "learning_rate": 0.0001, + "loss": 5.1826, + "loss/crossentropy": 2.1483139991760254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32499393820762634, + "step": 1964 + }, + { + "epoch": 0.03932, + "grad_norm": 3.046875, + "grad_norm_var": 0.404736328125, + "learning_rate": 0.0001, + "loss": 5.445, + "loss/crossentropy": 2.2916383743286133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29367291927337646, + "step": 1966 + }, + { + "epoch": 0.03936, + "grad_norm": 2.9375, + "grad_norm_var": 0.3999582926432292, + "learning_rate": 0.0001, + "loss": 5.1663, + "loss/crossentropy": 2.4217371940612793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31309331953525543, + "step": 1968 + }, + { + "epoch": 0.0394, + "grad_norm": 2.9375, + "grad_norm_var": 0.40103759765625, + "learning_rate": 0.0001, + "loss": 5.2057, + "loss/crossentropy": 1.9491975903511047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29921913146972656, + "step": 1970 + }, + { + "epoch": 0.03944, + "grad_norm": 2.78125, + "grad_norm_var": 0.39126688639322915, + "learning_rate": 0.0001, + "loss": 5.1788, + "loss/crossentropy": 2.144432306289673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29162150621414185, + "step": 1972 + }, + { + "epoch": 0.03948, + "grad_norm": 2.765625, + "grad_norm_var": 0.40748291015625, + "learning_rate": 0.0001, + "loss": 5.1336, + "loss/crossentropy": 1.9492529034614563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2875414192676544, + "step": 1974 + }, + { + "epoch": 0.03952, + "grad_norm": 2.75, + "grad_norm_var": 0.4002593994140625, + "learning_rate": 0.0001, + "loss": 5.053, + "loss/crossentropy": 1.9269813895225525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2755381464958191, + "step": 1976 + }, + { + "epoch": 0.03956, + "grad_norm": 2.828125, + "grad_norm_var": 0.03798726399739583, + "learning_rate": 0.0001, + "loss": 4.8879, + "loss/crossentropy": 2.074360489845276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2987503558397293, + "step": 1978 + }, + { + "epoch": 0.0396, + "grad_norm": 2.953125, + "grad_norm_var": 0.02086181640625, + "learning_rate": 0.0001, + "loss": 4.8834, + "loss/crossentropy": 2.257633090019226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2992274910211563, + "step": 1980 + }, + { + "epoch": 0.03964, + "grad_norm": 2.75, + "grad_norm_var": 0.01802978515625, + "learning_rate": 0.0001, + "loss": 4.9533, + "loss/crossentropy": 1.8207083940505981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2752758115530014, + "step": 1982 + }, + { + "epoch": 0.03968, + "grad_norm": 2.875, + "grad_norm_var": 0.015623982747395833, + "learning_rate": 0.0001, + "loss": 5.343, + "loss/crossentropy": 2.105292797088623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3131226450204849, + "step": 1984 + }, + { + "epoch": 0.03972, + "grad_norm": 3.484375, + "grad_norm_var": 0.05191650390625, + "learning_rate": 0.0001, + "loss": 5.4785, + "loss/crossentropy": 2.1191373467445374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3175530731678009, + "step": 1986 + }, + { + "epoch": 0.03976, + "grad_norm": 2.75, + "grad_norm_var": 0.051878865559895834, + "learning_rate": 0.0001, + "loss": 4.9236, + "loss/crossentropy": 2.2214397192001343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3091724067926407, + "step": 1988 + }, + { + "epoch": 0.0398, + "grad_norm": 2.875, + "grad_norm_var": 0.05133056640625, + "learning_rate": 0.0001, + "loss": 5.0031, + "loss/crossentropy": 1.7347424626350403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2825440764427185, + "step": 1990 + }, + { + "epoch": 0.03984, + "grad_norm": 2.890625, + "grad_norm_var": 0.0500396728515625, + "learning_rate": 0.0001, + "loss": 5.0951, + "loss/crossentropy": 2.1566559076309204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2961925268173218, + "step": 1992 + }, + { + "epoch": 0.03988, + "grad_norm": 2.703125, + "grad_norm_var": 0.06396077473958334, + "learning_rate": 0.0001, + "loss": 4.9195, + "loss/crossentropy": 2.2129205465316772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29672613739967346, + "step": 1994 + }, + { + "epoch": 0.03992, + "grad_norm": 3.3125, + "grad_norm_var": 0.07595113118489584, + "learning_rate": 0.0001, + "loss": 5.7534, + "loss/crossentropy": 2.4702744483947754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3707122802734375, + "step": 1996 + }, + { + "epoch": 0.03996, + "grad_norm": 2.84375, + "grad_norm_var": 0.06731669108072917, + "learning_rate": 0.0001, + "loss": 5.0296, + "loss/crossentropy": 2.0463536977767944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3124735355377197, + "step": 1998 + }, + { + "epoch": 0.04, + "grad_norm": 2.796875, + "grad_norm_var": 0.07281494140625, + "learning_rate": 0.0001, + "loss": 4.959, + "loss/crossentropy": 2.1550235748291016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32177163660526276, + "step": 2000 + }, + { + "epoch": 0.04004, + "grad_norm": 2.71875, + "grad_norm_var": 0.059798177083333334, + "learning_rate": 0.0001, + "loss": 5.2078, + "loss/crossentropy": 2.1312190890312195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29013920575380325, + "step": 2002 + }, + { + "epoch": 0.04008, + "grad_norm": 2.75, + "grad_norm_var": 0.06082356770833333, + "learning_rate": 0.0001, + "loss": 5.0086, + "loss/crossentropy": 1.8546085357666016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2619543671607971, + "step": 2004 + }, + { + "epoch": 0.04012, + "grad_norm": 3.015625, + "grad_norm_var": 0.0561920166015625, + "learning_rate": 0.0001, + "loss": 5.3416, + "loss/crossentropy": 2.262398660182953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28742220997810364, + "step": 2006 + }, + { + "epoch": 0.04016, + "grad_norm": 3.109375, + "grad_norm_var": 0.05734049479166667, + "learning_rate": 0.0001, + "loss": 5.4315, + "loss/crossentropy": 2.156043767929077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30488699674606323, + "step": 2008 + }, + { + "epoch": 0.0402, + "grad_norm": 2.765625, + "grad_norm_var": 0.04290364583333333, + "learning_rate": 0.0001, + "loss": 4.8761, + "loss/crossentropy": 1.925516963005066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2795708477497101, + "step": 2010 + }, + { + "epoch": 0.04024, + "grad_norm": 2.6875, + "grad_norm_var": 0.031281534830729166, + "learning_rate": 0.0001, + "loss": 5.0729, + "loss/crossentropy": 1.947714388370514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27192793786525726, + "step": 2012 + }, + { + "epoch": 0.04028, + "grad_norm": 2.625, + "grad_norm_var": 0.038939412434895834, + "learning_rate": 0.0001, + "loss": 4.6214, + "loss/crossentropy": 1.9584010243415833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2835424840450287, + "step": 2014 + }, + { + "epoch": 0.04032, + "grad_norm": 2.578125, + "grad_norm_var": 0.04108784993489583, + "learning_rate": 0.0001, + "loss": 5.1974, + "loss/crossentropy": 2.461808919906616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3104698956012726, + "step": 2016 + }, + { + "epoch": 0.04036, + "grad_norm": 2.734375, + "grad_norm_var": 0.021903483072916667, + "learning_rate": 0.0001, + "loss": 5.286, + "loss/crossentropy": 2.094545900821686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30010756850242615, + "step": 2018 + }, + { + "epoch": 0.0404, + "grad_norm": 2.78125, + "grad_norm_var": 0.021923828125, + "learning_rate": 0.0001, + "loss": 5.1274, + "loss/crossentropy": 2.353589177131653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29775144159793854, + "step": 2020 + }, + { + "epoch": 0.04044, + "grad_norm": 3.375, + "grad_norm_var": 10.55537821451823, + "learning_rate": 0.0001, + "loss": 5.3359, + "loss/crossentropy": 2.4468252658843994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34534354507923126, + "step": 2022 + }, + { + "epoch": 0.04048, + "grad_norm": 2.984375, + "grad_norm_var": 10.529678344726562, + "learning_rate": 0.0001, + "loss": 5.5028, + "loss/crossentropy": 2.2037755250930786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.303710475564003, + "step": 2024 + }, + { + "epoch": 0.04052, + "grad_norm": 2.71875, + "grad_norm_var": 10.546240234375, + "learning_rate": 0.0001, + "loss": 4.9229, + "loss/crossentropy": 1.9658318161964417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2829178273677826, + "step": 2026 + }, + { + "epoch": 0.04056, + "grad_norm": 2.734375, + "grad_norm_var": 10.555729166666667, + "learning_rate": 0.0001, + "loss": 4.8996, + "loss/crossentropy": 2.118351697921753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29469528794288635, + "step": 2028 + }, + { + "epoch": 0.0406, + "grad_norm": 2.875, + "grad_norm_var": 10.507957967122396, + "learning_rate": 0.0001, + "loss": 5.5817, + "loss/crossentropy": 2.172826111316681, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35036201775074005, + "step": 2030 + }, + { + "epoch": 0.04064, + "grad_norm": 3.953125, + "grad_norm_var": 10.432124837239583, + "learning_rate": 0.0001, + "loss": 5.4846, + "loss/crossentropy": 2.185975730419159, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3397497236728668, + "step": 2032 + }, + { + "epoch": 0.04068, + "grad_norm": 2.78125, + "grad_norm_var": 10.411026000976562, + "learning_rate": 0.0001, + "loss": 5.0145, + "loss/crossentropy": 2.043874442577362, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3034070134162903, + "step": 2034 + }, + { + "epoch": 0.04072, + "grad_norm": 2.984375, + "grad_norm_var": 10.380106608072916, + "learning_rate": 0.0001, + "loss": 5.3958, + "loss/crossentropy": 2.3315287828445435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32198067009449005, + "step": 2036 + }, + { + "epoch": 0.04076, + "grad_norm": 2.828125, + "grad_norm_var": 0.1112213134765625, + "learning_rate": 0.0001, + "loss": 5.0266, + "loss/crossentropy": 2.05656898021698, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30919137597084045, + "step": 2038 + }, + { + "epoch": 0.0408, + "grad_norm": 2.578125, + "grad_norm_var": 0.10188395182291667, + "learning_rate": 0.0001, + "loss": 5.3205, + "loss/crossentropy": 2.2451635599136353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3412973880767822, + "step": 2040 + }, + { + "epoch": 0.04084, + "grad_norm": 3.1875, + "grad_norm_var": 0.10715738932291667, + "learning_rate": 0.0001, + "loss": 5.1734, + "loss/crossentropy": 2.527924060821533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3178148865699768, + "step": 2042 + }, + { + "epoch": 0.04088, + "grad_norm": 2.78125, + "grad_norm_var": 0.10305582682291667, + "learning_rate": 0.0001, + "loss": 4.8441, + "loss/crossentropy": 2.03126460313797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29549214243888855, + "step": 2044 + }, + { + "epoch": 0.04092, + "grad_norm": 2.515625, + "grad_norm_var": 0.11038004557291667, + "learning_rate": 0.0001, + "loss": 5.0134, + "loss/crossentropy": 2.029997706413269, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27938202023506165, + "step": 2046 + }, + { + "epoch": 0.04096, + "grad_norm": 2.703125, + "grad_norm_var": 0.03211263020833333, + "learning_rate": 0.0001, + "loss": 4.9321, + "loss/crossentropy": 1.764098048210144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27712512016296387, + "step": 2048 + }, + { + "epoch": 0.041, + "grad_norm": 3.046875, + "grad_norm_var": 0.0356597900390625, + "learning_rate": 0.0001, + "loss": 5.435, + "loss/crossentropy": 2.605324864387512, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3540754020214081, + "step": 2050 + }, + { + "epoch": 0.04104, + "grad_norm": 2.875, + "grad_norm_var": 0.0349761962890625, + "learning_rate": 0.0001, + "loss": 5.0207, + "loss/crossentropy": 1.9333613514900208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29670488089323044, + "step": 2052 + }, + { + "epoch": 0.04108, + "grad_norm": 3.078125, + "grad_norm_var": 0.040022786458333334, + "learning_rate": 0.0001, + "loss": 5.0056, + "loss/crossentropy": 1.7876797914505005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2579014301300049, + "step": 2054 + }, + { + "epoch": 0.04112, + "grad_norm": 3.125, + "grad_norm_var": 0.0417877197265625, + "learning_rate": 0.0001, + "loss": 5.1401, + "loss/crossentropy": 2.0947588682174683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3334304690361023, + "step": 2056 + }, + { + "epoch": 0.04116, + "grad_norm": 3.0, + "grad_norm_var": 0.035965983072916666, + "learning_rate": 0.0001, + "loss": 4.9009, + "loss/crossentropy": 2.1838767528533936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.306907519698143, + "step": 2058 + }, + { + "epoch": 0.0412, + "grad_norm": 3.015625, + "grad_norm_var": 0.0341461181640625, + "learning_rate": 0.0001, + "loss": 5.3724, + "loss/crossentropy": 2.2180997133255005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.314627081155777, + "step": 2060 + }, + { + "epoch": 0.04124, + "grad_norm": 2.875, + "grad_norm_var": 0.025419108072916665, + "learning_rate": 0.0001, + "loss": 4.8362, + "loss/crossentropy": 1.914646863937378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2848198413848877, + "step": 2062 + }, + { + "epoch": 0.04128, + "grad_norm": 2.609375, + "grad_norm_var": 0.0262603759765625, + "learning_rate": 0.0001, + "loss": 5.4538, + "loss/crossentropy": 2.42458713054657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31268230080604553, + "step": 2064 + }, + { + "epoch": 0.04132, + "grad_norm": 2.828125, + "grad_norm_var": 0.02457275390625, + "learning_rate": 0.0001, + "loss": 5.2497, + "loss/crossentropy": 2.23202121257782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30988384783267975, + "step": 2066 + }, + { + "epoch": 0.04136, + "grad_norm": 2.5625, + "grad_norm_var": 0.0284820556640625, + "learning_rate": 0.0001, + "loss": 5.0416, + "loss/crossentropy": 2.0225483179092407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26953594386577606, + "step": 2068 + }, + { + "epoch": 0.0414, + "grad_norm": 2.875, + "grad_norm_var": 0.028669230143229165, + "learning_rate": 0.0001, + "loss": 4.8259, + "loss/crossentropy": 1.8593338131904602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27706706523895264, + "step": 2070 + }, + { + "epoch": 0.04144, + "grad_norm": 2.59375, + "grad_norm_var": 0.023387654622395834, + "learning_rate": 0.0001, + "loss": 4.9949, + "loss/crossentropy": 2.373727560043335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3000074476003647, + "step": 2072 + }, + { + "epoch": 0.04148, + "grad_norm": 2.578125, + "grad_norm_var": 0.022704060872395834, + "learning_rate": 0.0001, + "loss": 4.9438, + "loss/crossentropy": 1.959564983844757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26365046203136444, + "step": 2074 + }, + { + "epoch": 0.04152, + "grad_norm": 2.921875, + "grad_norm_var": 0.014159138997395833, + "learning_rate": 0.0001, + "loss": 4.983, + "loss/crossentropy": 2.0590676069259644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2816864550113678, + "step": 2076 + }, + { + "epoch": 0.04156, + "grad_norm": 2.84375, + "grad_norm_var": 0.017121378580729166, + "learning_rate": 0.0001, + "loss": 5.2049, + "loss/crossentropy": 2.147680163383484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29019051790237427, + "step": 2078 + }, + { + "epoch": 0.0416, + "grad_norm": 2.609375, + "grad_norm_var": 0.020198567708333334, + "learning_rate": 0.0001, + "loss": 5.6103, + "loss/crossentropy": 2.212267220020294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2804659754037857, + "step": 2080 + }, + { + "epoch": 0.04164, + "grad_norm": 2.65625, + "grad_norm_var": 0.020406087239583332, + "learning_rate": 0.0001, + "loss": 5.0489, + "loss/crossentropy": 2.144743025302887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2946523129940033, + "step": 2082 + }, + { + "epoch": 0.04168, + "grad_norm": 2.703125, + "grad_norm_var": 0.019172159830729167, + "learning_rate": 0.0001, + "loss": 4.9311, + "loss/crossentropy": 2.3702481985092163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2923210561275482, + "step": 2084 + }, + { + "epoch": 0.04172, + "grad_norm": 2.96875, + "grad_norm_var": 0.019684855143229166, + "learning_rate": 0.0001, + "loss": 5.2291, + "loss/crossentropy": 1.9512975811958313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28704553842544556, + "step": 2086 + }, + { + "epoch": 0.04176, + "grad_norm": 3.25, + "grad_norm_var": 0.25388895670572914, + "learning_rate": 0.0001, + "loss": 4.9572, + "loss/crossentropy": 2.180745005607605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28361976146698, + "step": 2088 + }, + { + "epoch": 0.0418, + "grad_norm": 2.671875, + "grad_norm_var": 0.24712626139322916, + "learning_rate": 0.0001, + "loss": 4.8579, + "loss/crossentropy": 1.9768275022506714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28042787313461304, + "step": 2090 + }, + { + "epoch": 0.04184, + "grad_norm": 2.90625, + "grad_norm_var": 0.24544270833333334, + "learning_rate": 0.0001, + "loss": 5.2602, + "loss/crossentropy": 2.148472547531128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30099035799503326, + "step": 2092 + }, + { + "epoch": 0.04188, + "grad_norm": 2.734375, + "grad_norm_var": 0.24763081868489584, + "learning_rate": 0.0001, + "loss": 5.0152, + "loss/crossentropy": 2.1698715686798096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3140450567007065, + "step": 2094 + }, + { + "epoch": 0.04192, + "grad_norm": 2.828125, + "grad_norm_var": 0.24172770182291667, + "learning_rate": 0.0001, + "loss": 4.8679, + "loss/crossentropy": 2.1142334938049316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2847675681114197, + "step": 2096 + }, + { + "epoch": 0.04196, + "grad_norm": 2.703125, + "grad_norm_var": 0.2395660400390625, + "learning_rate": 0.0001, + "loss": 5.2185, + "loss/crossentropy": 2.1908479928970337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28702451288700104, + "step": 2098 + }, + { + "epoch": 0.042, + "grad_norm": 2.6875, + "grad_norm_var": 0.23321024576822916, + "learning_rate": 0.0001, + "loss": 5.0212, + "loss/crossentropy": 2.0519612431526184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29224735498428345, + "step": 2100 + }, + { + "epoch": 0.04204, + "grad_norm": 2.96875, + "grad_norm_var": 0.2412994384765625, + "learning_rate": 0.0001, + "loss": 4.871, + "loss/crossentropy": 1.9304961562156677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28785137832164764, + "step": 2102 + }, + { + "epoch": 0.04208, + "grad_norm": 2.765625, + "grad_norm_var": 0.0134185791015625, + "learning_rate": 0.0001, + "loss": 5.2462, + "loss/crossentropy": 2.297300934791565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.302143856883049, + "step": 2104 + }, + { + "epoch": 0.04212, + "grad_norm": 2.453125, + "grad_norm_var": 0.019782511393229167, + "learning_rate": 0.0001, + "loss": 5.0491, + "loss/crossentropy": 2.2764381170272827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28773219883441925, + "step": 2106 + }, + { + "epoch": 0.04216, + "grad_norm": 2.625, + "grad_norm_var": 0.019391886393229165, + "learning_rate": 0.0001, + "loss": 5.0563, + "loss/crossentropy": 2.141321837902069, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3108212947845459, + "step": 2108 + }, + { + "epoch": 0.0422, + "grad_norm": 2.71875, + "grad_norm_var": 0.0185699462890625, + "learning_rate": 0.0001, + "loss": 5.0362, + "loss/crossentropy": 1.9619495272636414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2938811331987381, + "step": 2110 + }, + { + "epoch": 0.04224, + "grad_norm": 3.015625, + "grad_norm_var": 0.026325480143229166, + "learning_rate": 0.0001, + "loss": 5.4496, + "loss/crossentropy": 1.9741051197052002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28835102915763855, + "step": 2112 + }, + { + "epoch": 0.04228, + "grad_norm": 2.546875, + "grad_norm_var": 0.029255167643229166, + "learning_rate": 0.0001, + "loss": 4.9303, + "loss/crossentropy": 1.9510936737060547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2839510589838028, + "step": 2114 + }, + { + "epoch": 0.04232, + "grad_norm": 2.828125, + "grad_norm_var": 0.026753743489583332, + "learning_rate": 0.0001, + "loss": 5.2446, + "loss/crossentropy": 2.0201885104179382, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30627067387104034, + "step": 2116 + }, + { + "epoch": 0.04236, + "grad_norm": 2.6875, + "grad_norm_var": 0.022493489583333335, + "learning_rate": 0.0001, + "loss": 5.1411, + "loss/crossentropy": 2.4522262811660767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.308579683303833, + "step": 2118 + }, + { + "epoch": 0.0424, + "grad_norm": 3.09375, + "grad_norm_var": 0.04345296223958333, + "learning_rate": 0.0001, + "loss": 5.5535, + "loss/crossentropy": 1.9289590120315552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29346515238285065, + "step": 2120 + }, + { + "epoch": 0.04244, + "grad_norm": 2.8125, + "grad_norm_var": 0.03437398274739583, + "learning_rate": 0.0001, + "loss": 5.0588, + "loss/crossentropy": 2.2020061016082764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2992282509803772, + "step": 2122 + }, + { + "epoch": 0.04248, + "grad_norm": 2.78125, + "grad_norm_var": 0.031494140625, + "learning_rate": 0.0001, + "loss": 5.2466, + "loss/crossentropy": 2.180301785469055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28323256969451904, + "step": 2124 + }, + { + "epoch": 0.04252, + "grad_norm": 2.78125, + "grad_norm_var": 0.03435872395833333, + "learning_rate": 0.0001, + "loss": 4.9061, + "loss/crossentropy": 2.1250513792037964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.272533118724823, + "step": 2126 + }, + { + "epoch": 0.04256, + "grad_norm": 2.78125, + "grad_norm_var": 0.037873331705729166, + "learning_rate": 0.0001, + "loss": 5.4375, + "loss/crossentropy": 2.3509981632232666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3719516545534134, + "step": 2128 + }, + { + "epoch": 0.0426, + "grad_norm": 3.078125, + "grad_norm_var": 0.03508707682291667, + "learning_rate": 0.0001, + "loss": 5.3067, + "loss/crossentropy": 2.135426163673401, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3582882583141327, + "step": 2130 + }, + { + "epoch": 0.04264, + "grad_norm": 2.578125, + "grad_norm_var": 0.0398590087890625, + "learning_rate": 0.0001, + "loss": 5.2406, + "loss/crossentropy": 2.316452383995056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30162203311920166, + "step": 2132 + }, + { + "epoch": 0.04268, + "grad_norm": 2.765625, + "grad_norm_var": 0.03846028645833333, + "learning_rate": 0.0001, + "loss": 5.0372, + "loss/crossentropy": 2.0325432419776917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.287256121635437, + "step": 2134 + }, + { + "epoch": 0.04272, + "grad_norm": 2.6875, + "grad_norm_var": 0.0236236572265625, + "learning_rate": 0.0001, + "loss": 5.1985, + "loss/crossentropy": 2.070056974887848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2552843391895294, + "step": 2136 + }, + { + "epoch": 0.04276, + "grad_norm": 3.078125, + "grad_norm_var": 0.029157511393229165, + "learning_rate": 0.0001, + "loss": 5.0623, + "loss/crossentropy": 1.7005944848060608, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24984879791736603, + "step": 2138 + }, + { + "epoch": 0.0428, + "grad_norm": 2.75, + "grad_norm_var": 0.03203125, + "learning_rate": 0.0001, + "loss": 5.0862, + "loss/crossentropy": 1.6700931787490845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2572527676820755, + "step": 2140 + }, + { + "epoch": 0.04284, + "grad_norm": 2.65625, + "grad_norm_var": 0.03125, + "learning_rate": 0.0001, + "loss": 5.0186, + "loss/crossentropy": 2.3074774742126465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31137382984161377, + "step": 2142 + }, + { + "epoch": 0.04288, + "grad_norm": 2.84375, + "grad_norm_var": 0.0229400634765625, + "learning_rate": 0.0001, + "loss": 5.1973, + "loss/crossentropy": 2.103408098220825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30157215893268585, + "step": 2144 + }, + { + "epoch": 0.04292, + "grad_norm": 2.796875, + "grad_norm_var": 0.020979817708333334, + "learning_rate": 0.0001, + "loss": 4.8206, + "loss/crossentropy": 1.8602584600448608, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28300634026527405, + "step": 2146 + }, + { + "epoch": 0.04296, + "grad_norm": 2.671875, + "grad_norm_var": 0.019188435872395833, + "learning_rate": 0.0001, + "loss": 5.0525, + "loss/crossentropy": 2.337582588195801, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28086431324481964, + "step": 2148 + }, + { + "epoch": 0.043, + "grad_norm": 2.796875, + "grad_norm_var": 0.026439412434895834, + "learning_rate": 0.0001, + "loss": 5.2405, + "loss/crossentropy": 2.2635254859924316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3113311231136322, + "step": 2150 + }, + { + "epoch": 0.04304, + "grad_norm": 2.96875, + "grad_norm_var": 0.0277984619140625, + "learning_rate": 0.0001, + "loss": 5.187, + "loss/crossentropy": 2.3971948623657227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32285284996032715, + "step": 2152 + }, + { + "epoch": 0.04308, + "grad_norm": 2.875, + "grad_norm_var": 0.021891276041666668, + "learning_rate": 0.0001, + "loss": 5.1438, + "loss/crossentropy": 1.8900776505470276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28163351118564606, + "step": 2154 + }, + { + "epoch": 0.04312, + "grad_norm": 2.765625, + "grad_norm_var": 0.020536295572916665, + "learning_rate": 0.0001, + "loss": 4.9774, + "loss/crossentropy": 1.908443808555603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2901918590068817, + "step": 2156 + }, + { + "epoch": 0.04316, + "grad_norm": 2.75, + "grad_norm_var": 0.020409138997395833, + "learning_rate": 0.0001, + "loss": 4.7808, + "loss/crossentropy": 1.8003268837928772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2762569487094879, + "step": 2158 + }, + { + "epoch": 0.0432, + "grad_norm": 2.640625, + "grad_norm_var": 0.0221832275390625, + "learning_rate": 0.0001, + "loss": 5.0477, + "loss/crossentropy": 1.996739387512207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24626458436250687, + "step": 2160 + }, + { + "epoch": 0.04324, + "grad_norm": 2.84375, + "grad_norm_var": 0.018880208333333332, + "learning_rate": 0.0001, + "loss": 5.1737, + "loss/crossentropy": 2.0175461173057556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31059183180332184, + "step": 2162 + }, + { + "epoch": 0.04328, + "grad_norm": 2.6875, + "grad_norm_var": 0.019266764322916668, + "learning_rate": 0.0001, + "loss": 5.0448, + "loss/crossentropy": 2.0009909868240356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2835536003112793, + "step": 2164 + }, + { + "epoch": 0.04332, + "grad_norm": 2.65625, + "grad_norm_var": 0.017967732747395833, + "learning_rate": 0.0001, + "loss": 4.9035, + "loss/crossentropy": 1.9848785400390625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26657119393348694, + "step": 2166 + }, + { + "epoch": 0.04336, + "grad_norm": 2.734375, + "grad_norm_var": 0.01207275390625, + "learning_rate": 0.0001, + "loss": 4.9108, + "loss/crossentropy": 2.076065957546234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2801993191242218, + "step": 2168 + }, + { + "epoch": 0.0434, + "grad_norm": 2.828125, + "grad_norm_var": 0.011617024739583334, + "learning_rate": 0.0001, + "loss": 5.1425, + "loss/crossentropy": 2.1208528876304626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3152369260787964, + "step": 2170 + }, + { + "epoch": 0.04344, + "grad_norm": 2.625, + "grad_norm_var": 0.011400349934895833, + "learning_rate": 0.0001, + "loss": 5.0608, + "loss/crossentropy": 2.1971306204795837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31046128273010254, + "step": 2172 + }, + { + "epoch": 0.04348, + "grad_norm": 2.734375, + "grad_norm_var": 0.013158162434895834, + "learning_rate": 0.0001, + "loss": 5.2445, + "loss/crossentropy": 2.275176525115967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3090529441833496, + "step": 2174 + }, + { + "epoch": 0.04352, + "grad_norm": 2.8125, + "grad_norm_var": 0.01197509765625, + "learning_rate": 0.0001, + "loss": 4.9366, + "loss/crossentropy": 2.1574501395225525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.280165433883667, + "step": 2176 + }, + { + "epoch": 0.04356, + "grad_norm": 2.65625, + "grad_norm_var": 0.012548828125, + "learning_rate": 0.0001, + "loss": 5.2338, + "loss/crossentropy": 2.4236754179000854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3061629384756088, + "step": 2178 + }, + { + "epoch": 0.0436, + "grad_norm": 2.78125, + "grad_norm_var": 0.012809244791666667, + "learning_rate": 0.0001, + "loss": 5.1707, + "loss/crossentropy": 2.1282758712768555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3050261586904526, + "step": 2180 + }, + { + "epoch": 0.04364, + "grad_norm": 2.828125, + "grad_norm_var": 0.008561197916666667, + "learning_rate": 0.0001, + "loss": 5.4629, + "loss/crossentropy": 2.4244707822799683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33464157581329346, + "step": 2182 + }, + { + "epoch": 0.04368, + "grad_norm": 2.640625, + "grad_norm_var": 0.010904947916666666, + "learning_rate": 0.0001, + "loss": 5.3891, + "loss/crossentropy": 2.289917469024658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3181813210248947, + "step": 2184 + }, + { + "epoch": 0.04372, + "grad_norm": 2.890625, + "grad_norm_var": 0.01031494140625, + "learning_rate": 0.0001, + "loss": 5.4207, + "loss/crossentropy": 2.1540024280548096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.298043891787529, + "step": 2186 + }, + { + "epoch": 0.04376, + "grad_norm": 2.609375, + "grad_norm_var": 0.0145660400390625, + "learning_rate": 0.0001, + "loss": 4.8595, + "loss/crossentropy": 1.6615915298461914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23485098034143448, + "step": 2188 + }, + { + "epoch": 0.0438, + "grad_norm": 2.796875, + "grad_norm_var": 0.01357421875, + "learning_rate": 0.0001, + "loss": 5.0595, + "loss/crossentropy": 2.352560341358185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2965056747198105, + "step": 2190 + }, + { + "epoch": 0.04384, + "grad_norm": 2.625, + "grad_norm_var": 0.0183746337890625, + "learning_rate": 0.0001, + "loss": 5.1463, + "loss/crossentropy": 2.0864007472991943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2859686613082886, + "step": 2192 + }, + { + "epoch": 0.04388, + "grad_norm": 2.765625, + "grad_norm_var": 0.016600545247395834, + "learning_rate": 0.0001, + "loss": 5.2599, + "loss/crossentropy": 1.8934992551803589, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24994677305221558, + "step": 2194 + }, + { + "epoch": 0.04392, + "grad_norm": 2.578125, + "grad_norm_var": 0.02476806640625, + "learning_rate": 0.0001, + "loss": 4.9976, + "loss/crossentropy": 2.2395824193954468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2719078063964844, + "step": 2196 + }, + { + "epoch": 0.04396, + "grad_norm": 2.84375, + "grad_norm_var": 0.02515869140625, + "learning_rate": 0.0001, + "loss": 5.2631, + "loss/crossentropy": 2.089230954647064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2759791761636734, + "step": 2198 + }, + { + "epoch": 0.044, + "grad_norm": 2.75, + "grad_norm_var": 0.023567708333333333, + "learning_rate": 0.0001, + "loss": 5.298, + "loss/crossentropy": 2.2770241498947144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31005042791366577, + "step": 2200 + }, + { + "epoch": 0.04404, + "grad_norm": 2.46875, + "grad_norm_var": 0.028938802083333333, + "learning_rate": 0.0001, + "loss": 4.6842, + "loss/crossentropy": 2.067028760910034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30712655186653137, + "step": 2202 + }, + { + "epoch": 0.04408, + "grad_norm": 2.46875, + "grad_norm_var": 0.026395670572916665, + "learning_rate": 0.0001, + "loss": 5.0557, + "loss/crossentropy": 2.4397774934768677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3508765548467636, + "step": 2204 + }, + { + "epoch": 0.04412, + "grad_norm": 2.890625, + "grad_norm_var": 0.028880818684895834, + "learning_rate": 0.0001, + "loss": 4.966, + "loss/crossentropy": 1.8136217594146729, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2495090439915657, + "step": 2206 + }, + { + "epoch": 0.04416, + "grad_norm": 2.71875, + "grad_norm_var": 0.022945149739583334, + "learning_rate": 0.0001, + "loss": 5.11, + "loss/crossentropy": 2.4620203971862793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31419822573661804, + "step": 2208 + }, + { + "epoch": 0.0442, + "grad_norm": 2.625, + "grad_norm_var": 0.020042928059895833, + "learning_rate": 0.0001, + "loss": 4.9756, + "loss/crossentropy": 1.8817986249923706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2705220878124237, + "step": 2210 + }, + { + "epoch": 0.04424, + "grad_norm": 3.125, + "grad_norm_var": 0.02769775390625, + "learning_rate": 0.0001, + "loss": 5.0069, + "loss/crossentropy": 1.9593598246574402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.262384794652462, + "step": 2212 + }, + { + "epoch": 0.04428, + "grad_norm": 2.484375, + "grad_norm_var": 0.03303120930989583, + "learning_rate": 0.0001, + "loss": 4.9133, + "loss/crossentropy": 2.1003851294517517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28965799510478973, + "step": 2214 + }, + { + "epoch": 0.04432, + "grad_norm": 3.46875, + "grad_norm_var": 0.07888081868489584, + "learning_rate": 0.0001, + "loss": 5.3243, + "loss/crossentropy": 2.23227858543396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3120953291654587, + "step": 2216 + }, + { + "epoch": 0.04436, + "grad_norm": 2.625, + "grad_norm_var": 0.0726226806640625, + "learning_rate": 0.0001, + "loss": 5.0901, + "loss/crossentropy": 1.8880399465560913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2740217447280884, + "step": 2218 + }, + { + "epoch": 0.0444, + "grad_norm": 2.5, + "grad_norm_var": 0.07111714680989584, + "learning_rate": 0.0001, + "loss": 4.9444, + "loss/crossentropy": 2.132355511188507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27536119520664215, + "step": 2220 + }, + { + "epoch": 0.04444, + "grad_norm": 2.578125, + "grad_norm_var": 0.07419331868489583, + "learning_rate": 0.0001, + "loss": 4.7325, + "loss/crossentropy": 1.831633746623993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26966987550258636, + "step": 2222 + }, + { + "epoch": 0.04448, + "grad_norm": 2.703125, + "grad_norm_var": 0.08056538899739583, + "learning_rate": 0.0001, + "loss": 5.004, + "loss/crossentropy": 2.066656529903412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27303647994995117, + "step": 2224 + }, + { + "epoch": 0.04452, + "grad_norm": 2.75, + "grad_norm_var": 0.0809234619140625, + "learning_rate": 0.0001, + "loss": 4.9265, + "loss/crossentropy": 2.1416667699813843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2781240791082382, + "step": 2226 + }, + { + "epoch": 0.04456, + "grad_norm": 2.671875, + "grad_norm_var": 0.07366536458333334, + "learning_rate": 0.0001, + "loss": 5.0701, + "loss/crossentropy": 1.7953566908836365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27528999745845795, + "step": 2228 + }, + { + "epoch": 0.0446, + "grad_norm": 2.84375, + "grad_norm_var": 0.06728515625, + "learning_rate": 0.0001, + "loss": 5.0182, + "loss/crossentropy": 2.1580333709716797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2998732179403305, + "step": 2230 + }, + { + "epoch": 0.04464, + "grad_norm": 2.9375, + "grad_norm_var": 0.034326171875, + "learning_rate": 0.0001, + "loss": 5.3619, + "loss/crossentropy": 2.1685701608657837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3457205891609192, + "step": 2232 + }, + { + "epoch": 0.04468, + "grad_norm": 2.578125, + "grad_norm_var": 0.03345438639322917, + "learning_rate": 0.0001, + "loss": 4.7602, + "loss/crossentropy": 1.9424286484718323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2728651314973831, + "step": 2234 + }, + { + "epoch": 0.04472, + "grad_norm": 2.765625, + "grad_norm_var": 0.029736328125, + "learning_rate": 0.0001, + "loss": 5.2351, + "loss/crossentropy": 2.2802772521972656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30141082406044006, + "step": 2236 + }, + { + "epoch": 0.04476, + "grad_norm": 2.515625, + "grad_norm_var": 0.03277587890625, + "learning_rate": 0.0001, + "loss": 4.8906, + "loss/crossentropy": 1.9490987062454224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2573640048503876, + "step": 2238 + }, + { + "epoch": 0.0448, + "grad_norm": 3.171875, + "grad_norm_var": 0.037679036458333336, + "learning_rate": 0.0001, + "loss": 5.2112, + "loss/crossentropy": 1.993924081325531, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25628305971622467, + "step": 2240 + }, + { + "epoch": 0.04484, + "grad_norm": 2.6875, + "grad_norm_var": 0.0359283447265625, + "learning_rate": 0.0001, + "loss": 5.268, + "loss/crossentropy": 2.5151875019073486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.326447993516922, + "step": 2242 + }, + { + "epoch": 0.04488, + "grad_norm": 2.609375, + "grad_norm_var": 0.04641520182291667, + "learning_rate": 0.0001, + "loss": 5.0191, + "loss/crossentropy": 2.5175565481185913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3036232739686966, + "step": 2244 + }, + { + "epoch": 0.04492, + "grad_norm": 2.546875, + "grad_norm_var": 0.05388997395833333, + "learning_rate": 0.0001, + "loss": 4.8489, + "loss/crossentropy": 2.020721971988678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28087201714515686, + "step": 2246 + }, + { + "epoch": 0.04496, + "grad_norm": 2.96875, + "grad_norm_var": 0.045703125, + "learning_rate": 0.0001, + "loss": 5.6809, + "loss/crossentropy": 2.4800511598587036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3100287467241287, + "step": 2248 + }, + { + "epoch": 0.045, + "grad_norm": 2.59375, + "grad_norm_var": 0.0449127197265625, + "learning_rate": 0.0001, + "loss": 4.9055, + "loss/crossentropy": 1.826172411441803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2782330811023712, + "step": 2250 + }, + { + "epoch": 0.04504, + "grad_norm": 2.828125, + "grad_norm_var": 0.04533589680989583, + "learning_rate": 0.0001, + "loss": 5.133, + "loss/crossentropy": 2.256316304206848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3070906698703766, + "step": 2252 + }, + { + "epoch": 0.04508, + "grad_norm": 2.765625, + "grad_norm_var": 0.041402180989583336, + "learning_rate": 0.0001, + "loss": 5.173, + "loss/crossentropy": 1.9046601057052612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2894355356693268, + "step": 2254 + }, + { + "epoch": 0.04512, + "grad_norm": 2.828125, + "grad_norm_var": 0.03288472493489583, + "learning_rate": 0.0001, + "loss": 5.0311, + "loss/crossentropy": 1.8359373211860657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2705196440219879, + "step": 2256 + }, + { + "epoch": 0.04516, + "grad_norm": 2.96875, + "grad_norm_var": 0.0349273681640625, + "learning_rate": 0.0001, + "loss": 4.717, + "loss/crossentropy": 2.096512258052826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26635921001434326, + "step": 2258 + }, + { + "epoch": 0.0452, + "grad_norm": 2.984375, + "grad_norm_var": 0.031371053059895834, + "learning_rate": 0.0001, + "loss": 5.6736, + "loss/crossentropy": 2.4621278047561646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3513137400150299, + "step": 2260 + }, + { + "epoch": 0.04524, + "grad_norm": 2.828125, + "grad_norm_var": 0.020442708333333334, + "learning_rate": 0.0001, + "loss": 5.1413, + "loss/crossentropy": 1.8345229029655457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2414976954460144, + "step": 2262 + }, + { + "epoch": 0.04528, + "grad_norm": 2.65625, + "grad_norm_var": 0.026432291666666666, + "learning_rate": 0.0001, + "loss": 4.9746, + "loss/crossentropy": 2.24505877494812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31794628500938416, + "step": 2264 + }, + { + "epoch": 0.04532, + "grad_norm": 2.609375, + "grad_norm_var": 0.0285552978515625, + "learning_rate": 0.0001, + "loss": 5.1691, + "loss/crossentropy": 2.2141382694244385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2925301343202591, + "step": 2266 + }, + { + "epoch": 0.04536, + "grad_norm": 3.1875, + "grad_norm_var": 0.03681233723958333, + "learning_rate": 0.0001, + "loss": 5.0559, + "loss/crossentropy": 2.1515613794326782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2843547910451889, + "step": 2268 + }, + { + "epoch": 0.0454, + "grad_norm": 2.765625, + "grad_norm_var": 0.03422749837239583, + "learning_rate": 0.0001, + "loss": 4.6899, + "loss/crossentropy": 2.1234883666038513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2981575280427933, + "step": 2270 + }, + { + "epoch": 0.04544, + "grad_norm": 2.65625, + "grad_norm_var": 0.03806864420572917, + "learning_rate": 0.0001, + "loss": 4.9871, + "loss/crossentropy": 2.1212490797042847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.276496559381485, + "step": 2272 + }, + { + "epoch": 0.04548, + "grad_norm": 2.921875, + "grad_norm_var": 0.03574930826822917, + "learning_rate": 0.0001, + "loss": 5.2925, + "loss/crossentropy": 2.4330636262893677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2815839499235153, + "step": 2274 + }, + { + "epoch": 0.04552, + "grad_norm": 2.65625, + "grad_norm_var": 0.028238932291666668, + "learning_rate": 0.0001, + "loss": 5.2874, + "loss/crossentropy": 2.110591411590576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2832919806241989, + "step": 2276 + }, + { + "epoch": 0.04556, + "grad_norm": 2.875, + "grad_norm_var": 0.20754801432291667, + "learning_rate": 0.0001, + "loss": 5.0303, + "loss/crossentropy": 2.231989800930023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28183089196681976, + "step": 2278 + }, + { + "epoch": 0.0456, + "grad_norm": 2.84375, + "grad_norm_var": 0.194873046875, + "learning_rate": 0.0001, + "loss": 5.3746, + "loss/crossentropy": 2.1275558471679688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29279497265815735, + "step": 2280 + }, + { + "epoch": 0.04564, + "grad_norm": 2.8125, + "grad_norm_var": 0.1891510009765625, + "learning_rate": 0.0001, + "loss": 5.2023, + "loss/crossentropy": 1.7988306283950806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2625636160373688, + "step": 2282 + }, + { + "epoch": 0.04568, + "grad_norm": 2.65625, + "grad_norm_var": 0.18694254557291667, + "learning_rate": 0.0001, + "loss": 5.2017, + "loss/crossentropy": 2.3405990600585938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30855217576026917, + "step": 2284 + }, + { + "epoch": 0.04572, + "grad_norm": 2.828125, + "grad_norm_var": 0.18612874348958333, + "learning_rate": 0.0001, + "loss": 5.4582, + "loss/crossentropy": 2.2062121629714966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30755001306533813, + "step": 2286 + }, + { + "epoch": 0.04576, + "grad_norm": 2.5625, + "grad_norm_var": 0.18968098958333332, + "learning_rate": 0.0001, + "loss": 4.8984, + "loss/crossentropy": 1.9439310431480408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26844222843647003, + "step": 2288 + }, + { + "epoch": 0.0458, + "grad_norm": 2.78125, + "grad_norm_var": 0.19010416666666666, + "learning_rate": 0.0001, + "loss": 5.2097, + "loss/crossentropy": 2.3106162548065186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30821681022644043, + "step": 2290 + }, + { + "epoch": 0.04584, + "grad_norm": 2.640625, + "grad_norm_var": 0.19246317545572916, + "learning_rate": 0.0001, + "loss": 5.1401, + "loss/crossentropy": 2.3809561729431152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3307010233402252, + "step": 2292 + }, + { + "epoch": 0.04588, + "grad_norm": 2.703125, + "grad_norm_var": 0.01103515625, + "learning_rate": 0.0001, + "loss": 5.4066, + "loss/crossentropy": 2.209702253341675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2795914113521576, + "step": 2294 + }, + { + "epoch": 0.04592, + "grad_norm": 2.5625, + "grad_norm_var": 0.0158111572265625, + "learning_rate": 0.0001, + "loss": 4.8772, + "loss/crossentropy": 2.4084372520446777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31208573281764984, + "step": 2296 + }, + { + "epoch": 0.04596, + "grad_norm": 2.6875, + "grad_norm_var": 0.0146392822265625, + "learning_rate": 0.0001, + "loss": 4.7757, + "loss/crossentropy": 1.9384723901748657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2625032365322113, + "step": 2298 + }, + { + "epoch": 0.046, + "grad_norm": 2.78125, + "grad_norm_var": 0.013704427083333333, + "learning_rate": 0.0001, + "loss": 5.2455, + "loss/crossentropy": 2.150592088699341, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.278359517455101, + "step": 2300 + }, + { + "epoch": 0.04604, + "grad_norm": 2.65625, + "grad_norm_var": 0.01256103515625, + "learning_rate": 0.0001, + "loss": 5.0788, + "loss/crossentropy": 1.8317970037460327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2927433103322983, + "step": 2302 + }, + { + "epoch": 0.04608, + "grad_norm": 3.09375, + "grad_norm_var": 0.029564412434895833, + "learning_rate": 0.0001, + "loss": 5.091, + "loss/crossentropy": 2.323367118835449, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2943577915430069, + "step": 2304 + }, + { + "epoch": 0.04612, + "grad_norm": 2.859375, + "grad_norm_var": 0.03943583170572917, + "learning_rate": 0.0001, + "loss": 5.5301, + "loss/crossentropy": 2.369907855987549, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2990037202835083, + "step": 2306 + }, + { + "epoch": 0.04616, + "grad_norm": 2.921875, + "grad_norm_var": 0.04127197265625, + "learning_rate": 0.0001, + "loss": 4.7508, + "loss/crossentropy": 1.691443145275116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27280642092227936, + "step": 2308 + }, + { + "epoch": 0.0462, + "grad_norm": 2.71875, + "grad_norm_var": 0.0464508056640625, + "learning_rate": 0.0001, + "loss": 4.9413, + "loss/crossentropy": 2.2883838415145874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3064710944890976, + "step": 2310 + }, + { + "epoch": 0.04624, + "grad_norm": 2.671875, + "grad_norm_var": 0.035380045572916664, + "learning_rate": 0.0001, + "loss": 5.4165, + "loss/crossentropy": 2.2042444944381714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3241504430770874, + "step": 2312 + }, + { + "epoch": 0.04628, + "grad_norm": 2.609375, + "grad_norm_var": 0.048680623372395836, + "learning_rate": 0.0001, + "loss": 4.6657, + "loss/crossentropy": 1.977793574333191, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2775610163807869, + "step": 2314 + }, + { + "epoch": 0.04632, + "grad_norm": 24.875, + "grad_norm_var": 30.570881144205728, + "learning_rate": 0.0001, + "loss": 5.8585, + "loss/crossentropy": 2.034530758857727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27170561254024506, + "step": 2316 + }, + { + "epoch": 0.04636, + "grad_norm": 2.875, + "grad_norm_var": 30.404881795247395, + "learning_rate": 0.0001, + "loss": 5.1565, + "loss/crossentropy": 2.439123511314392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2999258190393448, + "step": 2318 + }, + { + "epoch": 0.0464, + "grad_norm": 2.5, + "grad_norm_var": 30.530557250976564, + "learning_rate": 0.0001, + "loss": 4.8907, + "loss/crossentropy": 2.2600624561309814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2952795475721359, + "step": 2320 + }, + { + "epoch": 0.04644, + "grad_norm": 2.859375, + "grad_norm_var": 30.544131469726562, + "learning_rate": 0.0001, + "loss": 5.0521, + "loss/crossentropy": 2.144679367542267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28713342547416687, + "step": 2322 + }, + { + "epoch": 0.04648, + "grad_norm": 2.890625, + "grad_norm_var": 30.469155883789064, + "learning_rate": 0.0001, + "loss": 5.5054, + "loss/crossentropy": 2.34474778175354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.292633980512619, + "step": 2324 + }, + { + "epoch": 0.04652, + "grad_norm": 2.609375, + "grad_norm_var": 30.491536458333332, + "learning_rate": 0.0001, + "loss": 4.5903, + "loss/crossentropy": 1.96743243932724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.265610933303833, + "step": 2326 + }, + { + "epoch": 0.04656, + "grad_norm": 2.53125, + "grad_norm_var": 30.524051920572916, + "learning_rate": 0.0001, + "loss": 5.2353, + "loss/crossentropy": 2.3895785808563232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3287513107061386, + "step": 2328 + }, + { + "epoch": 0.0466, + "grad_norm": 2.984375, + "grad_norm_var": 30.446451822916668, + "learning_rate": 0.0001, + "loss": 4.9017, + "loss/crossentropy": 2.0607098937034607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30049796402454376, + "step": 2330 + }, + { + "epoch": 0.04664, + "grad_norm": 2.96875, + "grad_norm_var": 0.08322652180989583, + "learning_rate": 0.0001, + "loss": 5.1698, + "loss/crossentropy": 2.162124752998352, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2983020693063736, + "step": 2332 + }, + { + "epoch": 0.04668, + "grad_norm": 2.890625, + "grad_norm_var": 0.05139567057291667, + "learning_rate": 0.0001, + "loss": 5.2702, + "loss/crossentropy": 2.34970760345459, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3322293907403946, + "step": 2334 + }, + { + "epoch": 0.04672, + "grad_norm": 2.703125, + "grad_norm_var": 0.04527587890625, + "learning_rate": 0.0001, + "loss": 4.9763, + "loss/crossentropy": 1.9286972284317017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2752893418073654, + "step": 2336 + }, + { + "epoch": 0.04676, + "grad_norm": 2.828125, + "grad_norm_var": 0.04159749348958333, + "learning_rate": 0.0001, + "loss": 5.1677, + "loss/crossentropy": 2.182044267654419, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30808278918266296, + "step": 2338 + }, + { + "epoch": 0.0468, + "grad_norm": 2.96875, + "grad_norm_var": 0.03916727701822917, + "learning_rate": 0.0001, + "loss": 5.28, + "loss/crossentropy": 2.0002610087394714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2944178581237793, + "step": 2340 + }, + { + "epoch": 0.04684, + "grad_norm": 2.609375, + "grad_norm_var": 0.034830729166666664, + "learning_rate": 0.0001, + "loss": 4.984, + "loss/crossentropy": 2.0721842646598816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2769011855125427, + "step": 2342 + }, + { + "epoch": 0.04688, + "grad_norm": 2.625, + "grad_norm_var": 0.03178609212239583, + "learning_rate": 0.0001, + "loss": 5.0911, + "loss/crossentropy": 1.9710460305213928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28619086742401123, + "step": 2344 + }, + { + "epoch": 0.04692, + "grad_norm": 3.28125, + "grad_norm_var": 1.54605712890625, + "learning_rate": 0.0001, + "loss": 5.5092, + "loss/crossentropy": 2.0506762266159058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31705181300640106, + "step": 2346 + }, + { + "epoch": 0.04696, + "grad_norm": 2.609375, + "grad_norm_var": 1.5601064046223958, + "learning_rate": 0.0001, + "loss": 5.0045, + "loss/crossentropy": 2.0095930695533752, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2831149846315384, + "step": 2348 + }, + { + "epoch": 0.047, + "grad_norm": 2.890625, + "grad_norm_var": 1.573631795247396, + "learning_rate": 0.0001, + "loss": 5.19, + "loss/crossentropy": 2.023163616657257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2813292294740677, + "step": 2350 + }, + { + "epoch": 0.04704, + "grad_norm": 2.671875, + "grad_norm_var": 1.561424763997396, + "learning_rate": 0.0001, + "loss": 5.2907, + "loss/crossentropy": 2.230435371398926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30891451239585876, + "step": 2352 + }, + { + "epoch": 0.04708, + "grad_norm": 3.0625, + "grad_norm_var": 1.5700154622395834, + "learning_rate": 0.0001, + "loss": 4.9261, + "loss/crossentropy": 2.1553521156311035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29839709401130676, + "step": 2354 + }, + { + "epoch": 0.04712, + "grad_norm": 2.96875, + "grad_norm_var": 1.5770792643229166, + "learning_rate": 0.0001, + "loss": 5.2553, + "loss/crossentropy": 2.175648272037506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2972148358821869, + "step": 2356 + }, + { + "epoch": 0.04716, + "grad_norm": 2.703125, + "grad_norm_var": 1.5980377197265625, + "learning_rate": 0.0001, + "loss": 5.0436, + "loss/crossentropy": 2.3852503299713135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3166612535715103, + "step": 2358 + }, + { + "epoch": 0.0472, + "grad_norm": 2.75, + "grad_norm_var": 1.5826456705729166, + "learning_rate": 0.0001, + "loss": 5.1827, + "loss/crossentropy": 2.1905999183654785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29487256705760956, + "step": 2360 + }, + { + "epoch": 0.04724, + "grad_norm": 2.5625, + "grad_norm_var": 0.0413482666015625, + "learning_rate": 0.0001, + "loss": 4.9295, + "loss/crossentropy": 1.9224759340286255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27821336686611176, + "step": 2362 + }, + { + "epoch": 0.04728, + "grad_norm": 3.109375, + "grad_norm_var": 0.04057515462239583, + "learning_rate": 0.0001, + "loss": 5.1078, + "loss/crossentropy": 2.5025261640548706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3345927745103836, + "step": 2364 + }, + { + "epoch": 0.04732, + "grad_norm": 2.640625, + "grad_norm_var": 0.04006245930989583, + "learning_rate": 0.0001, + "loss": 5.0502, + "loss/crossentropy": 2.2385451793670654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27095621824264526, + "step": 2366 + }, + { + "epoch": 0.04736, + "grad_norm": 4.5, + "grad_norm_var": 0.22924702962239582, + "learning_rate": 0.0001, + "loss": 5.2761, + "loss/crossentropy": 2.0266553163528442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2856762409210205, + "step": 2368 + }, + { + "epoch": 0.0474, + "grad_norm": 3.359375, + "grad_norm_var": 0.24001363118489583, + "learning_rate": 0.0001, + "loss": 5.4918, + "loss/crossentropy": 2.5139355659484863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3186872750520706, + "step": 2370 + }, + { + "epoch": 0.04744, + "grad_norm": 2.609375, + "grad_norm_var": 0.24321187337239583, + "learning_rate": 0.0001, + "loss": 5.1302, + "loss/crossentropy": 2.066476881504059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2889470160007477, + "step": 2372 + }, + { + "epoch": 0.04748, + "grad_norm": 2.671875, + "grad_norm_var": 0.23772786458333334, + "learning_rate": 0.0001, + "loss": 5.0031, + "loss/crossentropy": 2.0537307262420654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31795741617679596, + "step": 2374 + }, + { + "epoch": 0.04752, + "grad_norm": 2.546875, + "grad_norm_var": 0.24492085774739583, + "learning_rate": 0.0001, + "loss": 4.9922, + "loss/crossentropy": 1.9254986643791199, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25411880016326904, + "step": 2376 + }, + { + "epoch": 0.04756, + "grad_norm": 2.671875, + "grad_norm_var": 0.23855692545572918, + "learning_rate": 0.0001, + "loss": 5.0284, + "loss/crossentropy": 2.221043348312378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28534361720085144, + "step": 2378 + }, + { + "epoch": 0.0476, + "grad_norm": 2.90625, + "grad_norm_var": 0.228271484375, + "learning_rate": 0.0001, + "loss": 5.2106, + "loss/crossentropy": 2.3516281843185425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3128499984741211, + "step": 2380 + }, + { + "epoch": 0.04764, + "grad_norm": 2.734375, + "grad_norm_var": 0.22349853515625, + "learning_rate": 0.0001, + "loss": 5.6266, + "loss/crossentropy": 2.2144338488578796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3071902245283127, + "step": 2382 + }, + { + "epoch": 0.04768, + "grad_norm": 2.625, + "grad_norm_var": 0.0567291259765625, + "learning_rate": 0.0001, + "loss": 5.2429, + "loss/crossentropy": 2.3324203491210938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.322970449924469, + "step": 2384 + }, + { + "epoch": 0.04772, + "grad_norm": 2.546875, + "grad_norm_var": 0.03332926432291667, + "learning_rate": 0.0001, + "loss": 4.7732, + "loss/crossentropy": 2.08349871635437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2774003893136978, + "step": 2386 + }, + { + "epoch": 0.04776, + "grad_norm": 2.4375, + "grad_norm_var": 0.03902587890625, + "learning_rate": 0.0001, + "loss": 4.8585, + "loss/crossentropy": 1.9565780758857727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29790589213371277, + "step": 2388 + }, + { + "epoch": 0.0478, + "grad_norm": 2.625, + "grad_norm_var": 0.022782389322916666, + "learning_rate": 0.0001, + "loss": 5.0176, + "loss/crossentropy": 2.261398434638977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3143853694200516, + "step": 2390 + }, + { + "epoch": 0.04784, + "grad_norm": 2.96875, + "grad_norm_var": 0.024494425455729166, + "learning_rate": 0.0001, + "loss": 5.0688, + "loss/crossentropy": 1.9077460169792175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25433091819286346, + "step": 2392 + }, + { + "epoch": 0.04788, + "grad_norm": 2.65625, + "grad_norm_var": 0.0240142822265625, + "learning_rate": 0.0001, + "loss": 4.9531, + "loss/crossentropy": 1.9948468208312988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2598777562379837, + "step": 2394 + }, + { + "epoch": 0.04792, + "grad_norm": 2.9375, + "grad_norm_var": 0.45455729166666664, + "learning_rate": 0.0001, + "loss": 5.0972, + "loss/crossentropy": 2.1177526116371155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2775426208972931, + "step": 2396 + }, + { + "epoch": 0.04796, + "grad_norm": 2.5, + "grad_norm_var": 0.4634348551432292, + "learning_rate": 0.0001, + "loss": 4.8571, + "loss/crossentropy": 2.1756062507629395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33678852021694183, + "step": 2398 + }, + { + "epoch": 0.048, + "grad_norm": 2.984375, + "grad_norm_var": 0.4576171875, + "learning_rate": 0.0001, + "loss": 5.2617, + "loss/crossentropy": 2.0923725366592407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3255026638507843, + "step": 2400 + }, + { + "epoch": 0.04804, + "grad_norm": 2.84375, + "grad_norm_var": 0.4471181233723958, + "learning_rate": 0.0001, + "loss": 4.9421, + "loss/crossentropy": 1.9236682653427124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2642124071717262, + "step": 2402 + }, + { + "epoch": 0.04808, + "grad_norm": 2.84375, + "grad_norm_var": 0.43757222493489584, + "learning_rate": 0.0001, + "loss": 5.0604, + "loss/crossentropy": 2.1742242574691772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30006398260593414, + "step": 2404 + }, + { + "epoch": 0.04812, + "grad_norm": 2.6875, + "grad_norm_var": 0.43835347493489585, + "learning_rate": 0.0001, + "loss": 4.7077, + "loss/crossentropy": 1.7445701956748962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25791122019290924, + "step": 2406 + }, + { + "epoch": 0.04816, + "grad_norm": 4.34375, + "grad_norm_var": 0.5614735921223958, + "learning_rate": 0.0001, + "loss": 5.1289, + "loss/crossentropy": 1.8616467714309692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2532489001750946, + "step": 2408 + }, + { + "epoch": 0.0482, + "grad_norm": 3.203125, + "grad_norm_var": 0.5608723958333334, + "learning_rate": 0.0001, + "loss": 5.0486, + "loss/crossentropy": 1.9146783351898193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2659924626350403, + "step": 2410 + }, + { + "epoch": 0.04824, + "grad_norm": 2.859375, + "grad_norm_var": 0.21818745930989583, + "learning_rate": 0.0001, + "loss": 5.0972, + "loss/crossentropy": 2.179564118385315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29064056277275085, + "step": 2412 + }, + { + "epoch": 0.04828, + "grad_norm": 2.65625, + "grad_norm_var": 0.21357421875, + "learning_rate": 0.0001, + "loss": 4.9578, + "loss/crossentropy": 2.04409658908844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27891072630882263, + "step": 2414 + }, + { + "epoch": 0.04832, + "grad_norm": 2.625, + "grad_norm_var": 0.22568257649739584, + "learning_rate": 0.0001, + "loss": 4.8833, + "loss/crossentropy": 2.590337038040161, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3158426731824875, + "step": 2416 + }, + { + "epoch": 0.04836, + "grad_norm": 2.625, + "grad_norm_var": 0.23155924479166667, + "learning_rate": 0.0001, + "loss": 4.6919, + "loss/crossentropy": 1.8753941059112549, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2669401317834854, + "step": 2418 + }, + { + "epoch": 0.0484, + "grad_norm": 2.734375, + "grad_norm_var": 0.23361714680989584, + "learning_rate": 0.0001, + "loss": 5.0231, + "loss/crossentropy": 2.1412659287452698, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2839512526988983, + "step": 2420 + }, + { + "epoch": 0.04844, + "grad_norm": 2.65625, + "grad_norm_var": 0.23371988932291668, + "learning_rate": 0.0001, + "loss": 5.1187, + "loss/crossentropy": 2.545991063117981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3331379294395447, + "step": 2422 + }, + { + "epoch": 0.04848, + "grad_norm": 3.25, + "grad_norm_var": 0.089599609375, + "learning_rate": 0.0001, + "loss": 5.1246, + "loss/crossentropy": 2.12838077545166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3043065369129181, + "step": 2424 + }, + { + "epoch": 0.04852, + "grad_norm": 2.953125, + "grad_norm_var": 0.03660380045572917, + "learning_rate": 0.0001, + "loss": 5.3821, + "loss/crossentropy": 2.1983221769332886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2846619784832001, + "step": 2426 + }, + { + "epoch": 0.04856, + "grad_norm": 3.0625, + "grad_norm_var": 0.03819986979166667, + "learning_rate": 0.0001, + "loss": 5.1044, + "loss/crossentropy": 2.241136312484741, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30275705456733704, + "step": 2428 + }, + { + "epoch": 0.0486, + "grad_norm": 2.890625, + "grad_norm_var": 0.03831278483072917, + "learning_rate": 0.0001, + "loss": 5.203, + "loss/crossentropy": 2.097459554672241, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2986721396446228, + "step": 2430 + }, + { + "epoch": 0.04864, + "grad_norm": 4.4375, + "grad_norm_var": 0.20159403483072916, + "learning_rate": 0.0001, + "loss": 5.1158, + "loss/crossentropy": 2.333081007003784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2975248098373413, + "step": 2432 + }, + { + "epoch": 0.04868, + "grad_norm": 2.625, + "grad_norm_var": 0.19758707682291668, + "learning_rate": 0.0001, + "loss": 4.9183, + "loss/crossentropy": 2.3171510696411133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28452740609645844, + "step": 2434 + }, + { + "epoch": 0.04872, + "grad_norm": 2.375, + "grad_norm_var": 0.2141754150390625, + "learning_rate": 0.0001, + "loss": 4.8853, + "loss/crossentropy": 1.8334497213363647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25271379947662354, + "step": 2436 + }, + { + "epoch": 0.04876, + "grad_norm": 2.78125, + "grad_norm_var": 0.20258687337239584, + "learning_rate": 0.0001, + "loss": 5.3809, + "loss/crossentropy": 2.2712661027908325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30827929079532623, + "step": 2438 + }, + { + "epoch": 0.0488, + "grad_norm": 3.3125, + "grad_norm_var": 0.20465087890625, + "learning_rate": 0.0001, + "loss": 5.0863, + "loss/crossentropy": 2.160263180732727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3078690320253372, + "step": 2440 + }, + { + "epoch": 0.04884, + "grad_norm": 2.96875, + "grad_norm_var": 0.20429585774739584, + "learning_rate": 0.0001, + "loss": 5.2224, + "loss/crossentropy": 2.071319878101349, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2733730524778366, + "step": 2442 + }, + { + "epoch": 0.04888, + "grad_norm": 2.796875, + "grad_norm_var": 0.203076171875, + "learning_rate": 0.0001, + "loss": 5.1476, + "loss/crossentropy": 2.0742560029029846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2835587412118912, + "step": 2444 + }, + { + "epoch": 0.04892, + "grad_norm": 2.765625, + "grad_norm_var": 0.24807535807291667, + "learning_rate": 0.0001, + "loss": 5.0211, + "loss/crossentropy": 1.8836837410926819, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3089100867509842, + "step": 2446 + }, + { + "epoch": 0.04896, + "grad_norm": 2.875, + "grad_norm_var": 0.177099609375, + "learning_rate": 0.0001, + "loss": 4.9537, + "loss/crossentropy": 1.8539315462112427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2697101980447769, + "step": 2448 + }, + { + "epoch": 0.049, + "grad_norm": 2.828125, + "grad_norm_var": 0.17392578125, + "learning_rate": 0.0001, + "loss": 5.1907, + "loss/crossentropy": 2.219490647315979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2898380011320114, + "step": 2450 + }, + { + "epoch": 0.04904, + "grad_norm": 3.1875, + "grad_norm_var": 0.13853251139322917, + "learning_rate": 0.0001, + "loss": 5.468, + "loss/crossentropy": 2.328765392303467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4013051837682724, + "step": 2452 + }, + { + "epoch": 0.04908, + "grad_norm": 2.5625, + "grad_norm_var": 0.14879557291666667, + "learning_rate": 0.0001, + "loss": 4.8967, + "loss/crossentropy": 1.9204192161560059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25281261652708054, + "step": 2454 + }, + { + "epoch": 0.04912, + "grad_norm": 2.765625, + "grad_norm_var": 0.15563151041666667, + "learning_rate": 0.0001, + "loss": 5.0935, + "loss/crossentropy": 2.377043604850769, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2928486764431, + "step": 2456 + }, + { + "epoch": 0.04916, + "grad_norm": 2.796875, + "grad_norm_var": 0.15930989583333333, + "learning_rate": 0.0001, + "loss": 5.4528, + "loss/crossentropy": 2.4364209175109863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3111976683139801, + "step": 2458 + }, + { + "epoch": 0.0492, + "grad_norm": 2.703125, + "grad_norm_var": 0.15985921223958333, + "learning_rate": 0.0001, + "loss": 5.1357, + "loss/crossentropy": 2.3738330602645874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3019126206636429, + "step": 2460 + }, + { + "epoch": 0.04924, + "grad_norm": 2.765625, + "grad_norm_var": 0.1141265869140625, + "learning_rate": 0.0001, + "loss": 5.2876, + "loss/crossentropy": 2.4575772285461426, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3428986072540283, + "step": 2462 + }, + { + "epoch": 0.04928, + "grad_norm": 2.734375, + "grad_norm_var": 0.028888956705729166, + "learning_rate": 0.0001, + "loss": 4.6505, + "loss/crossentropy": 1.9849627017974854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27611130475997925, + "step": 2464 + }, + { + "epoch": 0.04932, + "grad_norm": 2.625, + "grad_norm_var": 0.030402628580729167, + "learning_rate": 0.0001, + "loss": 4.8482, + "loss/crossentropy": 1.9617170691490173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.257377490401268, + "step": 2466 + }, + { + "epoch": 0.04936, + "grad_norm": 2.640625, + "grad_norm_var": 0.01871337890625, + "learning_rate": 0.0001, + "loss": 5.1739, + "loss/crossentropy": 2.4031273126602173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28843145072460175, + "step": 2468 + }, + { + "epoch": 0.0494, + "grad_norm": 2.484375, + "grad_norm_var": 0.0165435791015625, + "learning_rate": 0.0001, + "loss": 4.8452, + "loss/crossentropy": 1.7263792753219604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25591571629047394, + "step": 2470 + }, + { + "epoch": 0.04944, + "grad_norm": 2.734375, + "grad_norm_var": 0.0191802978515625, + "learning_rate": 0.0001, + "loss": 4.7154, + "loss/crossentropy": 2.106898784637451, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.280623197555542, + "step": 2472 + }, + { + "epoch": 0.04948, + "grad_norm": 2.6875, + "grad_norm_var": 0.018358357747395835, + "learning_rate": 0.0001, + "loss": 5.0092, + "loss/crossentropy": 2.325208902359009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27794161438941956, + "step": 2474 + }, + { + "epoch": 0.04952, + "grad_norm": 2.65625, + "grad_norm_var": 0.013736979166666666, + "learning_rate": 0.0001, + "loss": 5.1128, + "loss/crossentropy": 2.367414712905884, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31346653401851654, + "step": 2476 + }, + { + "epoch": 0.04956, + "grad_norm": 2.59375, + "grad_norm_var": 0.013792928059895833, + "learning_rate": 0.0001, + "loss": 5.2611, + "loss/crossentropy": 2.191115140914917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30560287833213806, + "step": 2478 + }, + { + "epoch": 0.0496, + "grad_norm": 2.765625, + "grad_norm_var": 0.012430826822916666, + "learning_rate": 0.0001, + "loss": 4.9993, + "loss/crossentropy": 2.2559624314308167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3029457628726959, + "step": 2480 + }, + { + "epoch": 0.04964, + "grad_norm": 3.390625, + "grad_norm_var": 0.04160868326822917, + "learning_rate": 0.0001, + "loss": 5.125, + "loss/crossentropy": 2.0088382363319397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2923784404993057, + "step": 2482 + }, + { + "epoch": 0.04968, + "grad_norm": 2.71875, + "grad_norm_var": 0.03943583170572917, + "learning_rate": 0.0001, + "loss": 4.8858, + "loss/crossentropy": 1.8445329070091248, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2587483897805214, + "step": 2484 + }, + { + "epoch": 0.04972, + "grad_norm": 2.8125, + "grad_norm_var": 0.03534749348958333, + "learning_rate": 0.0001, + "loss": 4.7918, + "loss/crossentropy": 2.015140950679779, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27480585873126984, + "step": 2486 + }, + { + "epoch": 0.04976, + "grad_norm": 2.765625, + "grad_norm_var": 0.030833943684895834, + "learning_rate": 0.0001, + "loss": 5.1959, + "loss/crossentropy": 1.918801188468933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28786011040210724, + "step": 2488 + }, + { + "epoch": 0.0498, + "grad_norm": 2.734375, + "grad_norm_var": 0.030269368489583334, + "learning_rate": 0.0001, + "loss": 5.0108, + "loss/crossentropy": 1.9899121522903442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25822708010673523, + "step": 2490 + }, + { + "epoch": 0.04984, + "grad_norm": 2.59375, + "grad_norm_var": 0.037206013997395836, + "learning_rate": 0.0001, + "loss": 4.7259, + "loss/crossentropy": 2.3775535821914673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31345370411872864, + "step": 2492 + }, + { + "epoch": 0.04988, + "grad_norm": 2.71875, + "grad_norm_var": 0.03791402180989583, + "learning_rate": 0.0001, + "loss": 4.9496, + "loss/crossentropy": 2.0874632596969604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2836592495441437, + "step": 2494 + }, + { + "epoch": 0.04992, + "grad_norm": 2.78125, + "grad_norm_var": 0.046019490559895834, + "learning_rate": 0.0001, + "loss": 5.2404, + "loss/crossentropy": 2.226976454257965, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27826404571533203, + "step": 2496 + }, + { + "epoch": 0.04996, + "grad_norm": 2.71875, + "grad_norm_var": 0.023900349934895832, + "learning_rate": 0.0001, + "loss": 5.1503, + "loss/crossentropy": 2.4569294452667236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31981319189071655, + "step": 2498 + }, + { + "epoch": 0.05, + "grad_norm": 2.703125, + "grad_norm_var": 0.025581868489583333, + "learning_rate": 0.0001, + "loss": 4.922, + "loss/crossentropy": 2.1134212017059326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28260529041290283, + "step": 2500 + }, + { + "epoch": 0.05004, + "grad_norm": 3.0, + "grad_norm_var": 0.029195149739583332, + "learning_rate": 0.0001, + "loss": 5.3555, + "loss/crossentropy": 2.2914888858795166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3182682394981384, + "step": 2502 + }, + { + "epoch": 0.05008, + "grad_norm": 2.65625, + "grad_norm_var": 0.030078125, + "learning_rate": 0.0001, + "loss": 4.9644, + "loss/crossentropy": 2.3261003494262695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30558250844478607, + "step": 2504 + }, + { + "epoch": 0.05012, + "grad_norm": 2.703125, + "grad_norm_var": 0.030598958333333332, + "learning_rate": 0.0001, + "loss": 4.9517, + "loss/crossentropy": 2.351989507675171, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2934701144695282, + "step": 2506 + }, + { + "epoch": 0.05016, + "grad_norm": 2.5625, + "grad_norm_var": 0.025031534830729167, + "learning_rate": 0.0001, + "loss": 4.71, + "loss/crossentropy": 1.8742690086364746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26305729895830154, + "step": 2508 + }, + { + "epoch": 0.0502, + "grad_norm": 2.78125, + "grad_norm_var": 0.0247222900390625, + "learning_rate": 0.0001, + "loss": 4.8214, + "loss/crossentropy": 2.1668856143951416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2728075534105301, + "step": 2510 + }, + { + "epoch": 0.05024, + "grad_norm": 2.734375, + "grad_norm_var": 0.0137115478515625, + "learning_rate": 0.0001, + "loss": 4.7171, + "loss/crossentropy": 1.773424208164215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26808495819568634, + "step": 2512 + }, + { + "epoch": 0.05028, + "grad_norm": 2.59375, + "grad_norm_var": 0.013109334309895833, + "learning_rate": 0.0001, + "loss": 4.9224, + "loss/crossentropy": 1.7541643977165222, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2600491940975189, + "step": 2514 + }, + { + "epoch": 0.05032, + "grad_norm": 2.796875, + "grad_norm_var": 0.011324055989583333, + "learning_rate": 0.0001, + "loss": 5.129, + "loss/crossentropy": 1.9693496227264404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2844541072845459, + "step": 2516 + }, + { + "epoch": 0.05036, + "grad_norm": 2.765625, + "grad_norm_var": 0.009065755208333333, + "learning_rate": 0.0001, + "loss": 4.9202, + "loss/crossentropy": 1.7539461851119995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2542608380317688, + "step": 2518 + }, + { + "epoch": 0.0504, + "grad_norm": 2.984375, + "grad_norm_var": 0.017878214518229168, + "learning_rate": 0.0001, + "loss": 5.0686, + "loss/crossentropy": 2.155138313770294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3023010194301605, + "step": 2520 + }, + { + "epoch": 0.05044, + "grad_norm": 2.609375, + "grad_norm_var": 0.018619791666666666, + "learning_rate": 0.0001, + "loss": 4.9674, + "loss/crossentropy": 2.069350838661194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26573850214481354, + "step": 2522 + }, + { + "epoch": 0.05048, + "grad_norm": 2.5, + "grad_norm_var": 0.022101847330729167, + "learning_rate": 0.0001, + "loss": 5.0295, + "loss/crossentropy": 2.1864534616470337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27845603227615356, + "step": 2524 + }, + { + "epoch": 0.05052, + "grad_norm": 2.5, + "grad_norm_var": 0.023509724934895834, + "learning_rate": 0.0001, + "loss": 5.1173, + "loss/crossentropy": 2.2462135553359985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29221346974372864, + "step": 2526 + }, + { + "epoch": 0.05056, + "grad_norm": 2.78125, + "grad_norm_var": 0.030492146809895832, + "learning_rate": 0.0001, + "loss": 5.0027, + "loss/crossentropy": 2.043266773223877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25857964158058167, + "step": 2528 + }, + { + "epoch": 0.0506, + "grad_norm": 2.65625, + "grad_norm_var": 0.0299957275390625, + "learning_rate": 0.0001, + "loss": 4.9672, + "loss/crossentropy": 1.8892702460289001, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2879898101091385, + "step": 2530 + }, + { + "epoch": 0.05064, + "grad_norm": 2.46875, + "grad_norm_var": 0.03238525390625, + "learning_rate": 0.0001, + "loss": 4.5332, + "loss/crossentropy": 1.9220558404922485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26652154326438904, + "step": 2532 + }, + { + "epoch": 0.05068, + "grad_norm": 2.515625, + "grad_norm_var": 0.03430989583333333, + "learning_rate": 0.0001, + "loss": 4.4176, + "loss/crossentropy": 1.7282914519309998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2367371916770935, + "step": 2534 + }, + { + "epoch": 0.05072, + "grad_norm": 2.78125, + "grad_norm_var": 0.021092732747395832, + "learning_rate": 0.0001, + "loss": 4.9589, + "loss/crossentropy": 1.729803204536438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25624871999025345, + "step": 2536 + }, + { + "epoch": 0.05076, + "grad_norm": 2.578125, + "grad_norm_var": 0.020873006184895834, + "learning_rate": 0.0001, + "loss": 4.6952, + "loss/crossentropy": 2.0921449661254883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2855897545814514, + "step": 2538 + }, + { + "epoch": 0.0508, + "grad_norm": 2.6875, + "grad_norm_var": 0.015404256184895833, + "learning_rate": 0.0001, + "loss": 4.8967, + "loss/crossentropy": 2.0569751858711243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2829667925834656, + "step": 2540 + }, + { + "epoch": 0.05084, + "grad_norm": 2.609375, + "grad_norm_var": 0.015973917643229165, + "learning_rate": 0.0001, + "loss": 5.2438, + "loss/crossentropy": 1.983904242515564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30939212441444397, + "step": 2542 + }, + { + "epoch": 0.05088, + "grad_norm": 2.703125, + "grad_norm_var": 0.011864217122395833, + "learning_rate": 0.0001, + "loss": 4.9968, + "loss/crossentropy": 2.2631462812423706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27867285907268524, + "step": 2544 + }, + { + "epoch": 0.05092, + "grad_norm": 3.078125, + "grad_norm_var": 0.023856608072916667, + "learning_rate": 0.0001, + "loss": 4.9621, + "loss/crossentropy": 1.9918989539146423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27703428268432617, + "step": 2546 + }, + { + "epoch": 0.05096, + "grad_norm": 2.78125, + "grad_norm_var": 0.021110026041666667, + "learning_rate": 0.0001, + "loss": 5.2041, + "loss/crossentropy": 2.1356931924819946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27815964818000793, + "step": 2548 + }, + { + "epoch": 0.051, + "grad_norm": 2.46875, + "grad_norm_var": 0.019071451822916665, + "learning_rate": 0.0001, + "loss": 4.6514, + "loss/crossentropy": 2.172751545906067, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2971910834312439, + "step": 2550 + }, + { + "epoch": 0.05104, + "grad_norm": 2.578125, + "grad_norm_var": 0.019527180989583334, + "learning_rate": 0.0001, + "loss": 5.0178, + "loss/crossentropy": 2.0799094438552856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30459292232990265, + "step": 2552 + }, + { + "epoch": 0.05108, + "grad_norm": 2.5, + "grad_norm_var": 0.0211090087890625, + "learning_rate": 0.0001, + "loss": 4.8235, + "loss/crossentropy": 1.7769129872322083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2526697665452957, + "step": 2554 + }, + { + "epoch": 0.05112, + "grad_norm": 2.8125, + "grad_norm_var": 0.024494425455729166, + "learning_rate": 0.0001, + "loss": 4.8457, + "loss/crossentropy": 2.044790804386139, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2821648418903351, + "step": 2556 + }, + { + "epoch": 0.05116, + "grad_norm": 2.921875, + "grad_norm_var": 0.0287109375, + "learning_rate": 0.0001, + "loss": 5.5336, + "loss/crossentropy": 2.3708614110946655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30810464918613434, + "step": 2558 + }, + { + "epoch": 0.0512, + "grad_norm": 2.71875, + "grad_norm_var": 0.028709920247395833, + "learning_rate": 0.0001, + "loss": 5.2385, + "loss/crossentropy": 2.2216718196868896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2919304668903351, + "step": 2560 + }, + { + "epoch": 0.05124, + "grad_norm": 2.671875, + "grad_norm_var": 0.019466145833333334, + "learning_rate": 0.0001, + "loss": 5.28, + "loss/crossentropy": 2.4692097902297974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30620162189006805, + "step": 2562 + }, + { + "epoch": 0.05128, + "grad_norm": 2.984375, + "grad_norm_var": 0.026936848958333332, + "learning_rate": 0.0001, + "loss": 4.9476, + "loss/crossentropy": 2.0491623282432556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25617313385009766, + "step": 2564 + }, + { + "epoch": 0.05132, + "grad_norm": 2.8125, + "grad_norm_var": 0.023368326822916667, + "learning_rate": 0.0001, + "loss": 4.958, + "loss/crossentropy": 1.8305597305297852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25237561762332916, + "step": 2566 + }, + { + "epoch": 0.05136, + "grad_norm": 2.640625, + "grad_norm_var": 0.0223297119140625, + "learning_rate": 0.0001, + "loss": 4.9853, + "loss/crossentropy": 1.9471853971481323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2712964415550232, + "step": 2568 + }, + { + "epoch": 0.0514, + "grad_norm": 2.765625, + "grad_norm_var": 0.0198638916015625, + "learning_rate": 0.0001, + "loss": 5.0932, + "loss/crossentropy": 2.575412631034851, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3106851130723953, + "step": 2570 + }, + { + "epoch": 0.05144, + "grad_norm": 2.515625, + "grad_norm_var": 0.020686848958333334, + "learning_rate": 0.0001, + "loss": 4.6755, + "loss/crossentropy": 2.0210241079330444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.269680991768837, + "step": 2572 + }, + { + "epoch": 0.05148, + "grad_norm": 2.4375, + "grad_norm_var": 0.02076416015625, + "learning_rate": 0.0001, + "loss": 4.6308, + "loss/crossentropy": 1.9054389595985413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24933087825775146, + "step": 2574 + }, + { + "epoch": 0.05152, + "grad_norm": 2.625, + "grad_norm_var": 0.021284993489583334, + "learning_rate": 0.0001, + "loss": 4.9682, + "loss/crossentropy": 2.142069697380066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3162301778793335, + "step": 2576 + }, + { + "epoch": 0.05156, + "grad_norm": 2.59375, + "grad_norm_var": 0.0197662353515625, + "learning_rate": 0.0001, + "loss": 5.018, + "loss/crossentropy": 1.9952309727668762, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2560836151242256, + "step": 2578 + }, + { + "epoch": 0.0516, + "grad_norm": 2.765625, + "grad_norm_var": 0.016405232747395835, + "learning_rate": 0.0001, + "loss": 4.8889, + "loss/crossentropy": 2.0579317212104797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26214616745710373, + "step": 2580 + }, + { + "epoch": 0.05164, + "grad_norm": 2.46875, + "grad_norm_var": 0.016927083333333332, + "learning_rate": 0.0001, + "loss": 4.6306, + "loss/crossentropy": 2.076499104499817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26244185864925385, + "step": 2582 + }, + { + "epoch": 0.05168, + "grad_norm": 2.890625, + "grad_norm_var": 0.026146443684895833, + "learning_rate": 0.0001, + "loss": 4.9393, + "loss/crossentropy": 2.2277501821517944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32310059666633606, + "step": 2584 + }, + { + "epoch": 0.05172, + "grad_norm": 2.53125, + "grad_norm_var": 0.03141988118489583, + "learning_rate": 0.0001, + "loss": 5.0929, + "loss/crossentropy": 2.101436138153076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26044395565986633, + "step": 2586 + }, + { + "epoch": 0.05176, + "grad_norm": 2.703125, + "grad_norm_var": 0.030939737955729168, + "learning_rate": 0.0001, + "loss": 5.045, + "loss/crossentropy": 2.2617305517196655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2768043726682663, + "step": 2588 + }, + { + "epoch": 0.0518, + "grad_norm": 2.515625, + "grad_norm_var": 0.0349029541015625, + "learning_rate": 0.0001, + "loss": 4.989, + "loss/crossentropy": 2.2669495940208435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2941794842481613, + "step": 2590 + }, + { + "epoch": 0.05184, + "grad_norm": 2.546875, + "grad_norm_var": 0.03871968587239583, + "learning_rate": 0.0001, + "loss": 4.7676, + "loss/crossentropy": 1.8102391958236694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24766983091831207, + "step": 2592 + }, + { + "epoch": 0.05188, + "grad_norm": 2.53125, + "grad_norm_var": 0.04006754557291667, + "learning_rate": 0.0001, + "loss": 5.0022, + "loss/crossentropy": 2.1426219940185547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2808763086795807, + "step": 2594 + }, + { + "epoch": 0.05192, + "grad_norm": 2.75, + "grad_norm_var": 0.0368804931640625, + "learning_rate": 0.0001, + "loss": 4.8649, + "loss/crossentropy": 2.0731321573257446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28768520057201385, + "step": 2596 + }, + { + "epoch": 0.05196, + "grad_norm": 2.46875, + "grad_norm_var": 0.03619384765625, + "learning_rate": 0.0001, + "loss": 4.8019, + "loss/crossentropy": 1.8902159333229065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27209727466106415, + "step": 2598 + }, + { + "epoch": 0.052, + "grad_norm": 2.390625, + "grad_norm_var": 0.030855305989583335, + "learning_rate": 0.0001, + "loss": 4.837, + "loss/crossentropy": 2.1860097646713257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3191404938697815, + "step": 2600 + }, + { + "epoch": 0.05204, + "grad_norm": 2.78125, + "grad_norm_var": 0.027074178059895832, + "learning_rate": 0.0001, + "loss": 4.8471, + "loss/crossentropy": 2.1010658740997314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2924908995628357, + "step": 2602 + }, + { + "epoch": 0.05208, + "grad_norm": 2.71875, + "grad_norm_var": 0.0285064697265625, + "learning_rate": 0.0001, + "loss": 5.1725, + "loss/crossentropy": 2.0668399930000305, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2691944092512131, + "step": 2604 + }, + { + "epoch": 0.05212, + "grad_norm": 2.8125, + "grad_norm_var": 0.023824055989583332, + "learning_rate": 0.0001, + "loss": 5.0115, + "loss/crossentropy": 2.310541272163391, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31448885798454285, + "step": 2606 + }, + { + "epoch": 0.05216, + "grad_norm": 2.75, + "grad_norm_var": 0.0222808837890625, + "learning_rate": 0.0001, + "loss": 4.7731, + "loss/crossentropy": 2.023577332496643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28528447449207306, + "step": 2608 + }, + { + "epoch": 0.0522, + "grad_norm": 2.5625, + "grad_norm_var": 0.0217193603515625, + "learning_rate": 0.0001, + "loss": 4.8408, + "loss/crossentropy": 2.0232901573181152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25540125370025635, + "step": 2610 + }, + { + "epoch": 0.05224, + "grad_norm": 2.5625, + "grad_norm_var": 0.019554646809895833, + "learning_rate": 0.0001, + "loss": 4.8374, + "loss/crossentropy": 2.147561550140381, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2914447784423828, + "step": 2612 + }, + { + "epoch": 0.05228, + "grad_norm": 2.46875, + "grad_norm_var": 0.0193359375, + "learning_rate": 0.0001, + "loss": 4.8044, + "loss/crossentropy": 1.9136184453964233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27879445254802704, + "step": 2614 + }, + { + "epoch": 0.05232, + "grad_norm": 3.03125, + "grad_norm_var": 0.0606842041015625, + "learning_rate": 0.0001, + "loss": 5.1245, + "loss/crossentropy": 2.1118494272232056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2990800142288208, + "step": 2616 + }, + { + "epoch": 0.05236, + "grad_norm": 2.640625, + "grad_norm_var": 0.0599761962890625, + "learning_rate": 0.0001, + "loss": 5.2189, + "loss/crossentropy": 1.9564325213432312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2805679142475128, + "step": 2618 + }, + { + "epoch": 0.0524, + "grad_norm": 3.046875, + "grad_norm_var": 0.06641337076822916, + "learning_rate": 0.0001, + "loss": 4.9511, + "loss/crossentropy": 2.0683051347732544, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2906472980976105, + "step": 2620 + }, + { + "epoch": 0.05244, + "grad_norm": 2.609375, + "grad_norm_var": 0.06653645833333334, + "learning_rate": 0.0001, + "loss": 5.058, + "loss/crossentropy": 2.0510823130607605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28148986399173737, + "step": 2622 + }, + { + "epoch": 0.05248, + "grad_norm": 2.640625, + "grad_norm_var": 0.06256103515625, + "learning_rate": 0.0001, + "loss": 5.2113, + "loss/crossentropy": 2.2972904443740845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32902073860168457, + "step": 2624 + }, + { + "epoch": 0.05252, + "grad_norm": 2.890625, + "grad_norm_var": 0.0582427978515625, + "learning_rate": 0.0001, + "loss": 5.0939, + "loss/crossentropy": 1.9502179026603699, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2844041585922241, + "step": 2626 + }, + { + "epoch": 0.05256, + "grad_norm": 2.640625, + "grad_norm_var": 0.05607808430989583, + "learning_rate": 0.0001, + "loss": 5.1078, + "loss/crossentropy": 2.1577298045158386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2673380598425865, + "step": 2628 + }, + { + "epoch": 0.0526, + "grad_norm": 2.59375, + "grad_norm_var": 0.05271708170572917, + "learning_rate": 0.0001, + "loss": 5.0514, + "loss/crossentropy": 2.1707664132118225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2697260081768036, + "step": 2630 + }, + { + "epoch": 0.05264, + "grad_norm": 2.71875, + "grad_norm_var": 0.017606608072916665, + "learning_rate": 0.0001, + "loss": 4.9617, + "loss/crossentropy": 2.0975311398506165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27033862471580505, + "step": 2632 + }, + { + "epoch": 0.05268, + "grad_norm": 3.25, + "grad_norm_var": 0.033869425455729164, + "learning_rate": 0.0001, + "loss": 5.1841, + "loss/crossentropy": 2.197197914123535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28717951476573944, + "step": 2634 + }, + { + "epoch": 0.05272, + "grad_norm": 2.765625, + "grad_norm_var": 0.03570556640625, + "learning_rate": 0.0001, + "loss": 5.2558, + "loss/crossentropy": 2.2898266315460205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31359314918518066, + "step": 2636 + }, + { + "epoch": 0.05276, + "grad_norm": 2.796875, + "grad_norm_var": 0.0347564697265625, + "learning_rate": 0.0001, + "loss": 5.4849, + "loss/crossentropy": 2.525710701942444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3199215829372406, + "step": 2638 + }, + { + "epoch": 0.0528, + "grad_norm": 2.796875, + "grad_norm_var": 0.03349609375, + "learning_rate": 0.0001, + "loss": 5.0495, + "loss/crossentropy": 2.2799761295318604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28715676069259644, + "step": 2640 + }, + { + "epoch": 0.05284, + "grad_norm": 2.796875, + "grad_norm_var": 0.032373046875, + "learning_rate": 0.0001, + "loss": 4.9837, + "loss/crossentropy": 1.8681190013885498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25435833632946014, + "step": 2642 + }, + { + "epoch": 0.05288, + "grad_norm": 2.578125, + "grad_norm_var": 0.03730061848958333, + "learning_rate": 0.0001, + "loss": 4.807, + "loss/crossentropy": 1.9067540168762207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25444281101226807, + "step": 2644 + }, + { + "epoch": 0.05292, + "grad_norm": 2.53125, + "grad_norm_var": 0.03732096354166667, + "learning_rate": 0.0001, + "loss": 5.0326, + "loss/crossentropy": 2.370971202850342, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31173495948314667, + "step": 2646 + }, + { + "epoch": 0.05296, + "grad_norm": 2.671875, + "grad_norm_var": 0.03752848307291667, + "learning_rate": 0.0001, + "loss": 5.1148, + "loss/crossentropy": 2.1829749941825867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29970400035381317, + "step": 2648 + }, + { + "epoch": 0.053, + "grad_norm": 2.609375, + "grad_norm_var": 0.020052083333333335, + "learning_rate": 0.0001, + "loss": 5.1086, + "loss/crossentropy": 2.0818498134613037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27712464332580566, + "step": 2650 + }, + { + "epoch": 0.05304, + "grad_norm": 2.609375, + "grad_norm_var": 0.010789998372395833, + "learning_rate": 0.0001, + "loss": 5.1071, + "loss/crossentropy": 2.2080377340316772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27552157640457153, + "step": 2652 + }, + { + "epoch": 0.05308, + "grad_norm": 2.53125, + "grad_norm_var": 0.01109619140625, + "learning_rate": 0.0001, + "loss": 4.9685, + "loss/crossentropy": 1.7115904092788696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23266373574733734, + "step": 2654 + }, + { + "epoch": 0.05312, + "grad_norm": 2.59375, + "grad_norm_var": 0.009598795572916667, + "learning_rate": 0.0001, + "loss": 4.8654, + "loss/crossentropy": 2.1736810207366943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29696571826934814, + "step": 2656 + }, + { + "epoch": 0.05316, + "grad_norm": 2.765625, + "grad_norm_var": 0.012886555989583333, + "learning_rate": 0.0001, + "loss": 5.0834, + "loss/crossentropy": 2.2366485595703125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3062315583229065, + "step": 2658 + }, + { + "epoch": 0.0532, + "grad_norm": 2.609375, + "grad_norm_var": 0.018748982747395834, + "learning_rate": 0.0001, + "loss": 5.0888, + "loss/crossentropy": 1.9835070371627808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35663366317749023, + "step": 2660 + }, + { + "epoch": 0.05324, + "grad_norm": 2.4375, + "grad_norm_var": 0.026688639322916666, + "learning_rate": 0.0001, + "loss": 4.6725, + "loss/crossentropy": 2.148723840713501, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2687191218137741, + "step": 2662 + }, + { + "epoch": 0.05328, + "grad_norm": 2.625, + "grad_norm_var": 0.02789306640625, + "learning_rate": 0.0001, + "loss": 4.9191, + "loss/crossentropy": 2.2642128467559814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2738381028175354, + "step": 2664 + }, + { + "epoch": 0.05332, + "grad_norm": 2.546875, + "grad_norm_var": 0.028483072916666668, + "learning_rate": 0.0001, + "loss": 4.8209, + "loss/crossentropy": 1.839052438735962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2566695362329483, + "step": 2666 + }, + { + "epoch": 0.05336, + "grad_norm": 2.6875, + "grad_norm_var": 0.026927693684895834, + "learning_rate": 0.0001, + "loss": 4.8771, + "loss/crossentropy": 2.083684980869293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2632910907268524, + "step": 2668 + }, + { + "epoch": 0.0534, + "grad_norm": 2.75, + "grad_norm_var": 0.028434244791666667, + "learning_rate": 0.0001, + "loss": 4.8585, + "loss/crossentropy": 2.2125936150550842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31363190710544586, + "step": 2670 + }, + { + "epoch": 0.05344, + "grad_norm": 2.734375, + "grad_norm_var": 0.029157511393229165, + "learning_rate": 0.0001, + "loss": 5.0354, + "loss/crossentropy": 2.2075263261795044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2850564122200012, + "step": 2672 + }, + { + "epoch": 0.05348, + "grad_norm": 2.859375, + "grad_norm_var": 0.026656087239583334, + "learning_rate": 0.0001, + "loss": 5.0145, + "loss/crossentropy": 2.0876463651657104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2972792685031891, + "step": 2674 + }, + { + "epoch": 0.05352, + "grad_norm": 2.84375, + "grad_norm_var": 0.020406087239583332, + "learning_rate": 0.0001, + "loss": 5.2755, + "loss/crossentropy": 2.4033172130584717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.304116889834404, + "step": 2676 + }, + { + "epoch": 0.05356, + "grad_norm": 2.734375, + "grad_norm_var": 0.014501953125, + "learning_rate": 0.0001, + "loss": 5.0921, + "loss/crossentropy": 2.30586314201355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3264722675085068, + "step": 2678 + }, + { + "epoch": 0.0536, + "grad_norm": 2.46875, + "grad_norm_var": 0.015152994791666667, + "learning_rate": 0.0001, + "loss": 5.0941, + "loss/crossentropy": 2.2175174951553345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2661540359258652, + "step": 2680 + }, + { + "epoch": 0.05364, + "grad_norm": 2.921875, + "grad_norm_var": 0.022606404622395833, + "learning_rate": 0.0001, + "loss": 5.1162, + "loss/crossentropy": 2.0583900213241577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26995618641376495, + "step": 2682 + }, + { + "epoch": 0.05368, + "grad_norm": 2.59375, + "grad_norm_var": 0.020173136393229166, + "learning_rate": 0.0001, + "loss": 4.9357, + "loss/crossentropy": 2.310309052467346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29642508924007416, + "step": 2684 + }, + { + "epoch": 0.05372, + "grad_norm": 3.15625, + "grad_norm_var": 0.03332926432291667, + "learning_rate": 0.0001, + "loss": 4.7945, + "loss/crossentropy": 2.134134352207184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2619224041700363, + "step": 2686 + }, + { + "epoch": 0.05376, + "grad_norm": 2.609375, + "grad_norm_var": 0.03306884765625, + "learning_rate": 0.0001, + "loss": 4.8411, + "loss/crossentropy": 1.930562138557434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27108173072338104, + "step": 2688 + }, + { + "epoch": 0.0538, + "grad_norm": 2.46875, + "grad_norm_var": 0.03455403645833333, + "learning_rate": 0.0001, + "loss": 4.9591, + "loss/crossentropy": 2.3414769172668457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.270420178771019, + "step": 2690 + }, + { + "epoch": 0.05384, + "grad_norm": 2.78125, + "grad_norm_var": 0.0356842041015625, + "learning_rate": 0.0001, + "loss": 4.83, + "loss/crossentropy": 2.1726938486099243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2710433751344681, + "step": 2692 + }, + { + "epoch": 0.05388, + "grad_norm": 2.640625, + "grad_norm_var": 0.0361968994140625, + "learning_rate": 0.0001, + "loss": 5.0163, + "loss/crossentropy": 2.220117926597595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3212582617998123, + "step": 2694 + }, + { + "epoch": 0.05392, + "grad_norm": 2.71875, + "grad_norm_var": 0.03297119140625, + "learning_rate": 0.0001, + "loss": 5.1845, + "loss/crossentropy": 2.2557668685913086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3096832036972046, + "step": 2696 + }, + { + "epoch": 0.05396, + "grad_norm": 2.734375, + "grad_norm_var": 0.0246246337890625, + "learning_rate": 0.0001, + "loss": 5.0399, + "loss/crossentropy": 1.94975346326828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2530350238084793, + "step": 2698 + }, + { + "epoch": 0.054, + "grad_norm": 2.6875, + "grad_norm_var": 0.024169921875, + "learning_rate": 0.0001, + "loss": 5.19, + "loss/crossentropy": 2.4622775316238403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3135389983654022, + "step": 2700 + }, + { + "epoch": 0.05404, + "grad_norm": 2.65625, + "grad_norm_var": 0.0110748291015625, + "learning_rate": 0.0001, + "loss": 5.2005, + "loss/crossentropy": 2.5367363691329956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30742934346199036, + "step": 2702 + }, + { + "epoch": 0.05408, + "grad_norm": 2.5, + "grad_norm_var": 0.014631144205729167, + "learning_rate": 0.0001, + "loss": 5.146, + "loss/crossentropy": 2.5733184814453125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33410580456256866, + "step": 2704 + }, + { + "epoch": 0.05412, + "grad_norm": 2.6875, + "grad_norm_var": 0.011725870768229167, + "learning_rate": 0.0001, + "loss": 4.8888, + "loss/crossentropy": 1.9339997172355652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28760699927806854, + "step": 2706 + }, + { + "epoch": 0.05416, + "grad_norm": 2.484375, + "grad_norm_var": 0.010724894205729167, + "learning_rate": 0.0001, + "loss": 4.8719, + "loss/crossentropy": 1.8515672087669373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23648252338171005, + "step": 2708 + }, + { + "epoch": 0.0542, + "grad_norm": 2.546875, + "grad_norm_var": 0.014420572916666667, + "learning_rate": 0.0001, + "loss": 4.6598, + "loss/crossentropy": 2.0973429083824158, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2605738639831543, + "step": 2710 + }, + { + "epoch": 0.05424, + "grad_norm": 2.90625, + "grad_norm_var": 0.021219889322916668, + "learning_rate": 0.0001, + "loss": 5.2795, + "loss/crossentropy": 2.406570076942444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2992263287305832, + "step": 2712 + }, + { + "epoch": 0.05428, + "grad_norm": 2.53125, + "grad_norm_var": 0.024958292643229168, + "learning_rate": 0.0001, + "loss": 4.8591, + "loss/crossentropy": 2.040315330028534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27726222574710846, + "step": 2714 + }, + { + "epoch": 0.05432, + "grad_norm": 2.40625, + "grad_norm_var": 0.025178019205729166, + "learning_rate": 0.0001, + "loss": 4.7879, + "loss/crossentropy": 2.250051259994507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2764698565006256, + "step": 2716 + }, + { + "epoch": 0.05436, + "grad_norm": 2.4375, + "grad_norm_var": 0.026985677083333333, + "learning_rate": 0.0001, + "loss": 4.8813, + "loss/crossentropy": 2.25112247467041, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3101722151041031, + "step": 2718 + }, + { + "epoch": 0.0544, + "grad_norm": 2.421875, + "grad_norm_var": 0.028709920247395833, + "learning_rate": 0.0001, + "loss": 4.7242, + "loss/crossentropy": 2.261968731880188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2805032432079315, + "step": 2720 + }, + { + "epoch": 0.05444, + "grad_norm": 2.59375, + "grad_norm_var": 0.030078125, + "learning_rate": 0.0001, + "loss": 5.0449, + "loss/crossentropy": 2.376634955406189, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25400323420763016, + "step": 2722 + }, + { + "epoch": 0.05448, + "grad_norm": 2.515625, + "grad_norm_var": 0.03351949055989583, + "learning_rate": 0.0001, + "loss": 5.2325, + "loss/crossentropy": 2.61246657371521, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3170415759086609, + "step": 2724 + }, + { + "epoch": 0.05452, + "grad_norm": 2.59375, + "grad_norm_var": 0.0296051025390625, + "learning_rate": 0.0001, + "loss": 5.0433, + "loss/crossentropy": 2.3982752561569214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2730572074651718, + "step": 2726 + }, + { + "epoch": 0.05456, + "grad_norm": 2.609375, + "grad_norm_var": 0.0207916259765625, + "learning_rate": 0.0001, + "loss": 4.8836, + "loss/crossentropy": 1.9890516996383667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26453813910484314, + "step": 2728 + }, + { + "epoch": 0.0546, + "grad_norm": 2.4375, + "grad_norm_var": 0.016559855143229166, + "learning_rate": 0.0001, + "loss": 4.7252, + "loss/crossentropy": 2.1825047731399536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28369753062725067, + "step": 2730 + }, + { + "epoch": 0.05464, + "grad_norm": 2.5, + "grad_norm_var": 0.015360514322916666, + "learning_rate": 0.0001, + "loss": 4.9445, + "loss/crossentropy": 1.9745987057685852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2662041634321213, + "step": 2732 + }, + { + "epoch": 0.05468, + "grad_norm": 2.640625, + "grad_norm_var": 0.020783487955729166, + "learning_rate": 0.0001, + "loss": 4.7591, + "loss/crossentropy": 2.2962852716445923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2986691743135452, + "step": 2734 + }, + { + "epoch": 0.05472, + "grad_norm": 2.65625, + "grad_norm_var": 0.018342081705729166, + "learning_rate": 0.0001, + "loss": 4.9712, + "loss/crossentropy": 2.0517550110816956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27293455600738525, + "step": 2736 + }, + { + "epoch": 0.05476, + "grad_norm": 2.859375, + "grad_norm_var": 0.021434529622395834, + "learning_rate": 0.0001, + "loss": 5.4052, + "loss/crossentropy": 2.327734112739563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3019224554300308, + "step": 2738 + }, + { + "epoch": 0.0548, + "grad_norm": 2.5, + "grad_norm_var": 0.018680826822916666, + "learning_rate": 0.0001, + "loss": 4.7002, + "loss/crossentropy": 2.236689567565918, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2816064953804016, + "step": 2740 + }, + { + "epoch": 0.05484, + "grad_norm": 2.75, + "grad_norm_var": 0.020894368489583332, + "learning_rate": 0.0001, + "loss": 5.1321, + "loss/crossentropy": 2.0209690928459167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2734442874789238, + "step": 2742 + }, + { + "epoch": 0.05488, + "grad_norm": 2.640625, + "grad_norm_var": 0.026276652018229166, + "learning_rate": 0.0001, + "loss": 4.9841, + "loss/crossentropy": 2.2264864444732666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28523482382297516, + "step": 2744 + }, + { + "epoch": 0.05492, + "grad_norm": 2.421875, + "grad_norm_var": 0.024738566080729166, + "learning_rate": 0.0001, + "loss": 4.8661, + "loss/crossentropy": 1.9897980690002441, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26623376458883286, + "step": 2746 + }, + { + "epoch": 0.05496, + "grad_norm": 2.53125, + "grad_norm_var": 0.024442545572916665, + "learning_rate": 0.0001, + "loss": 4.9299, + "loss/crossentropy": 2.0583779215812683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2803298681974411, + "step": 2748 + }, + { + "epoch": 0.055, + "grad_norm": 2.515625, + "grad_norm_var": 0.022001139322916665, + "learning_rate": 0.0001, + "loss": 4.9818, + "loss/crossentropy": 1.8448269367218018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2606130689382553, + "step": 2750 + }, + { + "epoch": 0.05504, + "grad_norm": 2.640625, + "grad_norm_var": 0.02301025390625, + "learning_rate": 0.0001, + "loss": 5.0885, + "loss/crossentropy": 2.294836401939392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29403699934482574, + "step": 2752 + }, + { + "epoch": 0.05508, + "grad_norm": 2.296875, + "grad_norm_var": 0.02808837890625, + "learning_rate": 0.0001, + "loss": 4.4637, + "loss/crossentropy": 2.2402058839797974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28176650404930115, + "step": 2754 + }, + { + "epoch": 0.05512, + "grad_norm": 2.734375, + "grad_norm_var": 0.0298980712890625, + "learning_rate": 0.0001, + "loss": 4.7805, + "loss/crossentropy": 1.7882421612739563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25275079905986786, + "step": 2756 + }, + { + "epoch": 0.05516, + "grad_norm": 2.8125, + "grad_norm_var": 0.02919921875, + "learning_rate": 0.0001, + "loss": 5.1277, + "loss/crossentropy": 2.4185458421707153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30663780868053436, + "step": 2758 + }, + { + "epoch": 0.0552, + "grad_norm": 2.484375, + "grad_norm_var": 0.0318511962890625, + "learning_rate": 0.0001, + "loss": 4.9259, + "loss/crossentropy": 2.2588642835617065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26785464584827423, + "step": 2760 + }, + { + "epoch": 0.05524, + "grad_norm": 2.484375, + "grad_norm_var": 0.034012858072916666, + "learning_rate": 0.0001, + "loss": 4.8359, + "loss/crossentropy": 2.145754337310791, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2710302472114563, + "step": 2762 + }, + { + "epoch": 0.05528, + "grad_norm": 2.609375, + "grad_norm_var": 0.03648681640625, + "learning_rate": 0.0001, + "loss": 4.739, + "loss/crossentropy": 2.3627569675445557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2926081120967865, + "step": 2764 + }, + { + "epoch": 0.05532, + "grad_norm": 2.609375, + "grad_norm_var": 0.03632405598958333, + "learning_rate": 0.0001, + "loss": 4.8512, + "loss/crossentropy": 1.988103210926056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24308288842439651, + "step": 2766 + }, + { + "epoch": 0.05536, + "grad_norm": 2.828125, + "grad_norm_var": 0.040445963541666664, + "learning_rate": 0.0001, + "loss": 5.1075, + "loss/crossentropy": 2.2497689723968506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.313574954867363, + "step": 2768 + }, + { + "epoch": 0.0554, + "grad_norm": 2.609375, + "grad_norm_var": 0.030492146809895832, + "learning_rate": 0.0001, + "loss": 4.8978, + "loss/crossentropy": 2.2603683471679688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27997657656669617, + "step": 2770 + }, + { + "epoch": 0.05544, + "grad_norm": 2.796875, + "grad_norm_var": 0.030614217122395832, + "learning_rate": 0.0001, + "loss": 4.9485, + "loss/crossentropy": 2.2585065364837646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2818940281867981, + "step": 2772 + }, + { + "epoch": 0.05548, + "grad_norm": 2.578125, + "grad_norm_var": 0.027730305989583332, + "learning_rate": 0.0001, + "loss": 5.2222, + "loss/crossentropy": 2.1413429975509644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2950669527053833, + "step": 2774 + }, + { + "epoch": 0.05552, + "grad_norm": 2.734375, + "grad_norm_var": 0.0171875, + "learning_rate": 0.0001, + "loss": 5.1567, + "loss/crossentropy": 1.9994583129882812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3946940451860428, + "step": 2776 + }, + { + "epoch": 0.05556, + "grad_norm": 2.796875, + "grad_norm_var": 0.014762369791666667, + "learning_rate": 0.0001, + "loss": 5.2877, + "loss/crossentropy": 2.423824667930603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30817069113254547, + "step": 2778 + }, + { + "epoch": 0.0556, + "grad_norm": 2.4375, + "grad_norm_var": 0.013719685872395833, + "learning_rate": 0.0001, + "loss": 4.6657, + "loss/crossentropy": 1.8579126000404358, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24312064796686172, + "step": 2780 + }, + { + "epoch": 0.05564, + "grad_norm": 2.390625, + "grad_norm_var": 0.018485514322916667, + "learning_rate": 0.0001, + "loss": 4.7653, + "loss/crossentropy": 2.3444939851760864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28466545045375824, + "step": 2782 + }, + { + "epoch": 0.05568, + "grad_norm": 2.71875, + "grad_norm_var": 0.03806864420572917, + "learning_rate": 0.0001, + "loss": 5.1187, + "loss/crossentropy": 2.221144914627075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2694521099328995, + "step": 2784 + }, + { + "epoch": 0.05572, + "grad_norm": 2.578125, + "grad_norm_var": 0.0376861572265625, + "learning_rate": 0.0001, + "loss": 5.0401, + "loss/crossentropy": 1.9372909665107727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2698594778776169, + "step": 2786 + }, + { + "epoch": 0.05576, + "grad_norm": 2.765625, + "grad_norm_var": 0.038834635416666666, + "learning_rate": 0.0001, + "loss": 4.834, + "loss/crossentropy": 2.129204750061035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27333614230155945, + "step": 2788 + }, + { + "epoch": 0.0558, + "grad_norm": 2.5, + "grad_norm_var": 0.04045817057291667, + "learning_rate": 0.0001, + "loss": 4.8462, + "loss/crossentropy": 1.6917370557785034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24290545284748077, + "step": 2790 + }, + { + "epoch": 0.05584, + "grad_norm": 2.765625, + "grad_norm_var": 0.04096577962239583, + "learning_rate": 0.0001, + "loss": 4.6942, + "loss/crossentropy": 1.7883749604225159, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2554834187030792, + "step": 2792 + }, + { + "epoch": 0.05588, + "grad_norm": 2.765625, + "grad_norm_var": 0.04560445149739583, + "learning_rate": 0.0001, + "loss": 4.6835, + "loss/crossentropy": 1.867136001586914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2667583078145981, + "step": 2794 + }, + { + "epoch": 0.05592, + "grad_norm": 2.546875, + "grad_norm_var": 0.04143473307291667, + "learning_rate": 0.0001, + "loss": 4.9686, + "loss/crossentropy": 2.032800853252411, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2580890506505966, + "step": 2796 + }, + { + "epoch": 0.05596, + "grad_norm": 3.03125, + "grad_norm_var": 0.04389546712239583, + "learning_rate": 0.0001, + "loss": 5.018, + "loss/crossentropy": 1.9867863655090332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2694792151451111, + "step": 2798 + }, + { + "epoch": 0.056, + "grad_norm": 2.59375, + "grad_norm_var": 0.026786295572916667, + "learning_rate": 0.0001, + "loss": 4.9137, + "loss/crossentropy": 2.1026757955551147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2664051130414009, + "step": 2800 + }, + { + "epoch": 0.05604, + "grad_norm": 2.65625, + "grad_norm_var": 0.0259918212890625, + "learning_rate": 0.0001, + "loss": 4.9883, + "loss/crossentropy": 2.0649160742759705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2473129704594612, + "step": 2802 + }, + { + "epoch": 0.05608, + "grad_norm": 2.921875, + "grad_norm_var": 0.030451456705729168, + "learning_rate": 0.0001, + "loss": 4.8959, + "loss/crossentropy": 2.2110248804092407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27131715416908264, + "step": 2804 + }, + { + "epoch": 0.05612, + "grad_norm": 2.65625, + "grad_norm_var": 0.032933553059895836, + "learning_rate": 0.0001, + "loss": 4.9603, + "loss/crossentropy": 2.3483060598373413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27071496844291687, + "step": 2806 + }, + { + "epoch": 0.05616, + "grad_norm": 2.640625, + "grad_norm_var": 0.03277587890625, + "learning_rate": 0.0001, + "loss": 4.9273, + "loss/crossentropy": 1.9061944484710693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2550426125526428, + "step": 2808 + }, + { + "epoch": 0.0562, + "grad_norm": 2.671875, + "grad_norm_var": 0.0278472900390625, + "learning_rate": 0.0001, + "loss": 5.1734, + "loss/crossentropy": 2.3073103427886963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30051568150520325, + "step": 2810 + }, + { + "epoch": 0.05624, + "grad_norm": 2.671875, + "grad_norm_var": 0.027106730143229167, + "learning_rate": 0.0001, + "loss": 5.0268, + "loss/crossentropy": 2.393891453742981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29226459562778473, + "step": 2812 + }, + { + "epoch": 0.05628, + "grad_norm": 2.484375, + "grad_norm_var": 0.018578084309895833, + "learning_rate": 0.0001, + "loss": 5.0839, + "loss/crossentropy": 2.3082761764526367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29539754986763, + "step": 2814 + }, + { + "epoch": 0.05632, + "grad_norm": 2.515625, + "grad_norm_var": 0.021776326497395835, + "learning_rate": 0.0001, + "loss": 4.8973, + "loss/crossentropy": 2.7815494537353516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33258646726608276, + "step": 2816 + }, + { + "epoch": 0.05636, + "grad_norm": 2.390625, + "grad_norm_var": 0.031538899739583334, + "learning_rate": 0.0001, + "loss": 4.7095, + "loss/crossentropy": 2.077984571456909, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.261022225022316, + "step": 2818 + }, + { + "epoch": 0.0564, + "grad_norm": 2.484375, + "grad_norm_var": 0.0251617431640625, + "learning_rate": 0.0001, + "loss": 4.6389, + "loss/crossentropy": 2.0524688363075256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2936979830265045, + "step": 2820 + }, + { + "epoch": 0.05644, + "grad_norm": 2.5, + "grad_norm_var": 0.018212890625, + "learning_rate": 0.0001, + "loss": 4.9657, + "loss/crossentropy": 1.8323140740394592, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26426824927330017, + "step": 2822 + }, + { + "epoch": 0.05648, + "grad_norm": 2.765625, + "grad_norm_var": 0.0201171875, + "learning_rate": 0.0001, + "loss": 4.9513, + "loss/crossentropy": 2.2290462255477905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28852027654647827, + "step": 2824 + }, + { + "epoch": 0.05652, + "grad_norm": 2.5625, + "grad_norm_var": 0.016600545247395834, + "learning_rate": 0.0001, + "loss": 4.7828, + "loss/crossentropy": 1.8788678050041199, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24312467128038406, + "step": 2826 + }, + { + "epoch": 0.05656, + "grad_norm": 2.46875, + "grad_norm_var": 0.01646728515625, + "learning_rate": 0.0001, + "loss": 4.7882, + "loss/crossentropy": 1.9402090311050415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25435876101255417, + "step": 2828 + }, + { + "epoch": 0.0566, + "grad_norm": 2.390625, + "grad_norm_var": 0.0144683837890625, + "learning_rate": 0.0001, + "loss": 4.5823, + "loss/crossentropy": 2.4833847284317017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27234241366386414, + "step": 2830 + }, + { + "epoch": 0.05664, + "grad_norm": 2.515625, + "grad_norm_var": 0.0154296875, + "learning_rate": 0.0001, + "loss": 5.0521, + "loss/crossentropy": 2.351561665534973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28028184175491333, + "step": 2832 + }, + { + "epoch": 0.05668, + "grad_norm": 2.953125, + "grad_norm_var": 0.019722493489583333, + "learning_rate": 0.0001, + "loss": 5.0688, + "loss/crossentropy": 2.0372352600097656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26005299389362335, + "step": 2834 + }, + { + "epoch": 0.05672, + "grad_norm": 2.828125, + "grad_norm_var": 0.021744791666666666, + "learning_rate": 0.0001, + "loss": 4.4282, + "loss/crossentropy": 1.6522082090377808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26683834940195084, + "step": 2836 + }, + { + "epoch": 0.05676, + "grad_norm": 2.390625, + "grad_norm_var": 0.023778279622395832, + "learning_rate": 0.0001, + "loss": 4.7165, + "loss/crossentropy": 1.8886643052101135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23464814573526382, + "step": 2838 + }, + { + "epoch": 0.0568, + "grad_norm": 2.734375, + "grad_norm_var": 0.023119099934895835, + "learning_rate": 0.0001, + "loss": 4.9477, + "loss/crossentropy": 2.2161459922790527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29381541907787323, + "step": 2840 + }, + { + "epoch": 0.05684, + "grad_norm": 2.421875, + "grad_norm_var": 0.024779256184895834, + "learning_rate": 0.0001, + "loss": 4.7408, + "loss/crossentropy": 2.1542043685913086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2753119468688965, + "step": 2842 + }, + { + "epoch": 0.05688, + "grad_norm": 2.78125, + "grad_norm_var": 0.026090494791666665, + "learning_rate": 0.0001, + "loss": 5.1288, + "loss/crossentropy": 2.5605628490448, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.329460546374321, + "step": 2844 + }, + { + "epoch": 0.05692, + "grad_norm": 2.609375, + "grad_norm_var": 0.023224894205729166, + "learning_rate": 0.0001, + "loss": 4.9955, + "loss/crossentropy": 2.1000319719314575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.273986279964447, + "step": 2846 + }, + { + "epoch": 0.05696, + "grad_norm": 2.484375, + "grad_norm_var": 0.023558553059895834, + "learning_rate": 0.0001, + "loss": 4.6813, + "loss/crossentropy": 2.1036760210990906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26293954253196716, + "step": 2848 + }, + { + "epoch": 0.057, + "grad_norm": 2.640625, + "grad_norm_var": 0.014286295572916666, + "learning_rate": 0.0001, + "loss": 4.9799, + "loss/crossentropy": 2.2130608558654785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2977828085422516, + "step": 2850 + }, + { + "epoch": 0.05704, + "grad_norm": 2.578125, + "grad_norm_var": 0.01461181640625, + "learning_rate": 0.0001, + "loss": 5.1761, + "loss/crossentropy": 2.1878823041915894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27322643995285034, + "step": 2852 + }, + { + "epoch": 0.05708, + "grad_norm": 2.703125, + "grad_norm_var": 0.012547810872395834, + "learning_rate": 0.0001, + "loss": 4.9448, + "loss/crossentropy": 2.1559258699417114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32625503838062286, + "step": 2854 + }, + { + "epoch": 0.05712, + "grad_norm": 2.390625, + "grad_norm_var": 0.015848795572916668, + "learning_rate": 0.0001, + "loss": 4.4611, + "loss/crossentropy": 2.0818992257118225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2677721679210663, + "step": 2856 + }, + { + "epoch": 0.05716, + "grad_norm": 2.578125, + "grad_norm_var": 0.016243489583333333, + "learning_rate": 0.0001, + "loss": 5.2755, + "loss/crossentropy": 2.544907331466675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30028045177459717, + "step": 2858 + }, + { + "epoch": 0.0572, + "grad_norm": 3.203125, + "grad_norm_var": 0.04491780598958333, + "learning_rate": 0.0001, + "loss": 5.2855, + "loss/crossentropy": 2.5932188034057617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30431173741817474, + "step": 2860 + }, + { + "epoch": 0.05724, + "grad_norm": 2.453125, + "grad_norm_var": 0.046219889322916666, + "learning_rate": 0.0001, + "loss": 4.9799, + "loss/crossentropy": 2.1007986068725586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2989690601825714, + "step": 2862 + }, + { + "epoch": 0.05728, + "grad_norm": 2.5625, + "grad_norm_var": 0.044169108072916664, + "learning_rate": 0.0001, + "loss": 4.7706, + "loss/crossentropy": 2.0102875232696533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26936179399490356, + "step": 2864 + }, + { + "epoch": 0.05732, + "grad_norm": 3.109375, + "grad_norm_var": 0.056559244791666664, + "learning_rate": 0.0001, + "loss": 5.2099, + "loss/crossentropy": 2.3457159996032715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29194609820842743, + "step": 2866 + }, + { + "epoch": 0.05736, + "grad_norm": 2.625, + "grad_norm_var": 0.055582682291666664, + "learning_rate": 0.0001, + "loss": 4.8768, + "loss/crossentropy": 2.559054732322693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3018783777952194, + "step": 2868 + }, + { + "epoch": 0.0574, + "grad_norm": 2.703125, + "grad_norm_var": 0.05831705729166667, + "learning_rate": 0.0001, + "loss": 4.9863, + "loss/crossentropy": 2.0641059279441833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2789234220981598, + "step": 2870 + }, + { + "epoch": 0.05744, + "grad_norm": 2.734375, + "grad_norm_var": 0.04683837890625, + "learning_rate": 0.0001, + "loss": 4.9222, + "loss/crossentropy": 2.0819749236106873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26825186610221863, + "step": 2872 + }, + { + "epoch": 0.05748, + "grad_norm": 2.703125, + "grad_norm_var": 0.046263631184895834, + "learning_rate": 0.0001, + "loss": 4.9586, + "loss/crossentropy": 2.1114020347595215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2613547742366791, + "step": 2874 + }, + { + "epoch": 0.05752, + "grad_norm": 2.4375, + "grad_norm_var": 0.02916259765625, + "learning_rate": 0.0001, + "loss": 4.9474, + "loss/crossentropy": 2.174915075302124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29503974318504333, + "step": 2876 + }, + { + "epoch": 0.05756, + "grad_norm": 2.6875, + "grad_norm_var": 0.0265289306640625, + "learning_rate": 0.0001, + "loss": 5.0541, + "loss/crossentropy": 2.4342113733291626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30623678863048553, + "step": 2878 + }, + { + "epoch": 0.0576, + "grad_norm": 2.703125, + "grad_norm_var": 0.024918619791666666, + "learning_rate": 0.0001, + "loss": 5.0316, + "loss/crossentropy": 2.0518307089805603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28422991931438446, + "step": 2880 + }, + { + "epoch": 0.05764, + "grad_norm": 2.6875, + "grad_norm_var": 0.011921183268229166, + "learning_rate": 0.0001, + "loss": 5.0806, + "loss/crossentropy": 2.5378612279891968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29730531573295593, + "step": 2882 + }, + { + "epoch": 0.05768, + "grad_norm": 2.5, + "grad_norm_var": 0.014972941080729166, + "learning_rate": 0.0001, + "loss": 4.8685, + "loss/crossentropy": 2.2670027017593384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26347288489341736, + "step": 2884 + }, + { + "epoch": 0.05772, + "grad_norm": 3.671875, + "grad_norm_var": 0.07822265625, + "learning_rate": 0.0001, + "loss": 5.0385, + "loss/crossentropy": 2.351823568344116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3020637035369873, + "step": 2886 + }, + { + "epoch": 0.05776, + "grad_norm": 2.65625, + "grad_norm_var": 0.0792633056640625, + "learning_rate": 0.0001, + "loss": 4.9699, + "loss/crossentropy": 2.190839111804962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2639586254954338, + "step": 2888 + }, + { + "epoch": 0.0578, + "grad_norm": 2.3125, + "grad_norm_var": 0.08765869140625, + "learning_rate": 0.0001, + "loss": 4.6882, + "loss/crossentropy": 2.148400902748108, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27762140333652496, + "step": 2890 + }, + { + "epoch": 0.05784, + "grad_norm": 2.984375, + "grad_norm_var": 0.09696858723958333, + "learning_rate": 0.0001, + "loss": 5.1033, + "loss/crossentropy": 2.01130074262619, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2678648605942726, + "step": 2892 + }, + { + "epoch": 0.05788, + "grad_norm": 2.75, + "grad_norm_var": 0.09986572265625, + "learning_rate": 0.0001, + "loss": 5.0829, + "loss/crossentropy": 2.2239269018173218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2971164286136627, + "step": 2894 + }, + { + "epoch": 0.05792, + "grad_norm": 2.328125, + "grad_norm_var": 0.10974934895833334, + "learning_rate": 0.0001, + "loss": 4.48, + "loss/crossentropy": 1.9573850631713867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.241651751101017, + "step": 2896 + }, + { + "epoch": 0.05796, + "grad_norm": 2.5, + "grad_norm_var": 0.11169331868489583, + "learning_rate": 0.0001, + "loss": 4.8806, + "loss/crossentropy": 1.9522782564163208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2685594707727432, + "step": 2898 + }, + { + "epoch": 0.058, + "grad_norm": 2.671875, + "grad_norm_var": 0.10886128743489583, + "learning_rate": 0.0001, + "loss": 4.9335, + "loss/crossentropy": 2.1742069721221924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27867090702056885, + "step": 2900 + }, + { + "epoch": 0.05804, + "grad_norm": 5.15625, + "grad_norm_var": 0.44882405598958336, + "learning_rate": 0.0001, + "loss": 4.9164, + "loss/crossentropy": 2.201782703399658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2731190174818039, + "step": 2902 + }, + { + "epoch": 0.05808, + "grad_norm": 2.734375, + "grad_norm_var": 0.4452301025390625, + "learning_rate": 0.0001, + "loss": 4.9787, + "loss/crossentropy": 2.159709095954895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2940017879009247, + "step": 2904 + }, + { + "epoch": 0.05812, + "grad_norm": 2.71875, + "grad_norm_var": 0.4297271728515625, + "learning_rate": 0.0001, + "loss": 4.9192, + "loss/crossentropy": 2.1989885568618774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2776087671518326, + "step": 2906 + }, + { + "epoch": 0.05816, + "grad_norm": 2.546875, + "grad_norm_var": 0.42688395182291666, + "learning_rate": 0.0001, + "loss": 4.8851, + "loss/crossentropy": 1.9634575247764587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27917809784412384, + "step": 2908 + }, + { + "epoch": 0.0582, + "grad_norm": 2.390625, + "grad_norm_var": 0.43651936848958334, + "learning_rate": 0.0001, + "loss": 4.7776, + "loss/crossentropy": 2.1553479433059692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28876082599163055, + "step": 2910 + }, + { + "epoch": 0.05824, + "grad_norm": 2.515625, + "grad_norm_var": 0.42451883951822916, + "learning_rate": 0.0001, + "loss": 4.881, + "loss/crossentropy": 2.1035598516464233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2851293236017227, + "step": 2912 + }, + { + "epoch": 0.05828, + "grad_norm": 2.890625, + "grad_norm_var": 0.42097880045572916, + "learning_rate": 0.0001, + "loss": 5.2733, + "loss/crossentropy": 2.17076575756073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28258734941482544, + "step": 2914 + }, + { + "epoch": 0.05832, + "grad_norm": 2.484375, + "grad_norm_var": 0.42477213541666664, + "learning_rate": 0.0001, + "loss": 4.9147, + "loss/crossentropy": 2.2914711236953735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29451698064804077, + "step": 2916 + }, + { + "epoch": 0.05836, + "grad_norm": 2.703125, + "grad_norm_var": 0.021410115559895835, + "learning_rate": 0.0001, + "loss": 5.1395, + "loss/crossentropy": 2.554638981819153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30992935597896576, + "step": 2918 + }, + { + "epoch": 0.0584, + "grad_norm": 2.546875, + "grad_norm_var": 0.05735575358072917, + "learning_rate": 0.0001, + "loss": 4.6262, + "loss/crossentropy": 1.808376431465149, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22574126720428467, + "step": 2920 + }, + { + "epoch": 0.05844, + "grad_norm": 2.78125, + "grad_norm_var": 0.05660400390625, + "learning_rate": 0.0001, + "loss": 5.1636, + "loss/crossentropy": 2.1471784114837646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27325738966464996, + "step": 2922 + }, + { + "epoch": 0.05848, + "grad_norm": 2.765625, + "grad_norm_var": 0.054182942708333334, + "learning_rate": 0.0001, + "loss": 5.071, + "loss/crossentropy": 2.2175731658935547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3013547956943512, + "step": 2924 + }, + { + "epoch": 0.05852, + "grad_norm": 2.953125, + "grad_norm_var": 0.04949544270833333, + "learning_rate": 0.0001, + "loss": 5.467, + "loss/crossentropy": 2.369373917579651, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29249751567840576, + "step": 2926 + }, + { + "epoch": 0.05856, + "grad_norm": 2.453125, + "grad_norm_var": 0.054915364583333334, + "learning_rate": 0.0001, + "loss": 4.8778, + "loss/crossentropy": 2.1758522987365723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28846102952957153, + "step": 2928 + }, + { + "epoch": 0.0586, + "grad_norm": 2.546875, + "grad_norm_var": 0.05413411458333333, + "learning_rate": 0.0001, + "loss": 4.929, + "loss/crossentropy": 2.46218478679657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2824682295322418, + "step": 2930 + }, + { + "epoch": 0.05864, + "grad_norm": 2.40625, + "grad_norm_var": 0.0606597900390625, + "learning_rate": 0.0001, + "loss": 4.5557, + "loss/crossentropy": 2.058937907218933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26767465472221375, + "step": 2932 + }, + { + "epoch": 0.05868, + "grad_norm": 2.546875, + "grad_norm_var": 0.06109619140625, + "learning_rate": 0.0001, + "loss": 4.9103, + "loss/crossentropy": 2.312442421913147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2952658236026764, + "step": 2934 + }, + { + "epoch": 0.05872, + "grad_norm": 2.71875, + "grad_norm_var": 0.026911417643229168, + "learning_rate": 0.0001, + "loss": 5.1452, + "loss/crossentropy": 2.18839955329895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31788623332977295, + "step": 2936 + }, + { + "epoch": 0.05876, + "grad_norm": 2.671875, + "grad_norm_var": 0.025537109375, + "learning_rate": 0.0001, + "loss": 5.1358, + "loss/crossentropy": 2.1330811977386475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28829698264598846, + "step": 2938 + }, + { + "epoch": 0.0588, + "grad_norm": 2.9375, + "grad_norm_var": 0.0294586181640625, + "learning_rate": 0.0001, + "loss": 5.1071, + "loss/crossentropy": 2.124038338661194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2708826810121536, + "step": 2940 + }, + { + "epoch": 0.05884, + "grad_norm": 2.53125, + "grad_norm_var": 0.024312337239583332, + "learning_rate": 0.0001, + "loss": 4.8479, + "loss/crossentropy": 2.1934449076652527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27314090728759766, + "step": 2942 + }, + { + "epoch": 0.05888, + "grad_norm": 2.6875, + "grad_norm_var": 0.024535115559895834, + "learning_rate": 0.0001, + "loss": 4.9904, + "loss/crossentropy": 1.967636525630951, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23761005699634552, + "step": 2944 + }, + { + "epoch": 0.05892, + "grad_norm": 3.015625, + "grad_norm_var": 0.034821573893229166, + "learning_rate": 0.0001, + "loss": 4.9214, + "loss/crossentropy": 2.2380826473236084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3402934819459915, + "step": 2946 + }, + { + "epoch": 0.05896, + "grad_norm": 2.5625, + "grad_norm_var": 0.0299468994140625, + "learning_rate": 0.0001, + "loss": 4.7702, + "loss/crossentropy": 2.227039933204651, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27934837341308594, + "step": 2948 + }, + { + "epoch": 0.059, + "grad_norm": 2.6875, + "grad_norm_var": 0.03258056640625, + "learning_rate": 0.0001, + "loss": 4.789, + "loss/crossentropy": 1.999170958995819, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.275806725025177, + "step": 2950 + }, + { + "epoch": 0.05904, + "grad_norm": 2.546875, + "grad_norm_var": 0.03241780598958333, + "learning_rate": 0.0001, + "loss": 4.5754, + "loss/crossentropy": 1.8843520879745483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26203446090221405, + "step": 2952 + }, + { + "epoch": 0.05908, + "grad_norm": 2.5, + "grad_norm_var": 0.033426920572916664, + "learning_rate": 0.0001, + "loss": 4.5485, + "loss/crossentropy": 2.0742241740226746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28777699172496796, + "step": 2954 + }, + { + "epoch": 0.05912, + "grad_norm": 2.6875, + "grad_norm_var": 0.028206380208333333, + "learning_rate": 0.0001, + "loss": 4.9676, + "loss/crossentropy": 2.2272751331329346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27761510014533997, + "step": 2956 + }, + { + "epoch": 0.05916, + "grad_norm": 2.734375, + "grad_norm_var": 0.029878743489583335, + "learning_rate": 0.0001, + "loss": 4.6329, + "loss/crossentropy": 2.2758638858795166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28144824504852295, + "step": 2958 + }, + { + "epoch": 0.0592, + "grad_norm": 2.46875, + "grad_norm_var": 0.060530598958333334, + "learning_rate": 0.0001, + "loss": 4.9543, + "loss/crossentropy": 2.245158016681671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3030128926038742, + "step": 2960 + }, + { + "epoch": 0.05924, + "grad_norm": 2.453125, + "grad_norm_var": 0.053498331705729166, + "learning_rate": 0.0001, + "loss": 4.567, + "loss/crossentropy": 2.3337708711624146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29228493571281433, + "step": 2962 + }, + { + "epoch": 0.05928, + "grad_norm": 2.5, + "grad_norm_var": 0.05788472493489583, + "learning_rate": 0.0001, + "loss": 4.6477, + "loss/crossentropy": 2.3765406608581543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2715871036052704, + "step": 2964 + }, + { + "epoch": 0.05932, + "grad_norm": 2.375, + "grad_norm_var": 0.0562652587890625, + "learning_rate": 0.0001, + "loss": 4.8649, + "loss/crossentropy": 2.090362787246704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2505037933588028, + "step": 2966 + }, + { + "epoch": 0.05936, + "grad_norm": 2.453125, + "grad_norm_var": 0.056396484375, + "learning_rate": 0.0001, + "loss": 4.8669, + "loss/crossentropy": 2.011539399623871, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27219071984291077, + "step": 2968 + }, + { + "epoch": 0.0594, + "grad_norm": 2.734375, + "grad_norm_var": 0.05693359375, + "learning_rate": 0.0001, + "loss": 5.2081, + "loss/crossentropy": 2.1791563034057617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2796429842710495, + "step": 2970 + }, + { + "epoch": 0.05944, + "grad_norm": 2.65625, + "grad_norm_var": 0.05869852701822917, + "learning_rate": 0.0001, + "loss": 5.2689, + "loss/crossentropy": 2.4645297527313232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3106658458709717, + "step": 2972 + }, + { + "epoch": 0.05948, + "grad_norm": 7.0, + "grad_norm_var": 1.25670166015625, + "learning_rate": 0.0001, + "loss": 5.0715, + "loss/crossentropy": 2.2050880193710327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26860976219177246, + "step": 2974 + }, + { + "epoch": 0.05952, + "grad_norm": 2.828125, + "grad_norm_var": 1.236034138997396, + "learning_rate": 0.0001, + "loss": 4.5815, + "loss/crossentropy": 2.0141921639442444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27980539202690125, + "step": 2976 + }, + { + "epoch": 0.05956, + "grad_norm": 4.4375, + "grad_norm_var": 1.3665924072265625, + "learning_rate": 0.0001, + "loss": 4.8067, + "loss/crossentropy": 1.9399088025093079, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26379524916410446, + "step": 2978 + }, + { + "epoch": 0.0596, + "grad_norm": 2.796875, + "grad_norm_var": 1.3243886311848958, + "learning_rate": 0.0001, + "loss": 5.1743, + "loss/crossentropy": 2.5415157079696655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.291620597243309, + "step": 2980 + }, + { + "epoch": 0.05964, + "grad_norm": 2.5625, + "grad_norm_var": 1.3170562744140626, + "learning_rate": 0.0001, + "loss": 5.0695, + "loss/crossentropy": 2.1215542554855347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26189403235912323, + "step": 2982 + }, + { + "epoch": 0.05968, + "grad_norm": 2.578125, + "grad_norm_var": 1.32301025390625, + "learning_rate": 0.0001, + "loss": 5.0049, + "loss/crossentropy": 1.7749422788619995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27546317875385284, + "step": 2984 + }, + { + "epoch": 0.05972, + "grad_norm": 2.59375, + "grad_norm_var": 1.3371490478515624, + "learning_rate": 0.0001, + "loss": 4.8331, + "loss/crossentropy": 2.3383208513259888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2790983319282532, + "step": 2986 + }, + { + "epoch": 0.05976, + "grad_norm": 2.5625, + "grad_norm_var": 1.35230712890625, + "learning_rate": 0.0001, + "loss": 4.8656, + "loss/crossentropy": 2.3688780069351196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2752939611673355, + "step": 2988 + }, + { + "epoch": 0.0598, + "grad_norm": 2.6875, + "grad_norm_var": 0.22967020670572916, + "learning_rate": 0.0001, + "loss": 4.7299, + "loss/crossentropy": 2.261468529701233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2725592106580734, + "step": 2990 + }, + { + "epoch": 0.05984, + "grad_norm": 2.671875, + "grad_norm_var": 0.2226226806640625, + "learning_rate": 0.0001, + "loss": 4.7417, + "loss/crossentropy": 2.0632028579711914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2617062032222748, + "step": 2992 + }, + { + "epoch": 0.05988, + "grad_norm": 2.25, + "grad_norm_var": 0.022151692708333334, + "learning_rate": 0.0001, + "loss": 4.4282, + "loss/crossentropy": 2.1409813165664673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2657589614391327, + "step": 2994 + }, + { + "epoch": 0.05992, + "grad_norm": 2.921875, + "grad_norm_var": 0.0258941650390625, + "learning_rate": 0.0001, + "loss": 5.1816, + "loss/crossentropy": 2.431404948234558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3087555915117264, + "step": 2996 + }, + { + "epoch": 0.05996, + "grad_norm": 2.390625, + "grad_norm_var": 0.029743448893229166, + "learning_rate": 0.0001, + "loss": 4.8253, + "loss/crossentropy": 2.165238618850708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29382775723934174, + "step": 2998 + }, + { + "epoch": 0.06, + "grad_norm": 2.546875, + "grad_norm_var": 0.034566243489583336, + "learning_rate": 0.0001, + "loss": 4.6328, + "loss/crossentropy": 1.9987847208976746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24466252326965332, + "step": 3000 + }, + { + "epoch": 0.06004, + "grad_norm": 2.203125, + "grad_norm_var": 0.04084370930989583, + "learning_rate": 0.0001, + "loss": 4.5474, + "loss/crossentropy": 2.1153565049171448, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2435970976948738, + "step": 3002 + }, + { + "epoch": 0.06008, + "grad_norm": 2.3125, + "grad_norm_var": 0.046402994791666666, + "learning_rate": 0.0001, + "loss": 4.7765, + "loss/crossentropy": 1.8660866618156433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24413185566663742, + "step": 3004 + }, + { + "epoch": 0.06012, + "grad_norm": 2.40625, + "grad_norm_var": 0.04309794108072917, + "learning_rate": 0.0001, + "loss": 4.7476, + "loss/crossentropy": 2.1816678047180176, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28010787069797516, + "step": 3006 + }, + { + "epoch": 0.06016, + "grad_norm": 2.53125, + "grad_norm_var": 0.040827433268229164, + "learning_rate": 0.0001, + "loss": 4.5995, + "loss/crossentropy": 2.1673622131347656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2659531533718109, + "step": 3008 + }, + { + "epoch": 0.0602, + "grad_norm": 2.78125, + "grad_norm_var": 0.040913899739583336, + "learning_rate": 0.0001, + "loss": 5.2466, + "loss/crossentropy": 2.277818202972412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2929365783929825, + "step": 3010 + }, + { + "epoch": 0.06024, + "grad_norm": 2.65625, + "grad_norm_var": 0.0284576416015625, + "learning_rate": 0.0001, + "loss": 4.8904, + "loss/crossentropy": 2.294926404953003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2753874659538269, + "step": 3012 + }, + { + "epoch": 0.06028, + "grad_norm": 2.359375, + "grad_norm_var": 0.028173828125, + "learning_rate": 0.0001, + "loss": 4.735, + "loss/crossentropy": 2.0222257375717163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27295801043510437, + "step": 3014 + }, + { + "epoch": 0.06032, + "grad_norm": 2.484375, + "grad_norm_var": 0.023628743489583333, + "learning_rate": 0.0001, + "loss": 4.818, + "loss/crossentropy": 2.43736469745636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2934436649084091, + "step": 3016 + }, + { + "epoch": 0.06036, + "grad_norm": 2.703125, + "grad_norm_var": 0.020426432291666668, + "learning_rate": 0.0001, + "loss": 5.0053, + "loss/crossentropy": 2.027767241001129, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26353244483470917, + "step": 3018 + }, + { + "epoch": 0.0604, + "grad_norm": 2.59375, + "grad_norm_var": 0.015034993489583334, + "learning_rate": 0.0001, + "loss": 4.9059, + "loss/crossentropy": 2.150290071964264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27290941774845123, + "step": 3020 + }, + { + "epoch": 0.06044, + "grad_norm": 2.421875, + "grad_norm_var": 0.014697265625, + "learning_rate": 0.0001, + "loss": 4.8307, + "loss/crossentropy": 1.8709848523139954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25272610783576965, + "step": 3022 + }, + { + "epoch": 0.06048, + "grad_norm": 2.46875, + "grad_norm_var": 0.015949503580729166, + "learning_rate": 0.0001, + "loss": 5.0166, + "loss/crossentropy": 2.2170883417129517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29401010274887085, + "step": 3024 + }, + { + "epoch": 0.06052, + "grad_norm": 2.53125, + "grad_norm_var": 0.0109375, + "learning_rate": 0.0001, + "loss": 4.6221, + "loss/crossentropy": 1.728318691253662, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24246910959482193, + "step": 3026 + }, + { + "epoch": 0.06056, + "grad_norm": 2.34375, + "grad_norm_var": 0.0095123291015625, + "learning_rate": 0.0001, + "loss": 4.5597, + "loss/crossentropy": 1.8453290462493896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24357211589813232, + "step": 3028 + }, + { + "epoch": 0.0606, + "grad_norm": 2.640625, + "grad_norm_var": 0.010456339518229166, + "learning_rate": 0.0001, + "loss": 4.6617, + "loss/crossentropy": 2.273196220397949, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29239149391651154, + "step": 3030 + }, + { + "epoch": 0.06064, + "grad_norm": 2.484375, + "grad_norm_var": 0.012165323893229166, + "learning_rate": 0.0001, + "loss": 4.9464, + "loss/crossentropy": 2.031871259212494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2604901194572449, + "step": 3032 + }, + { + "epoch": 0.06068, + "grad_norm": 2.515625, + "grad_norm_var": 0.008915201822916666, + "learning_rate": 0.0001, + "loss": 5.1618, + "loss/crossentropy": 2.1781771183013916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2853371948003769, + "step": 3034 + }, + { + "epoch": 0.06072, + "grad_norm": 2.484375, + "grad_norm_var": 0.006590779622395833, + "learning_rate": 0.0001, + "loss": 4.816, + "loss/crossentropy": 1.9578353762626648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23352904617786407, + "step": 3036 + }, + { + "epoch": 0.06076, + "grad_norm": 2.671875, + "grad_norm_var": 0.012516276041666666, + "learning_rate": 0.0001, + "loss": 5.3012, + "loss/crossentropy": 2.470985770225525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28890977799892426, + "step": 3038 + }, + { + "epoch": 0.0608, + "grad_norm": 2.65625, + "grad_norm_var": 0.013361612955729166, + "learning_rate": 0.0001, + "loss": 4.9243, + "loss/crossentropy": 2.056231141090393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2963344603776932, + "step": 3040 + }, + { + "epoch": 0.06084, + "grad_norm": 2.609375, + "grad_norm_var": 0.013646443684895834, + "learning_rate": 0.0001, + "loss": 4.8612, + "loss/crossentropy": 2.000037968158722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27152783423662186, + "step": 3042 + }, + { + "epoch": 0.06088, + "grad_norm": 2.59375, + "grad_norm_var": 0.0111480712890625, + "learning_rate": 0.0001, + "loss": 4.8576, + "loss/crossentropy": 2.323809027671814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29995349049568176, + "step": 3044 + }, + { + "epoch": 0.06092, + "grad_norm": 2.4375, + "grad_norm_var": 0.010407511393229167, + "learning_rate": 0.0001, + "loss": 5.0094, + "loss/crossentropy": 2.06933856010437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26831966638565063, + "step": 3046 + }, + { + "epoch": 0.06096, + "grad_norm": 2.53125, + "grad_norm_var": 0.008463541666666666, + "learning_rate": 0.0001, + "loss": 5.042, + "loss/crossentropy": 2.1903880834579468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2833031117916107, + "step": 3048 + }, + { + "epoch": 0.061, + "grad_norm": 2.46875, + "grad_norm_var": 0.008893839518229167, + "learning_rate": 0.0001, + "loss": 4.9557, + "loss/crossentropy": 1.910680890083313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2708408683538437, + "step": 3050 + }, + { + "epoch": 0.06104, + "grad_norm": 2.546875, + "grad_norm_var": 0.0083404541015625, + "learning_rate": 0.0001, + "loss": 4.8902, + "loss/crossentropy": 2.5203059911727905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2981104403734207, + "step": 3052 + }, + { + "epoch": 0.06108, + "grad_norm": 2.5, + "grad_norm_var": 0.00611572265625, + "learning_rate": 0.0001, + "loss": 4.68, + "loss/crossentropy": 1.7918179035186768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2462354451417923, + "step": 3054 + }, + { + "epoch": 0.06112, + "grad_norm": 2.65625, + "grad_norm_var": 0.006441243489583333, + "learning_rate": 0.0001, + "loss": 4.9908, + "loss/crossentropy": 2.185121774673462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3046490252017975, + "step": 3056 + }, + { + "epoch": 0.06116, + "grad_norm": 2.28125, + "grad_norm_var": 0.01064453125, + "learning_rate": 0.0001, + "loss": 4.8041, + "loss/crossentropy": 1.859390914440155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2523205131292343, + "step": 3058 + }, + { + "epoch": 0.0612, + "grad_norm": 2.40625, + "grad_norm_var": 0.011930338541666667, + "learning_rate": 0.0001, + "loss": 4.7388, + "loss/crossentropy": 1.9202255606651306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26971636712551117, + "step": 3060 + }, + { + "epoch": 0.06124, + "grad_norm": 2.578125, + "grad_norm_var": 0.011481730143229167, + "learning_rate": 0.0001, + "loss": 4.7421, + "loss/crossentropy": 2.059127449989319, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25721821188926697, + "step": 3062 + }, + { + "epoch": 0.06128, + "grad_norm": 2.6875, + "grad_norm_var": 0.0154205322265625, + "learning_rate": 0.0001, + "loss": 5.1392, + "loss/crossentropy": 2.3587669134140015, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30162671208381653, + "step": 3064 + }, + { + "epoch": 0.06132, + "grad_norm": 2.59375, + "grad_norm_var": 0.018050130208333334, + "learning_rate": 0.0001, + "loss": 5.2748, + "loss/crossentropy": 2.239704966545105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2713697552680969, + "step": 3066 + }, + { + "epoch": 0.06136, + "grad_norm": 2.796875, + "grad_norm_var": 0.02076416015625, + "learning_rate": 0.0001, + "loss": 5.0534, + "loss/crossentropy": 2.230944514274597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27418144047260284, + "step": 3068 + }, + { + "epoch": 0.0614, + "grad_norm": 2.515625, + "grad_norm_var": 0.020563761393229168, + "learning_rate": 0.0001, + "loss": 4.9145, + "loss/crossentropy": 2.196586310863495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24946682900190353, + "step": 3070 + }, + { + "epoch": 0.06144, + "grad_norm": 2.65625, + "grad_norm_var": 0.019873046875, + "learning_rate": 0.0001, + "loss": 5.058, + "loss/crossentropy": 2.115275800228119, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2668849602341652, + "step": 3072 + }, + { + "epoch": 0.06148, + "grad_norm": 2.765625, + "grad_norm_var": 0.014286295572916666, + "learning_rate": 0.0001, + "loss": 5.1311, + "loss/crossentropy": 2.0227994322776794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2587117701768875, + "step": 3074 + }, + { + "epoch": 0.06152, + "grad_norm": 2.421875, + "grad_norm_var": 0.016022745768229166, + "learning_rate": 0.0001, + "loss": 4.711, + "loss/crossentropy": 2.231368660926819, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2621329501271248, + "step": 3076 + }, + { + "epoch": 0.06156, + "grad_norm": 2.421875, + "grad_norm_var": 0.0190093994140625, + "learning_rate": 0.0001, + "loss": 4.6707, + "loss/crossentropy": 2.469847083091736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2758704572916031, + "step": 3078 + }, + { + "epoch": 0.0616, + "grad_norm": 2.5, + "grad_norm_var": 0.01666259765625, + "learning_rate": 0.0001, + "loss": 4.5753, + "loss/crossentropy": 2.257850766181946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28196004033088684, + "step": 3080 + }, + { + "epoch": 0.06164, + "grad_norm": 2.46875, + "grad_norm_var": 0.014969889322916667, + "learning_rate": 0.0001, + "loss": 4.8783, + "loss/crossentropy": 2.52754008769989, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.301498681306839, + "step": 3082 + }, + { + "epoch": 0.06168, + "grad_norm": 2.484375, + "grad_norm_var": 0.012279256184895834, + "learning_rate": 0.0001, + "loss": 4.6595, + "loss/crossentropy": 2.120129644870758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30073782801628113, + "step": 3084 + }, + { + "epoch": 0.06172, + "grad_norm": 2.546875, + "grad_norm_var": 0.0231353759765625, + "learning_rate": 0.0001, + "loss": 5.017, + "loss/crossentropy": 2.1483632922172546, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29400819540023804, + "step": 3086 + }, + { + "epoch": 0.06176, + "grad_norm": 2.59375, + "grad_norm_var": 0.022297159830729166, + "learning_rate": 0.0001, + "loss": 5.2447, + "loss/crossentropy": 2.5588048696517944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3099561035633087, + "step": 3088 + }, + { + "epoch": 0.0618, + "grad_norm": 2.359375, + "grad_norm_var": 0.019852701822916666, + "learning_rate": 0.0001, + "loss": 4.5507, + "loss/crossentropy": 2.338167190551758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2638262137770653, + "step": 3090 + }, + { + "epoch": 0.06184, + "grad_norm": 2.5, + "grad_norm_var": 0.019603474934895834, + "learning_rate": 0.0001, + "loss": 5.0471, + "loss/crossentropy": 2.61361825466156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29420773684978485, + "step": 3092 + }, + { + "epoch": 0.06188, + "grad_norm": 2.5625, + "grad_norm_var": 0.019108072916666666, + "learning_rate": 0.0001, + "loss": 4.6931, + "loss/crossentropy": 2.1078842878341675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27283959090709686, + "step": 3094 + }, + { + "epoch": 0.06192, + "grad_norm": 2.296875, + "grad_norm_var": 0.023958333333333335, + "learning_rate": 0.0001, + "loss": 4.463, + "loss/crossentropy": 2.1501123905181885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3052368611097336, + "step": 3096 + }, + { + "epoch": 0.06196, + "grad_norm": 2.671875, + "grad_norm_var": 0.026656087239583334, + "learning_rate": 0.0001, + "loss": 5.0482, + "loss/crossentropy": 2.3622714281082153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2851791977882385, + "step": 3098 + }, + { + "epoch": 0.062, + "grad_norm": 2.671875, + "grad_norm_var": 0.05203450520833333, + "learning_rate": 0.0001, + "loss": 4.3506, + "loss/crossentropy": 1.9757406115531921, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24451570957899094, + "step": 3100 + }, + { + "epoch": 0.06204, + "grad_norm": 2.859375, + "grad_norm_var": 0.07011311848958333, + "learning_rate": 0.0001, + "loss": 5.0409, + "loss/crossentropy": 2.1915838718414307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2749434858560562, + "step": 3102 + }, + { + "epoch": 0.06208, + "grad_norm": 2.78125, + "grad_norm_var": 0.07214253743489583, + "learning_rate": 0.0001, + "loss": 4.8408, + "loss/crossentropy": 2.130649447441101, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2590179592370987, + "step": 3104 + }, + { + "epoch": 0.06212, + "grad_norm": 2.59375, + "grad_norm_var": 0.06317952473958334, + "learning_rate": 0.0001, + "loss": 4.9083, + "loss/crossentropy": 2.4478825330734253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30854369699954987, + "step": 3106 + }, + { + "epoch": 0.06216, + "grad_norm": 2.578125, + "grad_norm_var": 0.06524149576822917, + "learning_rate": 0.0001, + "loss": 4.8249, + "loss/crossentropy": 1.7108858227729797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22502756118774414, + "step": 3108 + }, + { + "epoch": 0.0622, + "grad_norm": 2.46875, + "grad_norm_var": 0.06788736979166667, + "learning_rate": 0.0001, + "loss": 4.5355, + "loss/crossentropy": 1.79804128408432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24767793715000153, + "step": 3110 + }, + { + "epoch": 0.06224, + "grad_norm": 3.28125, + "grad_norm_var": 0.07480061848958333, + "learning_rate": 0.0001, + "loss": 4.8779, + "loss/crossentropy": 1.899846076965332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23476070165634155, + "step": 3112 + }, + { + "epoch": 0.06228, + "grad_norm": 2.453125, + "grad_norm_var": 0.0790679931640625, + "learning_rate": 0.0001, + "loss": 4.7506, + "loss/crossentropy": 1.8458155393600464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24417708814144135, + "step": 3114 + }, + { + "epoch": 0.06232, + "grad_norm": 2.5, + "grad_norm_var": 0.06979166666666667, + "learning_rate": 0.0001, + "loss": 4.814, + "loss/crossentropy": 1.947302520275116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2566901594400406, + "step": 3116 + }, + { + "epoch": 0.06236, + "grad_norm": 2.546875, + "grad_norm_var": 0.046923828125, + "learning_rate": 0.0001, + "loss": 4.8424, + "loss/crossentropy": 2.039161205291748, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26955537497997284, + "step": 3118 + }, + { + "epoch": 0.0624, + "grad_norm": 2.65625, + "grad_norm_var": 0.04488016764322917, + "learning_rate": 0.0001, + "loss": 5.1656, + "loss/crossentropy": 2.1528661251068115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.291052982211113, + "step": 3120 + }, + { + "epoch": 0.06244, + "grad_norm": 2.640625, + "grad_norm_var": 0.04644775390625, + "learning_rate": 0.0001, + "loss": 4.7941, + "loss/crossentropy": 2.4113996028900146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2962254136800766, + "step": 3122 + }, + { + "epoch": 0.06248, + "grad_norm": 2.65625, + "grad_norm_var": 0.044722493489583334, + "learning_rate": 0.0001, + "loss": 4.8463, + "loss/crossentropy": 2.0718825459480286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24676042795181274, + "step": 3124 + }, + { + "epoch": 0.06252, + "grad_norm": 2.546875, + "grad_norm_var": 0.04442952473958333, + "learning_rate": 0.0001, + "loss": 4.9723, + "loss/crossentropy": 2.433539032936096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.263284370303154, + "step": 3126 + }, + { + "epoch": 0.06256, + "grad_norm": 2.421875, + "grad_norm_var": 0.0087799072265625, + "learning_rate": 0.0001, + "loss": 4.7688, + "loss/crossentropy": 2.047150671482086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2655043303966522, + "step": 3128 + }, + { + "epoch": 0.0626, + "grad_norm": 2.546875, + "grad_norm_var": 0.008219401041666666, + "learning_rate": 0.0001, + "loss": 4.9257, + "loss/crossentropy": 1.8897674679756165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2314881831407547, + "step": 3130 + }, + { + "epoch": 0.06264, + "grad_norm": 2.390625, + "grad_norm_var": 0.00797119140625, + "learning_rate": 0.0001, + "loss": 4.8053, + "loss/crossentropy": 2.1436809301376343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26931750774383545, + "step": 3132 + }, + { + "epoch": 0.06268, + "grad_norm": 2.421875, + "grad_norm_var": 0.0081695556640625, + "learning_rate": 0.0001, + "loss": 4.67, + "loss/crossentropy": 1.9773973226547241, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26880529522895813, + "step": 3134 + }, + { + "epoch": 0.06272, + "grad_norm": 2.984375, + "grad_norm_var": 0.021903483072916667, + "learning_rate": 0.0001, + "loss": 5.1236, + "loss/crossentropy": 2.208973228931427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29979653656482697, + "step": 3136 + }, + { + "epoch": 0.06276, + "grad_norm": 2.8125, + "grad_norm_var": 0.026676432291666666, + "learning_rate": 0.0001, + "loss": 5.0619, + "loss/crossentropy": 2.64120090007782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29249200224876404, + "step": 3138 + }, + { + "epoch": 0.0628, + "grad_norm": 2.703125, + "grad_norm_var": 0.0306549072265625, + "learning_rate": 0.0001, + "loss": 5.0152, + "loss/crossentropy": 2.293095588684082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2689897269010544, + "step": 3140 + }, + { + "epoch": 0.06284, + "grad_norm": 2.5625, + "grad_norm_var": 0.028758748372395834, + "learning_rate": 0.0001, + "loss": 4.71, + "loss/crossentropy": 2.455227494239807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3202812373638153, + "step": 3142 + }, + { + "epoch": 0.06288, + "grad_norm": 2.578125, + "grad_norm_var": 0.0253814697265625, + "learning_rate": 0.0001, + "loss": 4.9622, + "loss/crossentropy": 1.9667487740516663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2702512592077255, + "step": 3144 + }, + { + "epoch": 0.06292, + "grad_norm": 2.578125, + "grad_norm_var": 0.024637858072916668, + "learning_rate": 0.0001, + "loss": 4.7756, + "loss/crossentropy": 1.8181490898132324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2310035452246666, + "step": 3146 + }, + { + "epoch": 0.06296, + "grad_norm": 2.640625, + "grad_norm_var": 0.0373687744140625, + "learning_rate": 0.0001, + "loss": 4.9556, + "loss/crossentropy": 1.9985284805297852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2944463640451431, + "step": 3148 + }, + { + "epoch": 0.063, + "grad_norm": 2.28125, + "grad_norm_var": 0.04142964680989583, + "learning_rate": 0.0001, + "loss": 5.0316, + "loss/crossentropy": 2.3850419521331787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29605095088481903, + "step": 3150 + }, + { + "epoch": 0.06304, + "grad_norm": 4.125, + "grad_norm_var": 0.17021077473958332, + "learning_rate": 0.0001, + "loss": 5.2062, + "loss/crossentropy": 2.3815460205078125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30247962474823, + "step": 3152 + }, + { + "epoch": 0.06308, + "grad_norm": 3.421875, + "grad_norm_var": 0.19840087890625, + "learning_rate": 0.0001, + "loss": 4.5851, + "loss/crossentropy": 1.995104193687439, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2506560683250427, + "step": 3154 + }, + { + "epoch": 0.06312, + "grad_norm": 3.328125, + "grad_norm_var": 0.2305084228515625, + "learning_rate": 0.0001, + "loss": 4.5885, + "loss/crossentropy": 2.2037696838378906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2716705799102783, + "step": 3156 + }, + { + "epoch": 0.06316, + "grad_norm": 2.84375, + "grad_norm_var": 0.23944905598958333, + "learning_rate": 0.0001, + "loss": 4.7155, + "loss/crossentropy": 2.067265272140503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25495442003011703, + "step": 3158 + }, + { + "epoch": 0.0632, + "grad_norm": 2.78125, + "grad_norm_var": 0.23763020833333334, + "learning_rate": 0.0001, + "loss": 5.1219, + "loss/crossentropy": 2.227355480194092, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27113111317157745, + "step": 3160 + }, + { + "epoch": 0.06324, + "grad_norm": 2.546875, + "grad_norm_var": 0.23921610514322916, + "learning_rate": 0.0001, + "loss": 4.6777, + "loss/crossentropy": 1.9077441096305847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2630976662039757, + "step": 3162 + }, + { + "epoch": 0.06328, + "grad_norm": 2.59375, + "grad_norm_var": 0.23515523274739583, + "learning_rate": 0.0001, + "loss": 5.0084, + "loss/crossentropy": 2.1737552881240845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28365984559059143, + "step": 3164 + }, + { + "epoch": 0.06332, + "grad_norm": 2.5625, + "grad_norm_var": 0.22542317708333334, + "learning_rate": 0.0001, + "loss": 4.9822, + "loss/crossentropy": 1.9357402920722961, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27245980501174927, + "step": 3166 + }, + { + "epoch": 0.06336, + "grad_norm": 2.390625, + "grad_norm_var": 0.0995269775390625, + "learning_rate": 0.0001, + "loss": 4.7681, + "loss/crossentropy": 1.8484191298484802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2933400571346283, + "step": 3168 + }, + { + "epoch": 0.0634, + "grad_norm": 2.609375, + "grad_norm_var": 0.057738240559895834, + "learning_rate": 0.0001, + "loss": 5.0719, + "loss/crossentropy": 2.3085306882858276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2926720678806305, + "step": 3170 + }, + { + "epoch": 0.06344, + "grad_norm": 2.78125, + "grad_norm_var": 0.040380859375, + "learning_rate": 0.0001, + "loss": 5.1464, + "loss/crossentropy": 2.080895185470581, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2638559564948082, + "step": 3172 + }, + { + "epoch": 0.06348, + "grad_norm": 2.53125, + "grad_norm_var": 0.03287353515625, + "learning_rate": 0.0001, + "loss": 4.6857, + "loss/crossentropy": 2.035506248474121, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2692716419696808, + "step": 3174 + }, + { + "epoch": 0.06352, + "grad_norm": 2.859375, + "grad_norm_var": 0.035252888997395836, + "learning_rate": 0.0001, + "loss": 4.9479, + "loss/crossentropy": 2.1712740659713745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26579485833644867, + "step": 3176 + }, + { + "epoch": 0.06356, + "grad_norm": 2.96875, + "grad_norm_var": 0.04252827962239583, + "learning_rate": 0.0001, + "loss": 5.1654, + "loss/crossentropy": 2.2256147861480713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27605514228343964, + "step": 3178 + }, + { + "epoch": 0.0636, + "grad_norm": 2.359375, + "grad_norm_var": 0.046507771809895834, + "learning_rate": 0.0001, + "loss": 4.6489, + "loss/crossentropy": 2.0917118191719055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2560323476791382, + "step": 3180 + }, + { + "epoch": 0.06364, + "grad_norm": 2.671875, + "grad_norm_var": 0.045491536458333336, + "learning_rate": 0.0001, + "loss": 4.9833, + "loss/crossentropy": 2.3109938502311707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27040477097034454, + "step": 3182 + }, + { + "epoch": 0.06368, + "grad_norm": 2.65625, + "grad_norm_var": 0.03984273274739583, + "learning_rate": 0.0001, + "loss": 5.1441, + "loss/crossentropy": 2.3505672812461853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29180124402046204, + "step": 3184 + }, + { + "epoch": 0.06372, + "grad_norm": 2.578125, + "grad_norm_var": 0.04014383951822917, + "learning_rate": 0.0001, + "loss": 5.0224, + "loss/crossentropy": 2.2471169233322144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2676118314266205, + "step": 3186 + }, + { + "epoch": 0.06376, + "grad_norm": 2.46875, + "grad_norm_var": 0.0264312744140625, + "learning_rate": 0.0001, + "loss": 4.8247, + "loss/crossentropy": 2.3312125205993652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29243919253349304, + "step": 3188 + }, + { + "epoch": 0.0638, + "grad_norm": 2.4375, + "grad_norm_var": 0.02681884765625, + "learning_rate": 0.0001, + "loss": 4.9374, + "loss/crossentropy": 2.1709930896759033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2786664366722107, + "step": 3190 + }, + { + "epoch": 0.06384, + "grad_norm": 2.609375, + "grad_norm_var": 0.02476806640625, + "learning_rate": 0.0001, + "loss": 4.7391, + "loss/crossentropy": 1.8477665185928345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26132869720458984, + "step": 3192 + }, + { + "epoch": 0.06388, + "grad_norm": 2.6875, + "grad_norm_var": 0.01724853515625, + "learning_rate": 0.0001, + "loss": 4.8167, + "loss/crossentropy": 2.2102121114730835, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26027651876211166, + "step": 3194 + }, + { + "epoch": 0.06392, + "grad_norm": 2.640625, + "grad_norm_var": 0.0168365478515625, + "learning_rate": 0.0001, + "loss": 4.6381, + "loss/crossentropy": 2.0011088252067566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25629256665706635, + "step": 3196 + }, + { + "epoch": 0.06396, + "grad_norm": 2.625, + "grad_norm_var": 0.01685791015625, + "learning_rate": 0.0001, + "loss": 4.9215, + "loss/crossentropy": 2.319412350654602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28315384685993195, + "step": 3198 + }, + { + "epoch": 0.064, + "grad_norm": 2.328125, + "grad_norm_var": 0.0145172119140625, + "learning_rate": 0.0001, + "loss": 4.7936, + "loss/crossentropy": 2.192026138305664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.264492504298687, + "step": 3200 + }, + { + "epoch": 0.06404, + "grad_norm": 2.78125, + "grad_norm_var": 0.017024739583333334, + "learning_rate": 0.0001, + "loss": 4.8178, + "loss/crossentropy": 2.1745734214782715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29369065165519714, + "step": 3202 + }, + { + "epoch": 0.06408, + "grad_norm": 3.109375, + "grad_norm_var": 0.042313639322916666, + "learning_rate": 0.0001, + "loss": 5.577, + "loss/crossentropy": 2.5039013624191284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2835303544998169, + "step": 3204 + }, + { + "epoch": 0.06412, + "grad_norm": 2.53125, + "grad_norm_var": 0.04075419108072917, + "learning_rate": 0.0001, + "loss": 5.0308, + "loss/crossentropy": 2.4255706071853638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3014884740114212, + "step": 3206 + }, + { + "epoch": 0.06416, + "grad_norm": 2.5625, + "grad_norm_var": 0.03873291015625, + "learning_rate": 0.0001, + "loss": 5.002, + "loss/crossentropy": 2.266150116920471, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2698349952697754, + "step": 3208 + }, + { + "epoch": 0.0642, + "grad_norm": 2.4375, + "grad_norm_var": 0.03805338541666667, + "learning_rate": 0.0001, + "loss": 5.0173, + "loss/crossentropy": 2.2042946815490723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28830648958683014, + "step": 3210 + }, + { + "epoch": 0.06424, + "grad_norm": 2.5625, + "grad_norm_var": 0.03578999837239583, + "learning_rate": 0.0001, + "loss": 5.0292, + "loss/crossentropy": 2.4034690856933594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2834039032459259, + "step": 3212 + }, + { + "epoch": 0.06428, + "grad_norm": 2.4375, + "grad_norm_var": 0.0444000244140625, + "learning_rate": 0.0001, + "loss": 4.6655, + "loss/crossentropy": 2.1347755193710327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26405099034309387, + "step": 3214 + }, + { + "epoch": 0.06432, + "grad_norm": 3.34375, + "grad_norm_var": 0.07415262858072917, + "learning_rate": 0.0001, + "loss": 5.0862, + "loss/crossentropy": 2.018262207508087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27144598215818405, + "step": 3216 + }, + { + "epoch": 0.06436, + "grad_norm": 2.46875, + "grad_norm_var": 0.076953125, + "learning_rate": 0.0001, + "loss": 4.3969, + "loss/crossentropy": 2.0254003405570984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24976836144924164, + "step": 3218 + }, + { + "epoch": 0.0644, + "grad_norm": 2.78125, + "grad_norm_var": 0.06026102701822917, + "learning_rate": 0.0001, + "loss": 4.6779, + "loss/crossentropy": 2.1952659487724304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2786559462547302, + "step": 3220 + }, + { + "epoch": 0.06444, + "grad_norm": 2.609375, + "grad_norm_var": 0.06024983723958333, + "learning_rate": 0.0001, + "loss": 4.8828, + "loss/crossentropy": 2.1383036375045776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2570757120847702, + "step": 3222 + }, + { + "epoch": 0.06448, + "grad_norm": 2.65625, + "grad_norm_var": 0.05915425618489583, + "learning_rate": 0.0001, + "loss": 4.8248, + "loss/crossentropy": 2.267812967300415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2774328589439392, + "step": 3224 + }, + { + "epoch": 0.06452, + "grad_norm": 2.515625, + "grad_norm_var": 0.0587066650390625, + "learning_rate": 0.0001, + "loss": 4.9389, + "loss/crossentropy": 1.906205415725708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2528962790966034, + "step": 3226 + }, + { + "epoch": 0.06456, + "grad_norm": 2.453125, + "grad_norm_var": 0.05895182291666667, + "learning_rate": 0.0001, + "loss": 4.9823, + "loss/crossentropy": 2.1628336906433105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27087944746017456, + "step": 3228 + }, + { + "epoch": 0.0646, + "grad_norm": 2.390625, + "grad_norm_var": 0.05807291666666667, + "learning_rate": 0.0001, + "loss": 4.5893, + "loss/crossentropy": 1.8845162391662598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2241554707288742, + "step": 3230 + }, + { + "epoch": 0.06464, + "grad_norm": 2.390625, + "grad_norm_var": 0.019237263997395834, + "learning_rate": 0.0001, + "loss": 4.9143, + "loss/crossentropy": 2.4157146215438843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2770439088344574, + "step": 3232 + }, + { + "epoch": 0.06468, + "grad_norm": 2.515625, + "grad_norm_var": 0.021825154622395832, + "learning_rate": 0.0001, + "loss": 4.832, + "loss/crossentropy": 1.9979816675186157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23696578294038773, + "step": 3234 + }, + { + "epoch": 0.06472, + "grad_norm": 2.484375, + "grad_norm_var": 0.016364542643229167, + "learning_rate": 0.0001, + "loss": 5.1123, + "loss/crossentropy": 2.1790190935134888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.252600260078907, + "step": 3236 + }, + { + "epoch": 0.06476, + "grad_norm": 2.4375, + "grad_norm_var": 0.016624959309895833, + "learning_rate": 0.0001, + "loss": 4.5943, + "loss/crossentropy": 2.1612111926078796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27929094433784485, + "step": 3238 + }, + { + "epoch": 0.0648, + "grad_norm": 2.546875, + "grad_norm_var": 0.08124898274739584, + "learning_rate": 0.0001, + "loss": 4.8293, + "loss/crossentropy": 2.261025071144104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3034510314464569, + "step": 3240 + }, + { + "epoch": 0.06484, + "grad_norm": 2.46875, + "grad_norm_var": 0.081689453125, + "learning_rate": 0.0001, + "loss": 4.8243, + "loss/crossentropy": 2.02247554063797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2655429244041443, + "step": 3242 + }, + { + "epoch": 0.06488, + "grad_norm": 2.390625, + "grad_norm_var": 0.08289286295572916, + "learning_rate": 0.0001, + "loss": 4.9428, + "loss/crossentropy": 2.495269775390625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32837189733982086, + "step": 3244 + }, + { + "epoch": 0.06492, + "grad_norm": 2.53125, + "grad_norm_var": 0.07550455729166666, + "learning_rate": 0.0001, + "loss": 4.9928, + "loss/crossentropy": 2.365166425704956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27000221610069275, + "step": 3246 + }, + { + "epoch": 0.06496, + "grad_norm": 2.4375, + "grad_norm_var": 0.0746246337890625, + "learning_rate": 0.0001, + "loss": 4.837, + "loss/crossentropy": 1.8728906512260437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2711311876773834, + "step": 3248 + }, + { + "epoch": 0.065, + "grad_norm": 2.375, + "grad_norm_var": 0.07427978515625, + "learning_rate": 0.0001, + "loss": 4.7916, + "loss/crossentropy": 1.916531264781952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23036544770002365, + "step": 3250 + }, + { + "epoch": 0.06504, + "grad_norm": 2.421875, + "grad_norm_var": 0.07463277180989583, + "learning_rate": 0.0001, + "loss": 4.9399, + "loss/crossentropy": 1.996503233909607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24317056685686111, + "step": 3252 + }, + { + "epoch": 0.06508, + "grad_norm": 2.5625, + "grad_norm_var": 0.07366536458333334, + "learning_rate": 0.0001, + "loss": 4.8746, + "loss/crossentropy": 2.101921260356903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2775810658931732, + "step": 3254 + }, + { + "epoch": 0.06512, + "grad_norm": 2.40625, + "grad_norm_var": 0.005125935872395833, + "learning_rate": 0.0001, + "loss": 4.7526, + "loss/crossentropy": 1.9286046028137207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23964455723762512, + "step": 3256 + }, + { + "epoch": 0.06516, + "grad_norm": 2.40625, + "grad_norm_var": 0.005464680989583333, + "learning_rate": 0.0001, + "loss": 4.8216, + "loss/crossentropy": 2.224915862083435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27516382932662964, + "step": 3258 + }, + { + "epoch": 0.0652, + "grad_norm": 2.34375, + "grad_norm_var": 0.0133697509765625, + "learning_rate": 0.0001, + "loss": 4.6553, + "loss/crossentropy": 2.259337306022644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27185751497745514, + "step": 3260 + }, + { + "epoch": 0.06524, + "grad_norm": 2.484375, + "grad_norm_var": 0.013117472330729166, + "learning_rate": 0.0001, + "loss": 4.7095, + "loss/crossentropy": 2.1081044673919678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23740330338478088, + "step": 3262 + }, + { + "epoch": 0.06528, + "grad_norm": 2.546875, + "grad_norm_var": 0.0131744384765625, + "learning_rate": 0.0001, + "loss": 4.6375, + "loss/crossentropy": 1.9032058119773865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2508246824145317, + "step": 3264 + }, + { + "epoch": 0.06532, + "grad_norm": 2.671875, + "grad_norm_var": 0.013353474934895833, + "learning_rate": 0.0001, + "loss": 4.7165, + "loss/crossentropy": 2.0773792266845703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2797544598579407, + "step": 3266 + }, + { + "epoch": 0.06536, + "grad_norm": 2.40625, + "grad_norm_var": 0.013509114583333334, + "learning_rate": 0.0001, + "loss": 4.9136, + "loss/crossentropy": 2.1591526865959167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2540442571043968, + "step": 3268 + }, + { + "epoch": 0.0654, + "grad_norm": 2.546875, + "grad_norm_var": 0.014351399739583333, + "learning_rate": 0.0001, + "loss": 4.6248, + "loss/crossentropy": 1.7735809683799744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24107928574085236, + "step": 3270 + }, + { + "epoch": 0.06544, + "grad_norm": 2.5, + "grad_norm_var": 0.014647420247395833, + "learning_rate": 0.0001, + "loss": 4.7568, + "loss/crossentropy": 2.069553792476654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26356737315654755, + "step": 3272 + }, + { + "epoch": 0.06548, + "grad_norm": 2.453125, + "grad_norm_var": 0.014623006184895834, + "learning_rate": 0.0001, + "loss": 4.7274, + "loss/crossentropy": 1.9688642024993896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29453104734420776, + "step": 3274 + }, + { + "epoch": 0.06552, + "grad_norm": 2.296875, + "grad_norm_var": 0.012043253580729166, + "learning_rate": 0.0001, + "loss": 4.4264, + "loss/crossentropy": 1.785252034664154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23213861882686615, + "step": 3276 + }, + { + "epoch": 0.06556, + "grad_norm": 2.65625, + "grad_norm_var": 0.013255818684895834, + "learning_rate": 0.0001, + "loss": 4.779, + "loss/crossentropy": 2.155774712562561, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25758183747529984, + "step": 3278 + }, + { + "epoch": 0.0656, + "grad_norm": 2.5625, + "grad_norm_var": 0.016161092122395835, + "learning_rate": 0.0001, + "loss": 5.0453, + "loss/crossentropy": 2.0403348803520203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25007252395153046, + "step": 3280 + }, + { + "epoch": 0.06564, + "grad_norm": 2.4375, + "grad_norm_var": 0.015208943684895834, + "learning_rate": 0.0001, + "loss": 4.7161, + "loss/crossentropy": 1.9963608384132385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2608266994357109, + "step": 3282 + }, + { + "epoch": 0.06568, + "grad_norm": 2.75, + "grad_norm_var": 0.019364420572916666, + "learning_rate": 0.0001, + "loss": 4.9275, + "loss/crossentropy": 2.2021098732948303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28361018002033234, + "step": 3284 + }, + { + "epoch": 0.06572, + "grad_norm": 2.59375, + "grad_norm_var": 0.01812744140625, + "learning_rate": 0.0001, + "loss": 4.6533, + "loss/crossentropy": 2.0363903641700745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2641526162624359, + "step": 3286 + }, + { + "epoch": 0.06576, + "grad_norm": 2.671875, + "grad_norm_var": 0.020466105143229166, + "learning_rate": 0.0001, + "loss": 4.6884, + "loss/crossentropy": 2.1052531003952026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24344927817583084, + "step": 3288 + }, + { + "epoch": 0.0658, + "grad_norm": 2.578125, + "grad_norm_var": 0.02027587890625, + "learning_rate": 0.0001, + "loss": 4.9246, + "loss/crossentropy": 2.0664029717445374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25679293274879456, + "step": 3290 + }, + { + "epoch": 0.06584, + "grad_norm": 2.625, + "grad_norm_var": 0.013678995768229167, + "learning_rate": 0.0001, + "loss": 4.7802, + "loss/crossentropy": 2.168351709842682, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2759328931570053, + "step": 3292 + }, + { + "epoch": 0.06588, + "grad_norm": 2.515625, + "grad_norm_var": 0.013703409830729167, + "learning_rate": 0.0001, + "loss": 4.8481, + "loss/crossentropy": 1.9305949211120605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26165173947811127, + "step": 3294 + }, + { + "epoch": 0.06592, + "grad_norm": 2.4375, + "grad_norm_var": 0.017476399739583332, + "learning_rate": 0.0001, + "loss": 4.6942, + "loss/crossentropy": 1.9257155060768127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24499841034412384, + "step": 3296 + }, + { + "epoch": 0.06596, + "grad_norm": 2.484375, + "grad_norm_var": 0.017513020833333334, + "learning_rate": 0.0001, + "loss": 4.8941, + "loss/crossentropy": 2.1406426429748535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26676414906978607, + "step": 3298 + }, + { + "epoch": 0.066, + "grad_norm": 2.734375, + "grad_norm_var": 0.016422526041666666, + "learning_rate": 0.0001, + "loss": 4.8293, + "loss/crossentropy": 2.099781036376953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2916935384273529, + "step": 3300 + }, + { + "epoch": 0.06604, + "grad_norm": 2.875, + "grad_norm_var": 0.023094685872395833, + "learning_rate": 0.0001, + "loss": 4.9504, + "loss/crossentropy": 2.1077913641929626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2667912393808365, + "step": 3302 + }, + { + "epoch": 0.06608, + "grad_norm": 3.90625, + "grad_norm_var": 0.1297271728515625, + "learning_rate": 0.0001, + "loss": 5.2952, + "loss/crossentropy": 2.2583223581314087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2907385528087616, + "step": 3304 + }, + { + "epoch": 0.06612, + "grad_norm": 2.71875, + "grad_norm_var": 0.13405659993489583, + "learning_rate": 0.0001, + "loss": 4.5813, + "loss/crossentropy": 1.847477912902832, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2272188812494278, + "step": 3306 + }, + { + "epoch": 0.06616, + "grad_norm": 2.40625, + "grad_norm_var": 0.1392486572265625, + "learning_rate": 0.0001, + "loss": 4.6502, + "loss/crossentropy": 1.7610225677490234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23973071575164795, + "step": 3308 + }, + { + "epoch": 0.0662, + "grad_norm": 2.328125, + "grad_norm_var": 0.1464019775390625, + "learning_rate": 0.0001, + "loss": 4.7123, + "loss/crossentropy": 1.977916419506073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25456882268190384, + "step": 3310 + }, + { + "epoch": 0.06624, + "grad_norm": 2.734375, + "grad_norm_var": 0.14095052083333334, + "learning_rate": 0.0001, + "loss": 4.7353, + "loss/crossentropy": 2.2196428775787354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2563377171754837, + "step": 3312 + }, + { + "epoch": 0.06628, + "grad_norm": 2.53125, + "grad_norm_var": 0.14537760416666667, + "learning_rate": 0.0001, + "loss": 4.6907, + "loss/crossentropy": 2.0861470699310303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27121224999427795, + "step": 3314 + }, + { + "epoch": 0.06632, + "grad_norm": 2.53125, + "grad_norm_var": 0.14287821451822916, + "learning_rate": 0.0001, + "loss": 4.6484, + "loss/crossentropy": 1.9716283679008484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.277963787317276, + "step": 3316 + }, + { + "epoch": 0.06636, + "grad_norm": 2.609375, + "grad_norm_var": 0.1375, + "learning_rate": 0.0001, + "loss": 4.7985, + "loss/crossentropy": 2.5604729652404785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31031742691993713, + "step": 3318 + }, + { + "epoch": 0.0664, + "grad_norm": 2.71875, + "grad_norm_var": 0.022337849934895834, + "learning_rate": 0.0001, + "loss": 5.4329, + "loss/crossentropy": 2.2672252655029297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29225634038448334, + "step": 3320 + }, + { + "epoch": 0.06644, + "grad_norm": 2.671875, + "grad_norm_var": 0.020015462239583334, + "learning_rate": 0.0001, + "loss": 5.0557, + "loss/crossentropy": 2.0992931723594666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2648598402738571, + "step": 3322 + }, + { + "epoch": 0.06648, + "grad_norm": 2.65625, + "grad_norm_var": 0.014827473958333334, + "learning_rate": 0.0001, + "loss": 5.0895, + "loss/crossentropy": 2.208917260169983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2998420298099518, + "step": 3324 + }, + { + "epoch": 0.06652, + "grad_norm": 2.671875, + "grad_norm_var": 0.012791951497395834, + "learning_rate": 0.0001, + "loss": 4.8834, + "loss/crossentropy": 2.290796995162964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2638905793428421, + "step": 3326 + }, + { + "epoch": 0.06656, + "grad_norm": 2.375, + "grad_norm_var": 0.012214152018229167, + "learning_rate": 0.0001, + "loss": 4.8521, + "loss/crossentropy": 2.4156445264816284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28743064403533936, + "step": 3328 + }, + { + "epoch": 0.0666, + "grad_norm": 2.734375, + "grad_norm_var": 0.012132771809895833, + "learning_rate": 0.0001, + "loss": 5.2205, + "loss/crossentropy": 2.4604904651641846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27585768699645996, + "step": 3330 + }, + { + "epoch": 0.06664, + "grad_norm": 2.421875, + "grad_norm_var": 0.013818359375, + "learning_rate": 0.0001, + "loss": 4.8296, + "loss/crossentropy": 1.8613844513893127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27038969844579697, + "step": 3332 + }, + { + "epoch": 0.06668, + "grad_norm": 2.53125, + "grad_norm_var": 0.014046223958333333, + "learning_rate": 0.0001, + "loss": 4.8777, + "loss/crossentropy": 2.232776403427124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2686954140663147, + "step": 3334 + }, + { + "epoch": 0.06672, + "grad_norm": 2.5625, + "grad_norm_var": 0.0120758056640625, + "learning_rate": 0.0001, + "loss": 4.9536, + "loss/crossentropy": 2.070033550262451, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26176171004772186, + "step": 3336 + }, + { + "epoch": 0.06676, + "grad_norm": 2.4375, + "grad_norm_var": 0.01187744140625, + "learning_rate": 0.0001, + "loss": 4.8616, + "loss/crossentropy": 2.38937509059906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2539200633764267, + "step": 3338 + }, + { + "epoch": 0.0668, + "grad_norm": 2.53125, + "grad_norm_var": 0.010933430989583333, + "learning_rate": 0.0001, + "loss": 5.1104, + "loss/crossentropy": 2.272845983505249, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2856733053922653, + "step": 3340 + }, + { + "epoch": 0.06684, + "grad_norm": 2.5625, + "grad_norm_var": 0.01181640625, + "learning_rate": 0.0001, + "loss": 4.7935, + "loss/crossentropy": 2.427489161491394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30572691559791565, + "step": 3342 + }, + { + "epoch": 0.06688, + "grad_norm": 2.59375, + "grad_norm_var": 0.009407552083333333, + "learning_rate": 0.0001, + "loss": 4.9801, + "loss/crossentropy": 2.4701327085494995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28469331562519073, + "step": 3344 + }, + { + "epoch": 0.06692, + "grad_norm": 2.421875, + "grad_norm_var": 0.006917317708333333, + "learning_rate": 0.0001, + "loss": 4.815, + "loss/crossentropy": 2.0733558535575867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.292859822511673, + "step": 3346 + }, + { + "epoch": 0.06696, + "grad_norm": 2.640625, + "grad_norm_var": 0.0061187744140625, + "learning_rate": 0.0001, + "loss": 4.8379, + "loss/crossentropy": 2.301755905151367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3174070864915848, + "step": 3348 + }, + { + "epoch": 0.067, + "grad_norm": 2.234375, + "grad_norm_var": 0.012580362955729167, + "learning_rate": 0.0001, + "loss": 4.3359, + "loss/crossentropy": 2.039419114589691, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24543144553899765, + "step": 3350 + }, + { + "epoch": 0.06704, + "grad_norm": 2.4375, + "grad_norm_var": 0.0144683837890625, + "learning_rate": 0.0001, + "loss": 4.7938, + "loss/crossentropy": 1.9247611165046692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2595005929470062, + "step": 3352 + }, + { + "epoch": 0.06708, + "grad_norm": 2.34375, + "grad_norm_var": 0.01630859375, + "learning_rate": 0.0001, + "loss": 4.8294, + "loss/crossentropy": 2.224974751472473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26880575716495514, + "step": 3354 + }, + { + "epoch": 0.06712, + "grad_norm": 2.734375, + "grad_norm_var": 0.0195220947265625, + "learning_rate": 0.0001, + "loss": 4.8922, + "loss/crossentropy": 2.2601993083953857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.279633030295372, + "step": 3356 + }, + { + "epoch": 0.06716, + "grad_norm": 2.4375, + "grad_norm_var": 0.017085774739583334, + "learning_rate": 0.0001, + "loss": 4.5911, + "loss/crossentropy": 1.8156417608261108, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22118539363145828, + "step": 3358 + }, + { + "epoch": 0.0672, + "grad_norm": 2.734375, + "grad_norm_var": 0.020140584309895834, + "learning_rate": 0.0001, + "loss": 5.2032, + "loss/crossentropy": 2.207859516143799, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2771998345851898, + "step": 3360 + }, + { + "epoch": 0.06724, + "grad_norm": 2.453125, + "grad_norm_var": 0.02017822265625, + "learning_rate": 0.0001, + "loss": 4.9352, + "loss/crossentropy": 1.9886083602905273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2545766308903694, + "step": 3362 + }, + { + "epoch": 0.06728, + "grad_norm": 2.234375, + "grad_norm_var": 0.025777180989583332, + "learning_rate": 0.0001, + "loss": 4.4323, + "loss/crossentropy": 1.7046592235565186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2207612618803978, + "step": 3364 + }, + { + "epoch": 0.06732, + "grad_norm": 2.453125, + "grad_norm_var": 0.021122233072916666, + "learning_rate": 0.0001, + "loss": 5.0209, + "loss/crossentropy": 2.0283663868904114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2743644416332245, + "step": 3366 + }, + { + "epoch": 0.06736, + "grad_norm": 2.453125, + "grad_norm_var": 0.019498697916666665, + "learning_rate": 0.0001, + "loss": 4.4788, + "loss/crossentropy": 1.975772500038147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2505730241537094, + "step": 3368 + }, + { + "epoch": 0.0674, + "grad_norm": 2.5, + "grad_norm_var": 0.019950358072916667, + "learning_rate": 0.0001, + "loss": 4.6863, + "loss/crossentropy": 1.9021872282028198, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25430944561958313, + "step": 3370 + }, + { + "epoch": 0.06744, + "grad_norm": 2.671875, + "grad_norm_var": 0.0182037353515625, + "learning_rate": 0.0001, + "loss": 5.1859, + "loss/crossentropy": 2.3888463973999023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27896443009376526, + "step": 3372 + }, + { + "epoch": 0.06748, + "grad_norm": 2.71875, + "grad_norm_var": 0.02086181640625, + "learning_rate": 0.0001, + "loss": 4.6041, + "loss/crossentropy": 1.7844690680503845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2536455988883972, + "step": 3374 + }, + { + "epoch": 0.06752, + "grad_norm": 2.703125, + "grad_norm_var": 0.021442667643229166, + "learning_rate": 0.0001, + "loss": 4.4538, + "loss/crossentropy": 1.919598639011383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26129309833049774, + "step": 3376 + }, + { + "epoch": 0.06756, + "grad_norm": 2.671875, + "grad_norm_var": 0.022835286458333333, + "learning_rate": 0.0001, + "loss": 5.0568, + "loss/crossentropy": 2.2292110919952393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25612247735261917, + "step": 3378 + }, + { + "epoch": 0.0676, + "grad_norm": 3.4375, + "grad_norm_var": 0.0779296875, + "learning_rate": 0.0001, + "loss": 4.8674, + "loss/crossentropy": 2.2235841751098633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2607487738132477, + "step": 3380 + }, + { + "epoch": 0.06764, + "grad_norm": 2.34375, + "grad_norm_var": 0.08089192708333333, + "learning_rate": 0.0001, + "loss": 4.5872, + "loss/crossentropy": 2.1375235319137573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2344205006957054, + "step": 3382 + }, + { + "epoch": 0.06768, + "grad_norm": 2.28125, + "grad_norm_var": 0.08501688639322917, + "learning_rate": 0.0001, + "loss": 4.6076, + "loss/crossentropy": 2.020237445831299, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2550392523407936, + "step": 3384 + }, + { + "epoch": 0.06772, + "grad_norm": 2.734375, + "grad_norm_var": 0.07911783854166667, + "learning_rate": 0.0001, + "loss": 4.6939, + "loss/crossentropy": 2.138959765434265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2742607295513153, + "step": 3386 + }, + { + "epoch": 0.06776, + "grad_norm": 2.578125, + "grad_norm_var": 0.07864481608072917, + "learning_rate": 0.0001, + "loss": 4.9065, + "loss/crossentropy": 2.521559953689575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2892928719520569, + "step": 3388 + }, + { + "epoch": 0.0678, + "grad_norm": 2.375, + "grad_norm_var": 0.07815348307291667, + "learning_rate": 0.0001, + "loss": 5.049, + "loss/crossentropy": 2.169550120830536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27750439941883087, + "step": 3390 + }, + { + "epoch": 0.06784, + "grad_norm": 2.6875, + "grad_norm_var": 0.07464090983072917, + "learning_rate": 0.0001, + "loss": 4.7401, + "loss/crossentropy": 1.975584864616394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2557516545057297, + "step": 3392 + }, + { + "epoch": 0.06788, + "grad_norm": 2.6875, + "grad_norm_var": 0.07617899576822916, + "learning_rate": 0.0001, + "loss": 4.9207, + "loss/crossentropy": 2.5837322473526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3444886952638626, + "step": 3394 + }, + { + "epoch": 0.06792, + "grad_norm": 2.546875, + "grad_norm_var": 0.022294108072916666, + "learning_rate": 0.0001, + "loss": 4.743, + "loss/crossentropy": 1.963110864162445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24120041728019714, + "step": 3396 + }, + { + "epoch": 0.06796, + "grad_norm": 2.65625, + "grad_norm_var": 0.0211090087890625, + "learning_rate": 0.0001, + "loss": 4.5759, + "loss/crossentropy": 2.381603956222534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26319295167922974, + "step": 3398 + }, + { + "epoch": 0.068, + "grad_norm": 2.25, + "grad_norm_var": 0.023078409830729167, + "learning_rate": 0.0001, + "loss": 4.5642, + "loss/crossentropy": 2.054026961326599, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24540280550718307, + "step": 3400 + }, + { + "epoch": 0.06804, + "grad_norm": 2.46875, + "grad_norm_var": 0.0187652587890625, + "learning_rate": 0.0001, + "loss": 4.3661, + "loss/crossentropy": 2.047453820705414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2524164840579033, + "step": 3402 + }, + { + "epoch": 0.06808, + "grad_norm": 2.390625, + "grad_norm_var": 0.018317667643229167, + "learning_rate": 0.0001, + "loss": 4.7839, + "loss/crossentropy": 1.8616933226585388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23905867338180542, + "step": 3404 + }, + { + "epoch": 0.06812, + "grad_norm": 2.4375, + "grad_norm_var": 0.016422526041666666, + "learning_rate": 0.0001, + "loss": 4.6932, + "loss/crossentropy": 2.4304568767547607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2802550047636032, + "step": 3406 + }, + { + "epoch": 0.06816, + "grad_norm": 2.5, + "grad_norm_var": 0.01318359375, + "learning_rate": 0.0001, + "loss": 4.7005, + "loss/crossentropy": 1.8178748488426208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24266308546066284, + "step": 3408 + }, + { + "epoch": 0.0682, + "grad_norm": 2.625, + "grad_norm_var": 0.0116363525390625, + "learning_rate": 0.0001, + "loss": 5.0152, + "loss/crossentropy": 2.025859773159027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.282450333237648, + "step": 3410 + }, + { + "epoch": 0.06824, + "grad_norm": 2.53125, + "grad_norm_var": 0.010383097330729167, + "learning_rate": 0.0001, + "loss": 4.737, + "loss/crossentropy": 2.032994568347931, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26047058403491974, + "step": 3412 + }, + { + "epoch": 0.06828, + "grad_norm": 2.46875, + "grad_norm_var": 0.007840983072916667, + "learning_rate": 0.0001, + "loss": 4.6428, + "loss/crossentropy": 2.2468607425689697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2814445495605469, + "step": 3414 + }, + { + "epoch": 0.06832, + "grad_norm": 2.75, + "grad_norm_var": 0.03629150390625, + "learning_rate": 0.0001, + "loss": 4.6854, + "loss/crossentropy": 2.2534161806106567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3231179416179657, + "step": 3416 + }, + { + "epoch": 0.06836, + "grad_norm": 2.65625, + "grad_norm_var": 0.03443603515625, + "learning_rate": 0.0001, + "loss": 4.9599, + "loss/crossentropy": 2.1678181886672974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2840191572904587, + "step": 3418 + }, + { + "epoch": 0.0684, + "grad_norm": 2.328125, + "grad_norm_var": 0.03453776041666667, + "learning_rate": 0.0001, + "loss": 4.8667, + "loss/crossentropy": 2.053459882736206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.271483838558197, + "step": 3420 + }, + { + "epoch": 0.06844, + "grad_norm": 2.53125, + "grad_norm_var": 0.03243815104166667, + "learning_rate": 0.0001, + "loss": 4.714, + "loss/crossentropy": 2.2278919219970703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2781473994255066, + "step": 3422 + }, + { + "epoch": 0.06848, + "grad_norm": 2.296875, + "grad_norm_var": 0.0384674072265625, + "learning_rate": 0.0001, + "loss": 4.5262, + "loss/crossentropy": 1.9582479000091553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24171434342861176, + "step": 3424 + }, + { + "epoch": 0.06852, + "grad_norm": 2.359375, + "grad_norm_var": 0.04121805826822917, + "learning_rate": 0.0001, + "loss": 4.4734, + "loss/crossentropy": 1.808964192867279, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22458729147911072, + "step": 3426 + }, + { + "epoch": 0.06856, + "grad_norm": 2.40625, + "grad_norm_var": 0.04296468098958333, + "learning_rate": 0.0001, + "loss": 4.464, + "loss/crossentropy": 1.9225260019302368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2508438527584076, + "step": 3428 + }, + { + "epoch": 0.0686, + "grad_norm": 2.578125, + "grad_norm_var": 0.044465128580729166, + "learning_rate": 0.0001, + "loss": 4.9647, + "loss/crossentropy": 2.2446881532669067, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26919613778591156, + "step": 3430 + }, + { + "epoch": 0.06864, + "grad_norm": 2.515625, + "grad_norm_var": 0.017899576822916666, + "learning_rate": 0.0001, + "loss": 4.6865, + "loss/crossentropy": 2.2047033309936523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2581355720758438, + "step": 3432 + }, + { + "epoch": 0.06868, + "grad_norm": 2.46875, + "grad_norm_var": 0.015729777018229165, + "learning_rate": 0.0001, + "loss": 4.7139, + "loss/crossentropy": 2.1223543882369995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25814756751060486, + "step": 3434 + }, + { + "epoch": 0.06872, + "grad_norm": 2.59375, + "grad_norm_var": 0.016813151041666665, + "learning_rate": 0.0001, + "loss": 4.6771, + "loss/crossentropy": 2.1641053557395935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25222497433423996, + "step": 3436 + }, + { + "epoch": 0.06876, + "grad_norm": 2.640625, + "grad_norm_var": 0.0189605712890625, + "learning_rate": 0.0001, + "loss": 4.6972, + "loss/crossentropy": 1.9569795727729797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2681007981300354, + "step": 3438 + }, + { + "epoch": 0.0688, + "grad_norm": 2.453125, + "grad_norm_var": 0.013216145833333333, + "learning_rate": 0.0001, + "loss": 4.8214, + "loss/crossentropy": 2.212220251560211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28191742300987244, + "step": 3440 + }, + { + "epoch": 0.06884, + "grad_norm": 2.34375, + "grad_norm_var": 0.011546834309895834, + "learning_rate": 0.0001, + "loss": 4.4162, + "loss/crossentropy": 1.9564262628555298, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2560906559228897, + "step": 3442 + }, + { + "epoch": 0.06888, + "grad_norm": 2.46875, + "grad_norm_var": 0.012043253580729166, + "learning_rate": 0.0001, + "loss": 4.6837, + "loss/crossentropy": 1.8553346395492554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24626825004816055, + "step": 3444 + }, + { + "epoch": 0.06892, + "grad_norm": 2.3125, + "grad_norm_var": 0.012007649739583333, + "learning_rate": 0.0001, + "loss": 4.7914, + "loss/crossentropy": 1.9803723692893982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2712126225233078, + "step": 3446 + }, + { + "epoch": 0.06896, + "grad_norm": 2.46875, + "grad_norm_var": 0.010423787434895833, + "learning_rate": 0.0001, + "loss": 4.857, + "loss/crossentropy": 1.9914751648902893, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2565220594406128, + "step": 3448 + }, + { + "epoch": 0.069, + "grad_norm": 2.484375, + "grad_norm_var": 0.010758463541666667, + "learning_rate": 0.0001, + "loss": 4.8013, + "loss/crossentropy": 2.2114094495773315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28365010023117065, + "step": 3450 + }, + { + "epoch": 0.06904, + "grad_norm": 2.453125, + "grad_norm_var": 0.009577433268229166, + "learning_rate": 0.0001, + "loss": 5.0322, + "loss/crossentropy": 2.4366514682769775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30788426101207733, + "step": 3452 + }, + { + "epoch": 0.06908, + "grad_norm": 2.671875, + "grad_norm_var": 0.010595703125, + "learning_rate": 0.0001, + "loss": 4.7902, + "loss/crossentropy": 2.304569959640503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2835633158683777, + "step": 3454 + }, + { + "epoch": 0.06912, + "grad_norm": 2.65625, + "grad_norm_var": 0.027318318684895832, + "learning_rate": 0.0001, + "loss": 5.151, + "loss/crossentropy": 2.2518080472946167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.278719499707222, + "step": 3456 + }, + { + "epoch": 0.06916, + "grad_norm": 2.71875, + "grad_norm_var": 0.027904256184895834, + "learning_rate": 0.0001, + "loss": 5.1688, + "loss/crossentropy": 2.333768129348755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28694969415664673, + "step": 3458 + }, + { + "epoch": 0.0692, + "grad_norm": 2.46875, + "grad_norm_var": 0.0260650634765625, + "learning_rate": 0.0001, + "loss": 5.1421, + "loss/crossentropy": 2.3534432649612427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3005864769220352, + "step": 3460 + }, + { + "epoch": 0.06924, + "grad_norm": 2.4375, + "grad_norm_var": 0.022981770833333335, + "learning_rate": 0.0001, + "loss": 4.9502, + "loss/crossentropy": 2.0703017711639404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25969168543815613, + "step": 3462 + }, + { + "epoch": 0.06928, + "grad_norm": 2.65625, + "grad_norm_var": 0.023502604166666666, + "learning_rate": 0.0001, + "loss": 5.0258, + "loss/crossentropy": 2.167420506477356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32322582602500916, + "step": 3464 + }, + { + "epoch": 0.06932, + "grad_norm": 2.515625, + "grad_norm_var": 0.021222941080729165, + "learning_rate": 0.0001, + "loss": 4.5342, + "loss/crossentropy": 2.0845181941986084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26804475486278534, + "step": 3466 + }, + { + "epoch": 0.06936, + "grad_norm": 2.4375, + "grad_norm_var": 0.021219889322916668, + "learning_rate": 0.0001, + "loss": 5.0459, + "loss/crossentropy": 2.4165114164352417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2866530567407608, + "step": 3468 + }, + { + "epoch": 0.0694, + "grad_norm": 2.765625, + "grad_norm_var": 0.022359212239583332, + "learning_rate": 0.0001, + "loss": 5.0938, + "loss/crossentropy": 2.4152863025665283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30185502767562866, + "step": 3470 + }, + { + "epoch": 0.06944, + "grad_norm": 2.28125, + "grad_norm_var": 0.014924112955729167, + "learning_rate": 0.0001, + "loss": 4.6362, + "loss/crossentropy": 2.1878501176834106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2736133933067322, + "step": 3472 + }, + { + "epoch": 0.06948, + "grad_norm": 2.4375, + "grad_norm_var": 0.015262858072916666, + "learning_rate": 0.0001, + "loss": 4.812, + "loss/crossentropy": 2.055173695087433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2609568238258362, + "step": 3474 + }, + { + "epoch": 0.06952, + "grad_norm": 2.453125, + "grad_norm_var": 0.015550740559895833, + "learning_rate": 0.0001, + "loss": 4.8201, + "loss/crossentropy": 2.050000250339508, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2713121324777603, + "step": 3476 + }, + { + "epoch": 0.06956, + "grad_norm": 2.65625, + "grad_norm_var": 0.019169108072916666, + "learning_rate": 0.0001, + "loss": 4.9916, + "loss/crossentropy": 2.227464199066162, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2710970640182495, + "step": 3478 + }, + { + "epoch": 0.0696, + "grad_norm": 2.46875, + "grad_norm_var": 0.016828409830729165, + "learning_rate": 0.0001, + "loss": 4.7435, + "loss/crossentropy": 2.096015691757202, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29556676745414734, + "step": 3480 + }, + { + "epoch": 0.06964, + "grad_norm": 2.921875, + "grad_norm_var": 0.028804524739583334, + "learning_rate": 0.0001, + "loss": 4.6738, + "loss/crossentropy": 1.9252901673316956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2976933419704437, + "step": 3482 + }, + { + "epoch": 0.06968, + "grad_norm": 2.25, + "grad_norm_var": 0.03385009765625, + "learning_rate": 0.0001, + "loss": 4.7679, + "loss/crossentropy": 2.258090019226074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27399829030036926, + "step": 3484 + }, + { + "epoch": 0.06972, + "grad_norm": 2.265625, + "grad_norm_var": 0.03192952473958333, + "learning_rate": 0.0001, + "loss": 4.7614, + "loss/crossentropy": 2.2776867151260376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29224833846092224, + "step": 3486 + }, + { + "epoch": 0.06976, + "grad_norm": 2.703125, + "grad_norm_var": 0.031819661458333336, + "learning_rate": 0.0001, + "loss": 5.1628, + "loss/crossentropy": 2.097061276435852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26103661954402924, + "step": 3488 + }, + { + "epoch": 0.0698, + "grad_norm": 2.28125, + "grad_norm_var": 0.03264567057291667, + "learning_rate": 0.0001, + "loss": 4.7364, + "loss/crossentropy": 2.206419885158539, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2775641232728958, + "step": 3490 + }, + { + "epoch": 0.06984, + "grad_norm": 2.59375, + "grad_norm_var": 0.035563151041666664, + "learning_rate": 0.0001, + "loss": 4.7216, + "loss/crossentropy": 1.962704062461853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2421455979347229, + "step": 3492 + }, + { + "epoch": 0.06988, + "grad_norm": 2.4375, + "grad_norm_var": 0.032698567708333334, + "learning_rate": 0.0001, + "loss": 4.7358, + "loss/crossentropy": 2.223302483558655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2709382176399231, + "step": 3494 + }, + { + "epoch": 0.06992, + "grad_norm": 2.390625, + "grad_norm_var": 0.034764607747395836, + "learning_rate": 0.0001, + "loss": 4.8563, + "loss/crossentropy": 2.259950876235962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29514479637145996, + "step": 3496 + }, + { + "epoch": 0.06996, + "grad_norm": 2.359375, + "grad_norm_var": 0.021141560872395833, + "learning_rate": 0.0001, + "loss": 4.6147, + "loss/crossentropy": 2.2337416410446167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2585422098636627, + "step": 3498 + }, + { + "epoch": 0.07, + "grad_norm": 2.75, + "grad_norm_var": 0.022663370768229166, + "learning_rate": 0.0001, + "loss": 4.9278, + "loss/crossentropy": 2.131904423236847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32493171095848083, + "step": 3500 + }, + { + "epoch": 0.07004, + "grad_norm": 3.109375, + "grad_norm_var": 0.04553629557291667, + "learning_rate": 0.0001, + "loss": 4.9285, + "loss/crossentropy": 2.461912155151367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2917899489402771, + "step": 3502 + }, + { + "epoch": 0.07008, + "grad_norm": 2.40625, + "grad_norm_var": 0.04366861979166667, + "learning_rate": 0.0001, + "loss": 4.6466, + "loss/crossentropy": 2.05659943819046, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24786780774593353, + "step": 3504 + }, + { + "epoch": 0.07012, + "grad_norm": 2.625, + "grad_norm_var": 0.0412261962890625, + "learning_rate": 0.0001, + "loss": 4.7282, + "loss/crossentropy": 2.3972705602645874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29012058675289154, + "step": 3506 + }, + { + "epoch": 0.07016, + "grad_norm": 2.390625, + "grad_norm_var": 0.03945210774739583, + "learning_rate": 0.0001, + "loss": 4.6561, + "loss/crossentropy": 1.8465647101402283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26938022673130035, + "step": 3508 + }, + { + "epoch": 0.0702, + "grad_norm": 2.65625, + "grad_norm_var": 0.0434722900390625, + "learning_rate": 0.0001, + "loss": 4.9007, + "loss/crossentropy": 2.2447493076324463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27260421216487885, + "step": 3510 + }, + { + "epoch": 0.07024, + "grad_norm": 2.453125, + "grad_norm_var": 0.042464192708333334, + "learning_rate": 0.0001, + "loss": 4.7774, + "loss/crossentropy": 2.4258209466934204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.285652831196785, + "step": 3512 + }, + { + "epoch": 0.07028, + "grad_norm": 2.328125, + "grad_norm_var": 0.04248046875, + "learning_rate": 0.0001, + "loss": 4.7353, + "loss/crossentropy": 2.1033068895339966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2628294378519058, + "step": 3514 + }, + { + "epoch": 0.07032, + "grad_norm": 2.5, + "grad_norm_var": 0.03819071451822917, + "learning_rate": 0.0001, + "loss": 4.8036, + "loss/crossentropy": 2.2740964889526367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27228833734989166, + "step": 3516 + }, + { + "epoch": 0.07036, + "grad_norm": 2.265625, + "grad_norm_var": 0.015104166666666667, + "learning_rate": 0.0001, + "loss": 4.7699, + "loss/crossentropy": 2.2315655946731567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26613348722457886, + "step": 3518 + }, + { + "epoch": 0.0704, + "grad_norm": 2.359375, + "grad_norm_var": 0.014720662434895834, + "learning_rate": 0.0001, + "loss": 4.6038, + "loss/crossentropy": 1.9001839756965637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.248238705098629, + "step": 3520 + }, + { + "epoch": 0.07044, + "grad_norm": 2.59375, + "grad_norm_var": 0.013655598958333333, + "learning_rate": 0.0001, + "loss": 4.7034, + "loss/crossentropy": 2.0940937399864197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28850243985652924, + "step": 3522 + }, + { + "epoch": 0.07048, + "grad_norm": 2.421875, + "grad_norm_var": 0.014338175455729166, + "learning_rate": 0.0001, + "loss": 4.9187, + "loss/crossentropy": 1.9088054299354553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2577759325504303, + "step": 3524 + }, + { + "epoch": 0.07052, + "grad_norm": 2.359375, + "grad_norm_var": 0.01031494140625, + "learning_rate": 0.0001, + "loss": 4.7852, + "loss/crossentropy": 2.1965672969818115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2696071192622185, + "step": 3526 + }, + { + "epoch": 0.07056, + "grad_norm": 2.59375, + "grad_norm_var": 0.010770670572916667, + "learning_rate": 0.0001, + "loss": 4.7657, + "loss/crossentropy": 2.245758891105652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2646239101886749, + "step": 3528 + }, + { + "epoch": 0.0706, + "grad_norm": 2.546875, + "grad_norm_var": 0.013895670572916666, + "learning_rate": 0.0001, + "loss": 4.7794, + "loss/crossentropy": 2.180204927921295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2582162171602249, + "step": 3530 + }, + { + "epoch": 0.07064, + "grad_norm": 2.71875, + "grad_norm_var": 0.019562784830729166, + "learning_rate": 0.0001, + "loss": 5.04, + "loss/crossentropy": 2.193474531173706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40557755529880524, + "step": 3532 + }, + { + "epoch": 0.07068, + "grad_norm": 2.78125, + "grad_norm_var": 0.021361287434895834, + "learning_rate": 0.0001, + "loss": 4.9562, + "loss/crossentropy": 2.2667607069015503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28741554915905, + "step": 3534 + }, + { + "epoch": 0.07072, + "grad_norm": 2.3125, + "grad_norm_var": 0.022093709309895834, + "learning_rate": 0.0001, + "loss": 4.8215, + "loss/crossentropy": 1.9890388250350952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2612537443637848, + "step": 3536 + }, + { + "epoch": 0.07076, + "grad_norm": 2.390625, + "grad_norm_var": 0.02392578125, + "learning_rate": 0.0001, + "loss": 4.7544, + "loss/crossentropy": 1.9390615820884705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23259633034467697, + "step": 3538 + }, + { + "epoch": 0.0708, + "grad_norm": 2.515625, + "grad_norm_var": 0.023193359375, + "learning_rate": 0.0001, + "loss": 4.8227, + "loss/crossentropy": 2.3050389289855957, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2738967537879944, + "step": 3540 + }, + { + "epoch": 0.07084, + "grad_norm": 2.265625, + "grad_norm_var": 0.025862630208333334, + "learning_rate": 0.0001, + "loss": 4.4323, + "loss/crossentropy": 2.3832077980041504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29863911867141724, + "step": 3542 + }, + { + "epoch": 0.07088, + "grad_norm": 2.4375, + "grad_norm_var": 0.02822265625, + "learning_rate": 0.0001, + "loss": 4.7101, + "loss/crossentropy": 1.8186699748039246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22366883605718613, + "step": 3544 + }, + { + "epoch": 0.07092, + "grad_norm": 2.46875, + "grad_norm_var": 0.024235026041666666, + "learning_rate": 0.0001, + "loss": 4.8151, + "loss/crossentropy": 2.2650288343429565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2733971029520035, + "step": 3546 + }, + { + "epoch": 0.07096, + "grad_norm": 2.53125, + "grad_norm_var": 0.0191070556640625, + "learning_rate": 0.0001, + "loss": 4.7514, + "loss/crossentropy": 2.432945966720581, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.269818976521492, + "step": 3548 + }, + { + "epoch": 0.071, + "grad_norm": 4.15625, + "grad_norm_var": 0.19250386555989582, + "learning_rate": 0.0001, + "loss": 4.9461, + "loss/crossentropy": 2.021497666835785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2488800287246704, + "step": 3550 + }, + { + "epoch": 0.07104, + "grad_norm": 2.3125, + "grad_norm_var": 0.2001129150390625, + "learning_rate": 0.0001, + "loss": 5.0341, + "loss/crossentropy": 2.0840535163879395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2780974507331848, + "step": 3552 + }, + { + "epoch": 0.07108, + "grad_norm": 2.515625, + "grad_norm_var": 0.19724833170572917, + "learning_rate": 0.0001, + "loss": 4.8123, + "loss/crossentropy": 2.352238416671753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2830911874771118, + "step": 3554 + }, + { + "epoch": 0.07112, + "grad_norm": 2.484375, + "grad_norm_var": 0.19795633951822916, + "learning_rate": 0.0001, + "loss": 4.9632, + "loss/crossentropy": 2.395397186279297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2861028015613556, + "step": 3556 + }, + { + "epoch": 0.07116, + "grad_norm": 2.84375, + "grad_norm_var": 0.19630533854166668, + "learning_rate": 0.0001, + "loss": 5.0203, + "loss/crossentropy": 2.6454248428344727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28888577222824097, + "step": 3558 + }, + { + "epoch": 0.0712, + "grad_norm": 2.609375, + "grad_norm_var": 0.19394429524739584, + "learning_rate": 0.0001, + "loss": 4.8228, + "loss/crossentropy": 2.21127188205719, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2676347494125366, + "step": 3560 + }, + { + "epoch": 0.07124, + "grad_norm": 2.5625, + "grad_norm_var": 0.19245503743489584, + "learning_rate": 0.0001, + "loss": 4.7502, + "loss/crossentropy": 2.0187097787857056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24967321753501892, + "step": 3562 + }, + { + "epoch": 0.07128, + "grad_norm": 2.4375, + "grad_norm_var": 0.19409077962239582, + "learning_rate": 0.0001, + "loss": 4.9548, + "loss/crossentropy": 2.1822619438171387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24893341958522797, + "step": 3564 + }, + { + "epoch": 0.07132, + "grad_norm": 2.65625, + "grad_norm_var": 0.03178609212239583, + "learning_rate": 0.0001, + "loss": 5.0912, + "loss/crossentropy": 2.486607313156128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29665203392505646, + "step": 3566 + }, + { + "epoch": 0.07136, + "grad_norm": 2.546875, + "grad_norm_var": 0.020048014322916665, + "learning_rate": 0.0001, + "loss": 5.0494, + "loss/crossentropy": 2.2631163597106934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.268096499145031, + "step": 3568 + }, + { + "epoch": 0.0714, + "grad_norm": 2.671875, + "grad_norm_var": 0.02760009765625, + "learning_rate": 0.0001, + "loss": 4.7668, + "loss/crossentropy": 2.3393882513046265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29145532846450806, + "step": 3570 + }, + { + "epoch": 0.07144, + "grad_norm": 2.65625, + "grad_norm_var": 0.03980712890625, + "learning_rate": 0.0001, + "loss": 4.8281, + "loss/crossentropy": 2.007299244403839, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30512700974941254, + "step": 3572 + }, + { + "epoch": 0.07148, + "grad_norm": 2.609375, + "grad_norm_var": 0.0369049072265625, + "learning_rate": 0.0001, + "loss": 4.7891, + "loss/crossentropy": 1.9879329800605774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22510841488838196, + "step": 3574 + }, + { + "epoch": 0.07152, + "grad_norm": 2.46875, + "grad_norm_var": 0.03443603515625, + "learning_rate": 0.0001, + "loss": 4.9688, + "loss/crossentropy": 2.387833833694458, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3007010221481323, + "step": 3576 + }, + { + "epoch": 0.07156, + "grad_norm": 2.625, + "grad_norm_var": 0.03664957682291667, + "learning_rate": 0.0001, + "loss": 4.7975, + "loss/crossentropy": 2.3306411504745483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.269440695643425, + "step": 3578 + }, + { + "epoch": 0.0716, + "grad_norm": 2.5, + "grad_norm_var": 0.03462626139322917, + "learning_rate": 0.0001, + "loss": 4.5348, + "loss/crossentropy": 1.8359156847000122, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23353691399097443, + "step": 3580 + }, + { + "epoch": 0.07164, + "grad_norm": 2.5625, + "grad_norm_var": 0.033080037434895834, + "learning_rate": 0.0001, + "loss": 4.9226, + "loss/crossentropy": 2.257680654525757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2764490246772766, + "step": 3582 + }, + { + "epoch": 0.07168, + "grad_norm": 2.4375, + "grad_norm_var": 0.03310445149739583, + "learning_rate": 0.0001, + "loss": 4.6434, + "loss/crossentropy": 2.589483857154846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28172188997268677, + "step": 3584 + }, + { + "epoch": 0.07172, + "grad_norm": 2.390625, + "grad_norm_var": 0.04182840983072917, + "learning_rate": 0.0001, + "loss": 4.5434, + "loss/crossentropy": 1.8202016949653625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22330023348331451, + "step": 3586 + }, + { + "epoch": 0.07176, + "grad_norm": 2.421875, + "grad_norm_var": 0.028609212239583334, + "learning_rate": 0.0001, + "loss": 4.8344, + "loss/crossentropy": 2.0311816334724426, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2622206509113312, + "step": 3588 + }, + { + "epoch": 0.0718, + "grad_norm": 2.296875, + "grad_norm_var": 0.028400675455729166, + "learning_rate": 0.0001, + "loss": 4.5158, + "loss/crossentropy": 1.7991753220558167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23093865811824799, + "step": 3590 + }, + { + "epoch": 0.07184, + "grad_norm": 2.265625, + "grad_norm_var": 0.030924479166666668, + "learning_rate": 0.0001, + "loss": 4.3148, + "loss/crossentropy": 1.6141473054885864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21914401650428772, + "step": 3592 + }, + { + "epoch": 0.07188, + "grad_norm": 2.34375, + "grad_norm_var": 0.0289703369140625, + "learning_rate": 0.0001, + "loss": 4.8007, + "loss/crossentropy": 2.3337208032608032, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2612123265862465, + "step": 3594 + }, + { + "epoch": 0.07192, + "grad_norm": 2.265625, + "grad_norm_var": 0.0322662353515625, + "learning_rate": 0.0001, + "loss": 4.5811, + "loss/crossentropy": 2.191028594970703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2598741352558136, + "step": 3596 + }, + { + "epoch": 0.07196, + "grad_norm": 2.546875, + "grad_norm_var": 0.03178609212239583, + "learning_rate": 0.0001, + "loss": 4.9562, + "loss/crossentropy": 2.0293691158294678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27143266797065735, + "step": 3598 + }, + { + "epoch": 0.072, + "grad_norm": 2.640625, + "grad_norm_var": 0.03437398274739583, + "learning_rate": 0.0001, + "loss": 5.1335, + "loss/crossentropy": 2.1257725954055786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33300966024398804, + "step": 3600 + }, + { + "epoch": 0.07204, + "grad_norm": 2.375, + "grad_norm_var": 0.0171875, + "learning_rate": 0.0001, + "loss": 4.8223, + "loss/crossentropy": 2.296278953552246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28724825382232666, + "step": 3602 + }, + { + "epoch": 0.07208, + "grad_norm": 2.453125, + "grad_norm_var": 0.016731770833333333, + "learning_rate": 0.0001, + "loss": 4.8925, + "loss/crossentropy": 2.258358597755432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26510895788669586, + "step": 3604 + }, + { + "epoch": 0.07212, + "grad_norm": 2.421875, + "grad_norm_var": 0.0170318603515625, + "learning_rate": 0.0001, + "loss": 5.0383, + "loss/crossentropy": 2.0454649925231934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2725224494934082, + "step": 3606 + }, + { + "epoch": 0.07216, + "grad_norm": 2.390625, + "grad_norm_var": 0.015250651041666667, + "learning_rate": 0.0001, + "loss": 4.6582, + "loss/crossentropy": 2.1844204664230347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2729914039373398, + "step": 3608 + }, + { + "epoch": 0.0722, + "grad_norm": 2.484375, + "grad_norm_var": 0.015608723958333333, + "learning_rate": 0.0001, + "loss": 4.4613, + "loss/crossentropy": 1.8897106647491455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26307350397109985, + "step": 3610 + }, + { + "epoch": 0.07224, + "grad_norm": 2.5, + "grad_norm_var": 0.013377888997395834, + "learning_rate": 0.0001, + "loss": 4.5695, + "loss/crossentropy": 1.9441962838172913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24454529583454132, + "step": 3612 + }, + { + "epoch": 0.07228, + "grad_norm": 2.6875, + "grad_norm_var": 0.063623046875, + "learning_rate": 0.0001, + "loss": 4.7654, + "loss/crossentropy": 2.10969078540802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2900787442922592, + "step": 3614 + }, + { + "epoch": 0.07232, + "grad_norm": 2.453125, + "grad_norm_var": 0.06301676432291667, + "learning_rate": 0.0001, + "loss": 4.5195, + "loss/crossentropy": 2.1384644508361816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2722831964492798, + "step": 3616 + }, + { + "epoch": 0.07236, + "grad_norm": 2.296875, + "grad_norm_var": 0.06412760416666667, + "learning_rate": 0.0001, + "loss": 4.4995, + "loss/crossentropy": 2.0648157596588135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24860350787639618, + "step": 3618 + }, + { + "epoch": 0.0724, + "grad_norm": 2.421875, + "grad_norm_var": 0.06441650390625, + "learning_rate": 0.0001, + "loss": 4.7863, + "loss/crossentropy": 2.2188034057617188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2476331740617752, + "step": 3620 + }, + { + "epoch": 0.07244, + "grad_norm": 2.4375, + "grad_norm_var": 0.06516927083333333, + "learning_rate": 0.0001, + "loss": 4.792, + "loss/crossentropy": 2.1361395120620728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24962469190359116, + "step": 3622 + }, + { + "epoch": 0.07248, + "grad_norm": 2.46875, + "grad_norm_var": 0.06457417805989583, + "learning_rate": 0.0001, + "loss": 4.739, + "loss/crossentropy": 2.2646392583847046, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.260420486330986, + "step": 3624 + }, + { + "epoch": 0.07252, + "grad_norm": 2.421875, + "grad_norm_var": 0.0630523681640625, + "learning_rate": 0.0001, + "loss": 4.6937, + "loss/crossentropy": 2.2822424173355103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2585812509059906, + "step": 3626 + }, + { + "epoch": 0.07256, + "grad_norm": 2.203125, + "grad_norm_var": 0.06852925618489583, + "learning_rate": 0.0001, + "loss": 4.2868, + "loss/crossentropy": 1.8197516798973083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24321961402893066, + "step": 3628 + }, + { + "epoch": 0.0726, + "grad_norm": 2.4375, + "grad_norm_var": 0.007649739583333333, + "learning_rate": 0.0001, + "loss": 4.4835, + "loss/crossentropy": 2.1475032567977905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2687895894050598, + "step": 3630 + }, + { + "epoch": 0.07264, + "grad_norm": 2.25, + "grad_norm_var": 0.006538899739583334, + "learning_rate": 0.0001, + "loss": 4.2353, + "loss/crossentropy": 1.9497992992401123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24942665547132492, + "step": 3632 + }, + { + "epoch": 0.07268, + "grad_norm": 2.5, + "grad_norm_var": 0.007059733072916667, + "learning_rate": 0.0001, + "loss": 4.7455, + "loss/crossentropy": 1.8786492347717285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23713821917772293, + "step": 3634 + }, + { + "epoch": 0.07272, + "grad_norm": 2.4375, + "grad_norm_var": 0.00738525390625, + "learning_rate": 0.0001, + "loss": 5.089, + "loss/crossentropy": 2.474532127380371, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3133770227432251, + "step": 3636 + }, + { + "epoch": 0.07276, + "grad_norm": 2.484375, + "grad_norm_var": 0.0073394775390625, + "learning_rate": 0.0001, + "loss": 4.912, + "loss/crossentropy": 2.1231455206871033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2547219544649124, + "step": 3638 + }, + { + "epoch": 0.0728, + "grad_norm": 2.546875, + "grad_norm_var": 0.009007771809895834, + "learning_rate": 0.0001, + "loss": 4.4727, + "loss/crossentropy": 2.0511630177497864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26947300136089325, + "step": 3640 + }, + { + "epoch": 0.07284, + "grad_norm": 2.5625, + "grad_norm_var": 0.01060791015625, + "learning_rate": 0.0001, + "loss": 4.7332, + "loss/crossentropy": 1.7076187133789062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21764510869979858, + "step": 3642 + }, + { + "epoch": 0.07288, + "grad_norm": 2.359375, + "grad_norm_var": 0.009723917643229166, + "learning_rate": 0.0001, + "loss": 4.8396, + "loss/crossentropy": 2.069926142692566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26851218193769455, + "step": 3644 + }, + { + "epoch": 0.07292, + "grad_norm": 2.609375, + "grad_norm_var": 0.010054524739583333, + "learning_rate": 0.0001, + "loss": 5.0802, + "loss/crossentropy": 2.1369277238845825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27878354489803314, + "step": 3646 + }, + { + "epoch": 0.07296, + "grad_norm": 2.375, + "grad_norm_var": 0.008072916666666667, + "learning_rate": 0.0001, + "loss": 4.3659, + "loss/crossentropy": 2.095974624156952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25285808742046356, + "step": 3648 + }, + { + "epoch": 0.073, + "grad_norm": 2.5625, + "grad_norm_var": 0.011888631184895833, + "learning_rate": 0.0001, + "loss": 4.9301, + "loss/crossentropy": 2.240627646446228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24894578754901886, + "step": 3650 + }, + { + "epoch": 0.07304, + "grad_norm": 2.359375, + "grad_norm_var": 0.013899739583333333, + "learning_rate": 0.0001, + "loss": 4.6588, + "loss/crossentropy": 2.2270851135253906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26908691227436066, + "step": 3652 + }, + { + "epoch": 0.07308, + "grad_norm": 2.671875, + "grad_norm_var": 0.017039998372395834, + "learning_rate": 0.0001, + "loss": 4.7385, + "loss/crossentropy": 2.2684017419815063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28442947566509247, + "step": 3654 + }, + { + "epoch": 0.07312, + "grad_norm": 2.46875, + "grad_norm_var": 0.07785542805989583, + "learning_rate": 0.0001, + "loss": 4.7585, + "loss/crossentropy": 2.0922030806541443, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2621888816356659, + "step": 3656 + }, + { + "epoch": 0.07316, + "grad_norm": 2.3125, + "grad_norm_var": 0.08323567708333333, + "learning_rate": 0.0001, + "loss": 4.7425, + "loss/crossentropy": 2.0134947896003723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2717447876930237, + "step": 3658 + }, + { + "epoch": 0.0732, + "grad_norm": 2.421875, + "grad_norm_var": 0.08206278483072917, + "learning_rate": 0.0001, + "loss": 4.5573, + "loss/crossentropy": 1.9246947765350342, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25095127522945404, + "step": 3660 + }, + { + "epoch": 0.07324, + "grad_norm": 2.5625, + "grad_norm_var": 0.08561197916666667, + "learning_rate": 0.0001, + "loss": 4.8616, + "loss/crossentropy": 2.0655113458633423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24166538566350937, + "step": 3662 + }, + { + "epoch": 0.07328, + "grad_norm": 2.46875, + "grad_norm_var": 0.08198954264322916, + "learning_rate": 0.0001, + "loss": 4.9137, + "loss/crossentropy": 2.2706735730171204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26484307646751404, + "step": 3664 + }, + { + "epoch": 0.07332, + "grad_norm": 2.359375, + "grad_norm_var": 0.0883453369140625, + "learning_rate": 0.0001, + "loss": 4.2911, + "loss/crossentropy": 1.7969809770584106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23407897353172302, + "step": 3666 + }, + { + "epoch": 0.07336, + "grad_norm": 2.421875, + "grad_norm_var": 0.09109700520833333, + "learning_rate": 0.0001, + "loss": 4.7081, + "loss/crossentropy": 2.0398870706558228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2502745985984802, + "step": 3668 + }, + { + "epoch": 0.0734, + "grad_norm": 2.40625, + "grad_norm_var": 0.09381103515625, + "learning_rate": 0.0001, + "loss": 4.8738, + "loss/crossentropy": 2.090283453464508, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2936585247516632, + "step": 3670 + }, + { + "epoch": 0.07344, + "grad_norm": 2.5625, + "grad_norm_var": 0.035008748372395836, + "learning_rate": 0.0001, + "loss": 4.8305, + "loss/crossentropy": 2.286925792694092, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27951307594776154, + "step": 3672 + }, + { + "epoch": 0.07348, + "grad_norm": 2.40625, + "grad_norm_var": 0.032059733072916666, + "learning_rate": 0.0001, + "loss": 4.4885, + "loss/crossentropy": 2.0264610052108765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24908316135406494, + "step": 3674 + }, + { + "epoch": 0.07352, + "grad_norm": 2.6875, + "grad_norm_var": 0.0372467041015625, + "learning_rate": 0.0001, + "loss": 4.9239, + "loss/crossentropy": 2.1947755217552185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2865990996360779, + "step": 3676 + }, + { + "epoch": 0.07356, + "grad_norm": 2.703125, + "grad_norm_var": 0.0422515869140625, + "learning_rate": 0.0001, + "loss": 4.8784, + "loss/crossentropy": 2.0050416588783264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23362033069133759, + "step": 3678 + }, + { + "epoch": 0.0736, + "grad_norm": 2.4375, + "grad_norm_var": 0.041112263997395836, + "learning_rate": 0.0001, + "loss": 4.7423, + "loss/crossentropy": 1.8935424089431763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2532464489340782, + "step": 3680 + }, + { + "epoch": 0.07364, + "grad_norm": 2.40625, + "grad_norm_var": 0.0350250244140625, + "learning_rate": 0.0001, + "loss": 4.7389, + "loss/crossentropy": 2.0181053280830383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2544455900788307, + "step": 3682 + }, + { + "epoch": 0.07368, + "grad_norm": 2.359375, + "grad_norm_var": 0.03144124348958333, + "learning_rate": 0.0001, + "loss": 4.7099, + "loss/crossentropy": 2.1172796487808228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27609144151210785, + "step": 3684 + }, + { + "epoch": 0.07372, + "grad_norm": 2.375, + "grad_norm_var": 0.023737589518229168, + "learning_rate": 0.0001, + "loss": 4.7185, + "loss/crossentropy": 2.3926355838775635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28892992436885834, + "step": 3686 + }, + { + "epoch": 0.07376, + "grad_norm": 2.21875, + "grad_norm_var": 0.026883951822916665, + "learning_rate": 0.0001, + "loss": 4.9086, + "loss/crossentropy": 2.2512835264205933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26166096329689026, + "step": 3688 + }, + { + "epoch": 0.0738, + "grad_norm": 2.578125, + "grad_norm_var": 0.0264312744140625, + "learning_rate": 0.0001, + "loss": 4.6311, + "loss/crossentropy": 2.0656538009643555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25399819016456604, + "step": 3690 + }, + { + "epoch": 0.07384, + "grad_norm": 2.453125, + "grad_norm_var": 0.0222808837890625, + "learning_rate": 0.0001, + "loss": 4.9565, + "loss/crossentropy": 2.454928994178772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2964669317007065, + "step": 3692 + }, + { + "epoch": 0.07388, + "grad_norm": 2.4375, + "grad_norm_var": 0.009859212239583333, + "learning_rate": 0.0001, + "loss": 4.5703, + "loss/crossentropy": 1.988040804862976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27345800399780273, + "step": 3694 + }, + { + "epoch": 0.07392, + "grad_norm": 2.671875, + "grad_norm_var": 0.013752237955729166, + "learning_rate": 0.0001, + "loss": 4.9418, + "loss/crossentropy": 1.910742998123169, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24037255346775055, + "step": 3696 + }, + { + "epoch": 0.07396, + "grad_norm": 2.421875, + "grad_norm_var": 0.013700358072916667, + "learning_rate": 0.0001, + "loss": 4.6541, + "loss/crossentropy": 2.19545578956604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2778017520904541, + "step": 3698 + }, + { + "epoch": 0.074, + "grad_norm": 2.46875, + "grad_norm_var": 0.0127838134765625, + "learning_rate": 0.0001, + "loss": 4.3839, + "loss/crossentropy": 2.436691403388977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26254843175411224, + "step": 3700 + }, + { + "epoch": 0.07404, + "grad_norm": 2.5, + "grad_norm_var": 0.012398274739583333, + "learning_rate": 0.0001, + "loss": 4.833, + "loss/crossentropy": 2.7458308935165405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27952516078948975, + "step": 3702 + }, + { + "epoch": 0.07408, + "grad_norm": 2.78125, + "grad_norm_var": 0.014876302083333333, + "learning_rate": 0.0001, + "loss": 4.8308, + "loss/crossentropy": 2.2321633100509644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2633504122495651, + "step": 3704 + }, + { + "epoch": 0.07412, + "grad_norm": 2.359375, + "grad_norm_var": 0.013801066080729167, + "learning_rate": 0.0001, + "loss": 4.8159, + "loss/crossentropy": 1.9883576035499573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25319087505340576, + "step": 3706 + }, + { + "epoch": 0.07416, + "grad_norm": 2.4375, + "grad_norm_var": 0.0147857666015625, + "learning_rate": 0.0001, + "loss": 4.5546, + "loss/crossentropy": 1.7647870182991028, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21732009947299957, + "step": 3708 + }, + { + "epoch": 0.0742, + "grad_norm": 2.734375, + "grad_norm_var": 0.046793619791666664, + "learning_rate": 0.0001, + "loss": 4.9271, + "loss/crossentropy": 2.1113381385803223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25888970494270325, + "step": 3710 + }, + { + "epoch": 0.07424, + "grad_norm": 2.34375, + "grad_norm_var": 0.04690348307291667, + "learning_rate": 0.0001, + "loss": 4.5878, + "loss/crossentropy": 1.975549578666687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23899925500154495, + "step": 3712 + }, + { + "epoch": 0.07428, + "grad_norm": 2.40625, + "grad_norm_var": 0.04664306640625, + "learning_rate": 0.0001, + "loss": 4.9262, + "loss/crossentropy": 2.0562495589256287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31428879499435425, + "step": 3714 + }, + { + "epoch": 0.07432, + "grad_norm": 2.265625, + "grad_norm_var": 0.04951883951822917, + "learning_rate": 0.0001, + "loss": 4.3719, + "loss/crossentropy": 2.114805221557617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25172004848718643, + "step": 3716 + }, + { + "epoch": 0.07436, + "grad_norm": 2.65625, + "grad_norm_var": 0.052783203125, + "learning_rate": 0.0001, + "loss": 4.6032, + "loss/crossentropy": 2.1865739822387695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.252632200717926, + "step": 3718 + }, + { + "epoch": 0.0744, + "grad_norm": 2.53125, + "grad_norm_var": 0.04739176432291667, + "learning_rate": 0.0001, + "loss": 4.8493, + "loss/crossentropy": 2.2550876140594482, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27425335347652435, + "step": 3720 + }, + { + "epoch": 0.07444, + "grad_norm": 2.640625, + "grad_norm_var": 0.047379557291666666, + "learning_rate": 0.0001, + "loss": 4.9072, + "loss/crossentropy": 2.293414354324341, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26979324221611023, + "step": 3722 + }, + { + "epoch": 0.07448, + "grad_norm": 2.59375, + "grad_norm_var": 0.043745930989583334, + "learning_rate": 0.0001, + "loss": 4.4962, + "loss/crossentropy": 2.014510452747345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25709769129753113, + "step": 3724 + }, + { + "epoch": 0.07452, + "grad_norm": 2.40625, + "grad_norm_var": 0.015746053059895834, + "learning_rate": 0.0001, + "loss": 4.6485, + "loss/crossentropy": 2.0332603454589844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2585446834564209, + "step": 3726 + }, + { + "epoch": 0.07456, + "grad_norm": 2.296875, + "grad_norm_var": 0.0165191650390625, + "learning_rate": 0.0001, + "loss": 4.7268, + "loss/crossentropy": 1.9425334930419922, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22921039909124374, + "step": 3728 + }, + { + "epoch": 0.0746, + "grad_norm": 2.71875, + "grad_norm_var": 0.019090779622395835, + "learning_rate": 0.0001, + "loss": 4.9306, + "loss/crossentropy": 2.1233898997306824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26014500856399536, + "step": 3730 + }, + { + "epoch": 0.07464, + "grad_norm": 2.609375, + "grad_norm_var": 0.023566691080729167, + "learning_rate": 0.0001, + "loss": 4.9958, + "loss/crossentropy": 2.3929240703582764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2791339308023453, + "step": 3732 + }, + { + "epoch": 0.07468, + "grad_norm": 2.546875, + "grad_norm_var": 0.021370442708333333, + "learning_rate": 0.0001, + "loss": 4.6072, + "loss/crossentropy": 2.163137674331665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26793332397937775, + "step": 3734 + }, + { + "epoch": 0.07472, + "grad_norm": 2.453125, + "grad_norm_var": 0.024738566080729166, + "learning_rate": 0.0001, + "loss": 4.723, + "loss/crossentropy": 2.1300129294395447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2533607929944992, + "step": 3736 + }, + { + "epoch": 0.07476, + "grad_norm": 2.5, + "grad_norm_var": 0.028055826822916668, + "learning_rate": 0.0001, + "loss": 4.7232, + "loss/crossentropy": 1.9808942675590515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25632843375205994, + "step": 3738 + }, + { + "epoch": 0.0748, + "grad_norm": 2.453125, + "grad_norm_var": 0.03328348795572917, + "learning_rate": 0.0001, + "loss": 4.8219, + "loss/crossentropy": 2.161437451839447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2529585212469101, + "step": 3740 + }, + { + "epoch": 0.07484, + "grad_norm": 2.53125, + "grad_norm_var": 0.03369140625, + "learning_rate": 0.0001, + "loss": 4.6541, + "loss/crossentropy": 1.852737545967102, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2333993762731552, + "step": 3742 + }, + { + "epoch": 0.07488, + "grad_norm": 2.265625, + "grad_norm_var": 0.03439127604166667, + "learning_rate": 0.0001, + "loss": 4.4355, + "loss/crossentropy": 1.664733350276947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20309723168611526, + "step": 3744 + }, + { + "epoch": 0.07492, + "grad_norm": 2.203125, + "grad_norm_var": 0.03276265462239583, + "learning_rate": 0.0001, + "loss": 4.4554, + "loss/crossentropy": 2.1815799474716187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2470541000366211, + "step": 3746 + }, + { + "epoch": 0.07496, + "grad_norm": 2.421875, + "grad_norm_var": 0.024657185872395834, + "learning_rate": 0.0001, + "loss": 4.7423, + "loss/crossentropy": 1.9546562433242798, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2685271203517914, + "step": 3748 + }, + { + "epoch": 0.075, + "grad_norm": 2.25, + "grad_norm_var": 0.024738566080729166, + "learning_rate": 0.0001, + "loss": 4.5171, + "loss/crossentropy": 1.920817255973816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2558089941740036, + "step": 3750 + }, + { + "epoch": 0.07504, + "grad_norm": 2.265625, + "grad_norm_var": 0.024982706705729166, + "learning_rate": 0.0001, + "loss": 4.4795, + "loss/crossentropy": 1.7570490837097168, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2265816479921341, + "step": 3752 + }, + { + "epoch": 0.07508, + "grad_norm": 2.890625, + "grad_norm_var": 0.203564453125, + "learning_rate": 0.0001, + "loss": 4.756, + "loss/crossentropy": 2.1815105676651, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2536233216524124, + "step": 3754 + }, + { + "epoch": 0.07512, + "grad_norm": 2.4375, + "grad_norm_var": 0.19650777180989584, + "learning_rate": 0.0001, + "loss": 4.7178, + "loss/crossentropy": 2.1385504603385925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2445325404405594, + "step": 3756 + }, + { + "epoch": 0.07516, + "grad_norm": 2.34375, + "grad_norm_var": 0.19650777180989584, + "learning_rate": 0.0001, + "loss": 4.6449, + "loss/crossentropy": 1.7325092554092407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2419673353433609, + "step": 3758 + }, + { + "epoch": 0.0752, + "grad_norm": 2.21875, + "grad_norm_var": 0.19572652180989583, + "learning_rate": 0.0001, + "loss": 4.6096, + "loss/crossentropy": 2.358627676963806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2713842839002609, + "step": 3760 + }, + { + "epoch": 0.07524, + "grad_norm": 3.078125, + "grad_norm_var": 0.20392252604166666, + "learning_rate": 0.0001, + "loss": 5.1712, + "loss/crossentropy": 2.048672080039978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3007172644138336, + "step": 3762 + }, + { + "epoch": 0.07528, + "grad_norm": 2.609375, + "grad_norm_var": 0.19885660807291666, + "learning_rate": 0.0001, + "loss": 4.8473, + "loss/crossentropy": 2.2967183589935303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28548331558704376, + "step": 3764 + }, + { + "epoch": 0.07532, + "grad_norm": 2.3125, + "grad_norm_var": 0.19228108723958334, + "learning_rate": 0.0001, + "loss": 4.7541, + "loss/crossentropy": 2.1280438899993896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2825516611337662, + "step": 3766 + }, + { + "epoch": 0.07536, + "grad_norm": 2.375, + "grad_norm_var": 0.19146728515625, + "learning_rate": 0.0001, + "loss": 4.8404, + "loss/crossentropy": 2.5528002977371216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28681764006614685, + "step": 3768 + }, + { + "epoch": 0.0754, + "grad_norm": 2.546875, + "grad_norm_var": 0.0549468994140625, + "learning_rate": 0.0001, + "loss": 4.729, + "loss/crossentropy": 2.235885262489319, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.259520560503006, + "step": 3770 + }, + { + "epoch": 0.07544, + "grad_norm": 2.421875, + "grad_norm_var": 0.05718994140625, + "learning_rate": 0.0001, + "loss": 4.4705, + "loss/crossentropy": 1.8836966753005981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2360890954732895, + "step": 3772 + }, + { + "epoch": 0.07548, + "grad_norm": 2.6875, + "grad_norm_var": 0.056538899739583336, + "learning_rate": 0.0001, + "loss": 4.9291, + "loss/crossentropy": 2.3396376371383667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30392636358737946, + "step": 3774 + }, + { + "epoch": 0.07552, + "grad_norm": 2.359375, + "grad_norm_var": 0.051301066080729166, + "learning_rate": 0.0001, + "loss": 4.7518, + "loss/crossentropy": 2.4024877548217773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2858506590127945, + "step": 3776 + }, + { + "epoch": 0.07556, + "grad_norm": 2.25, + "grad_norm_var": 0.0353515625, + "learning_rate": 0.0001, + "loss": 4.4073, + "loss/crossentropy": 2.138229727745056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25458595901727676, + "step": 3778 + }, + { + "epoch": 0.0756, + "grad_norm": 2.328125, + "grad_norm_var": 0.0359375, + "learning_rate": 0.0001, + "loss": 4.3882, + "loss/crossentropy": 1.8413254618644714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23022352159023285, + "step": 3780 + }, + { + "epoch": 0.07564, + "grad_norm": 2.390625, + "grad_norm_var": 0.034764607747395836, + "learning_rate": 0.0001, + "loss": 4.613, + "loss/crossentropy": 1.8554572463035583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25644390285015106, + "step": 3782 + }, + { + "epoch": 0.07568, + "grad_norm": 2.34375, + "grad_norm_var": 0.018452962239583332, + "learning_rate": 0.0001, + "loss": 4.7013, + "loss/crossentropy": 2.0096731781959534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25370142608880997, + "step": 3784 + }, + { + "epoch": 0.07572, + "grad_norm": 2.65625, + "grad_norm_var": 0.020018513997395834, + "learning_rate": 0.0001, + "loss": 4.9317, + "loss/crossentropy": 1.7932413220405579, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2426912784576416, + "step": 3786 + }, + { + "epoch": 0.07576, + "grad_norm": 2.125, + "grad_norm_var": 0.0263336181640625, + "learning_rate": 0.0001, + "loss": 4.1599, + "loss/crossentropy": 2.0372042655944824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24203844368457794, + "step": 3788 + }, + { + "epoch": 0.0758, + "grad_norm": 2.296875, + "grad_norm_var": 0.021773274739583334, + "learning_rate": 0.0001, + "loss": 4.3627, + "loss/crossentropy": 1.8986076712608337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24614746868610382, + "step": 3790 + }, + { + "epoch": 0.07584, + "grad_norm": 2.234375, + "grad_norm_var": 0.022135416666666668, + "learning_rate": 0.0001, + "loss": 4.563, + "loss/crossentropy": 1.8080393075942993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23206676542758942, + "step": 3792 + }, + { + "epoch": 0.07588, + "grad_norm": 2.25, + "grad_norm_var": 0.015067545572916667, + "learning_rate": 0.0001, + "loss": 4.6857, + "loss/crossentropy": 1.7578041553497314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21686340868473053, + "step": 3794 + }, + { + "epoch": 0.07592, + "grad_norm": 2.40625, + "grad_norm_var": 0.015087890625, + "learning_rate": 0.0001, + "loss": 4.4938, + "loss/crossentropy": 2.0115376710891724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2651440501213074, + "step": 3796 + }, + { + "epoch": 0.07596, + "grad_norm": 2.40625, + "grad_norm_var": 0.015803019205729168, + "learning_rate": 0.0001, + "loss": 4.598, + "loss/crossentropy": 2.028555393218994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26161982119083405, + "step": 3798 + }, + { + "epoch": 0.076, + "grad_norm": 2.515625, + "grad_norm_var": 0.016429646809895834, + "learning_rate": 0.0001, + "loss": 4.8315, + "loss/crossentropy": 2.158663272857666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28110067546367645, + "step": 3800 + }, + { + "epoch": 0.07604, + "grad_norm": 2.734375, + "grad_norm_var": 0.019652303059895834, + "learning_rate": 0.0001, + "loss": 5.2376, + "loss/crossentropy": 2.2959556579589844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28477419912815094, + "step": 3802 + }, + { + "epoch": 0.07608, + "grad_norm": 2.296875, + "grad_norm_var": 0.014997355143229167, + "learning_rate": 0.0001, + "loss": 4.5289, + "loss/crossentropy": 2.149766206741333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25216003507375717, + "step": 3804 + }, + { + "epoch": 0.07612, + "grad_norm": 2.390625, + "grad_norm_var": 0.01416015625, + "learning_rate": 0.0001, + "loss": 4.7496, + "loss/crossentropy": 1.9866302609443665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26193149387836456, + "step": 3806 + }, + { + "epoch": 0.07616, + "grad_norm": 2.46875, + "grad_norm_var": 0.0122955322265625, + "learning_rate": 0.0001, + "loss": 4.9968, + "loss/crossentropy": 2.4230403900146484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27897247672080994, + "step": 3808 + }, + { + "epoch": 0.0762, + "grad_norm": 2.65625, + "grad_norm_var": 0.012613932291666666, + "learning_rate": 0.0001, + "loss": 4.9133, + "loss/crossentropy": 2.2995522022247314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27623192965984344, + "step": 3810 + }, + { + "epoch": 0.07624, + "grad_norm": 2.484375, + "grad_norm_var": 0.013923136393229167, + "learning_rate": 0.0001, + "loss": 4.6632, + "loss/crossentropy": 2.167468547821045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26121728122234344, + "step": 3812 + }, + { + "epoch": 0.07628, + "grad_norm": 2.5, + "grad_norm_var": 0.013353474934895833, + "learning_rate": 0.0001, + "loss": 4.8435, + "loss/crossentropy": 2.3259944915771484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29026439785957336, + "step": 3814 + }, + { + "epoch": 0.07632, + "grad_norm": 2.46875, + "grad_norm_var": 0.041825358072916666, + "learning_rate": 0.0001, + "loss": 4.8704, + "loss/crossentropy": 2.18080472946167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2550061345100403, + "step": 3816 + }, + { + "epoch": 0.07636, + "grad_norm": 2.390625, + "grad_norm_var": 0.03871968587239583, + "learning_rate": 0.0001, + "loss": 4.5681, + "loss/crossentropy": 2.1185330748558044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25014883279800415, + "step": 3818 + }, + { + "epoch": 0.0764, + "grad_norm": 2.28125, + "grad_norm_var": 0.039383951822916666, + "learning_rate": 0.0001, + "loss": 4.5776, + "loss/crossentropy": 1.9028193354606628, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22502654790878296, + "step": 3820 + }, + { + "epoch": 0.07644, + "grad_norm": 2.640625, + "grad_norm_var": 0.039713541666666664, + "learning_rate": 0.0001, + "loss": 5.0188, + "loss/crossentropy": 2.266402840614319, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24728389829397202, + "step": 3822 + }, + { + "epoch": 0.07648, + "grad_norm": 2.546875, + "grad_norm_var": 0.04279683430989583, + "learning_rate": 0.0001, + "loss": 4.7036, + "loss/crossentropy": 2.0918440222740173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2669295519590378, + "step": 3824 + }, + { + "epoch": 0.07652, + "grad_norm": 2.296875, + "grad_norm_var": 0.04456380208333333, + "learning_rate": 0.0001, + "loss": 4.1446, + "loss/crossentropy": 1.6120481491088867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21910656243562698, + "step": 3826 + }, + { + "epoch": 0.07656, + "grad_norm": 2.609375, + "grad_norm_var": 0.044331868489583336, + "learning_rate": 0.0001, + "loss": 4.6352, + "loss/crossentropy": 2.2294809818267822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2720891535282135, + "step": 3828 + }, + { + "epoch": 0.0766, + "grad_norm": 2.765625, + "grad_norm_var": 0.0524322509765625, + "learning_rate": 0.0001, + "loss": 4.873, + "loss/crossentropy": 2.2588730454444885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2613115608692169, + "step": 3830 + }, + { + "epoch": 0.07664, + "grad_norm": 2.28125, + "grad_norm_var": 0.02584228515625, + "learning_rate": 0.0001, + "loss": 4.5881, + "loss/crossentropy": 2.2658292055130005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2729395925998688, + "step": 3832 + }, + { + "epoch": 0.07668, + "grad_norm": 2.484375, + "grad_norm_var": 0.02662353515625, + "learning_rate": 0.0001, + "loss": 5.1787, + "loss/crossentropy": 2.314574718475342, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28539060056209564, + "step": 3834 + }, + { + "epoch": 0.07672, + "grad_norm": 2.546875, + "grad_norm_var": 0.024738566080729166, + "learning_rate": 0.0001, + "loss": 5.0388, + "loss/crossentropy": 2.4684417247772217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29843954741954803, + "step": 3836 + }, + { + "epoch": 0.07676, + "grad_norm": 2.546875, + "grad_norm_var": 0.025944010416666666, + "learning_rate": 0.0001, + "loss": 4.5976, + "loss/crossentropy": 2.528733253479004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2696636766195297, + "step": 3838 + }, + { + "epoch": 0.0768, + "grad_norm": 2.390625, + "grad_norm_var": 0.025340779622395834, + "learning_rate": 0.0001, + "loss": 4.6449, + "loss/crossentropy": 2.203901529312134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26598773896694183, + "step": 3840 + }, + { + "epoch": 0.07684, + "grad_norm": 2.5625, + "grad_norm_var": 0.021907552083333334, + "learning_rate": 0.0001, + "loss": 4.7322, + "loss/crossentropy": 1.9192892909049988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25631098449230194, + "step": 3842 + }, + { + "epoch": 0.07688, + "grad_norm": 2.3125, + "grad_norm_var": 0.022077433268229165, + "learning_rate": 0.0001, + "loss": 4.6372, + "loss/crossentropy": 1.8314838409423828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22842589765787125, + "step": 3844 + }, + { + "epoch": 0.07692, + "grad_norm": 2.3125, + "grad_norm_var": 0.013395182291666667, + "learning_rate": 0.0001, + "loss": 4.6023, + "loss/crossentropy": 2.2416744232177734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24890189617872238, + "step": 3846 + }, + { + "epoch": 0.07696, + "grad_norm": 2.46875, + "grad_norm_var": 0.012580362955729167, + "learning_rate": 0.0001, + "loss": 4.8454, + "loss/crossentropy": 2.034749209880829, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29580502212047577, + "step": 3848 + }, + { + "epoch": 0.077, + "grad_norm": 2.4375, + "grad_norm_var": 0.0117095947265625, + "learning_rate": 0.0001, + "loss": 4.5923, + "loss/crossentropy": 1.9982805848121643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2618003487586975, + "step": 3850 + }, + { + "epoch": 0.07704, + "grad_norm": 2.59375, + "grad_norm_var": 0.0127105712890625, + "learning_rate": 0.0001, + "loss": 4.7704, + "loss/crossentropy": 2.065816104412079, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2571987137198448, + "step": 3852 + }, + { + "epoch": 0.07708, + "grad_norm": 2.46875, + "grad_norm_var": 0.0100982666015625, + "learning_rate": 0.0001, + "loss": 4.7493, + "loss/crossentropy": 1.933334231376648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24824900180101395, + "step": 3854 + }, + { + "epoch": 0.07712, + "grad_norm": 2.28125, + "grad_norm_var": 0.010514322916666667, + "learning_rate": 0.0001, + "loss": 4.5805, + "loss/crossentropy": 1.9197405576705933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2559673935174942, + "step": 3856 + }, + { + "epoch": 0.07716, + "grad_norm": 2.3125, + "grad_norm_var": 0.010252888997395833, + "learning_rate": 0.0001, + "loss": 4.429, + "loss/crossentropy": 2.307250142097473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2613677680492401, + "step": 3858 + }, + { + "epoch": 0.0772, + "grad_norm": 2.28125, + "grad_norm_var": 0.01011962890625, + "learning_rate": 0.0001, + "loss": 4.4494, + "loss/crossentropy": 2.1120635271072388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24830741435289383, + "step": 3860 + }, + { + "epoch": 0.07724, + "grad_norm": 2.484375, + "grad_norm_var": 0.039013671875, + "learning_rate": 0.0001, + "loss": 4.8585, + "loss/crossentropy": 2.404169201850891, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2960711419582367, + "step": 3862 + }, + { + "epoch": 0.07728, + "grad_norm": 2.421875, + "grad_norm_var": 0.038899739583333336, + "learning_rate": 0.0001, + "loss": 5.0257, + "loss/crossentropy": 2.2490307688713074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2569812461733818, + "step": 3864 + }, + { + "epoch": 0.07732, + "grad_norm": 2.34375, + "grad_norm_var": 0.0386871337890625, + "learning_rate": 0.0001, + "loss": 4.9086, + "loss/crossentropy": 2.0773178339004517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2505042105913162, + "step": 3866 + }, + { + "epoch": 0.07736, + "grad_norm": 2.40625, + "grad_norm_var": 0.038899739583333336, + "learning_rate": 0.0001, + "loss": 4.4621, + "loss/crossentropy": 1.83626389503479, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22876176983118057, + "step": 3868 + }, + { + "epoch": 0.0774, + "grad_norm": 2.34375, + "grad_norm_var": 0.03911031087239583, + "learning_rate": 0.0001, + "loss": 4.5298, + "loss/crossentropy": 1.8159971833229065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22787011414766312, + "step": 3870 + }, + { + "epoch": 0.07744, + "grad_norm": 2.453125, + "grad_norm_var": 0.0378326416015625, + "learning_rate": 0.0001, + "loss": 4.5995, + "loss/crossentropy": 2.0361026525497437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23802263289690018, + "step": 3872 + }, + { + "epoch": 0.07748, + "grad_norm": 2.703125, + "grad_norm_var": 0.04433492024739583, + "learning_rate": 0.0001, + "loss": 4.9506, + "loss/crossentropy": 2.2464375495910645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26598919928073883, + "step": 3874 + }, + { + "epoch": 0.07752, + "grad_norm": 2.453125, + "grad_norm_var": 0.0408355712890625, + "learning_rate": 0.0001, + "loss": 4.9416, + "loss/crossentropy": 2.163287401199341, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29068198800086975, + "step": 3876 + }, + { + "epoch": 0.07756, + "grad_norm": 2.5, + "grad_norm_var": 0.22014567057291667, + "learning_rate": 0.0001, + "loss": 4.9478, + "loss/crossentropy": 2.1638875007629395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2609352171421051, + "step": 3878 + }, + { + "epoch": 0.0776, + "grad_norm": 2.703125, + "grad_norm_var": 0.21923421223958334, + "learning_rate": 0.0001, + "loss": 4.6247, + "loss/crossentropy": 1.9216270446777344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25062238425016403, + "step": 3880 + }, + { + "epoch": 0.07764, + "grad_norm": 2.828125, + "grad_norm_var": 0.21585184733072918, + "learning_rate": 0.0001, + "loss": 5.0786, + "loss/crossentropy": 2.036958694458008, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2897229939699173, + "step": 3882 + }, + { + "epoch": 0.07768, + "grad_norm": 2.5625, + "grad_norm_var": 0.2094635009765625, + "learning_rate": 0.0001, + "loss": 4.6062, + "loss/crossentropy": 2.1493492126464844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.258526012301445, + "step": 3884 + }, + { + "epoch": 0.07772, + "grad_norm": 2.484375, + "grad_norm_var": 0.20414937337239583, + "learning_rate": 0.0001, + "loss": 4.468, + "loss/crossentropy": 2.0496288537979126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2386137992143631, + "step": 3886 + }, + { + "epoch": 0.07776, + "grad_norm": 2.40625, + "grad_norm_var": 0.20829671223958332, + "learning_rate": 0.0001, + "loss": 4.2402, + "loss/crossentropy": 1.5763422846794128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2227308303117752, + "step": 3888 + }, + { + "epoch": 0.0778, + "grad_norm": 2.640625, + "grad_norm_var": 0.20852864583333333, + "learning_rate": 0.0001, + "loss": 5.0582, + "loss/crossentropy": 2.5032416582107544, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3047761619091034, + "step": 3890 + }, + { + "epoch": 0.07784, + "grad_norm": 2.375, + "grad_norm_var": 0.21199544270833334, + "learning_rate": 0.0001, + "loss": 4.7694, + "loss/crossentropy": 2.3609601259231567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27848154306411743, + "step": 3892 + }, + { + "epoch": 0.07788, + "grad_norm": 2.328125, + "grad_norm_var": 0.0202056884765625, + "learning_rate": 0.0001, + "loss": 4.776, + "loss/crossentropy": 2.188078999519348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2570301741361618, + "step": 3894 + }, + { + "epoch": 0.07792, + "grad_norm": 2.546875, + "grad_norm_var": 0.018733723958333334, + "learning_rate": 0.0001, + "loss": 4.8374, + "loss/crossentropy": 1.9860637784004211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25376833230257034, + "step": 3896 + }, + { + "epoch": 0.07796, + "grad_norm": 2.234375, + "grad_norm_var": 0.010399373372395833, + "learning_rate": 0.0001, + "loss": 4.404, + "loss/crossentropy": 2.0886037945747375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24766233563423157, + "step": 3898 + }, + { + "epoch": 0.078, + "grad_norm": 2.3125, + "grad_norm_var": 0.009845987955729166, + "learning_rate": 0.0001, + "loss": 4.6833, + "loss/crossentropy": 2.373010039329529, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27916407585144043, + "step": 3900 + }, + { + "epoch": 0.07804, + "grad_norm": 2.359375, + "grad_norm_var": 0.009837849934895834, + "learning_rate": 0.0001, + "loss": 4.6295, + "loss/crossentropy": 1.6733890771865845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2211885154247284, + "step": 3902 + }, + { + "epoch": 0.07808, + "grad_norm": 2.46875, + "grad_norm_var": 0.010184733072916667, + "learning_rate": 0.0001, + "loss": 4.6588, + "loss/crossentropy": 2.0506675243377686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27091431617736816, + "step": 3904 + }, + { + "epoch": 0.07812, + "grad_norm": 2.28125, + "grad_norm_var": 0.008784993489583334, + "learning_rate": 0.0001, + "loss": 4.712, + "loss/crossentropy": 2.3200724124908447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26983049511909485, + "step": 3906 + }, + { + "epoch": 0.07816, + "grad_norm": 2.765625, + "grad_norm_var": 0.0164459228515625, + "learning_rate": 0.0001, + "loss": 4.7171, + "loss/crossentropy": 1.928814709186554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2423655241727829, + "step": 3908 + }, + { + "epoch": 0.0782, + "grad_norm": 2.40625, + "grad_norm_var": 0.016389973958333335, + "learning_rate": 0.0001, + "loss": 4.6944, + "loss/crossentropy": 2.007555842399597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2444767728447914, + "step": 3910 + }, + { + "epoch": 0.07824, + "grad_norm": 2.875, + "grad_norm_var": 0.027469889322916666, + "learning_rate": 0.0001, + "loss": 4.7955, + "loss/crossentropy": 2.2054057121276855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2849784791469574, + "step": 3912 + }, + { + "epoch": 0.07828, + "grad_norm": 2.53125, + "grad_norm_var": 0.03986002604166667, + "learning_rate": 0.0001, + "loss": 4.6885, + "loss/crossentropy": 2.331532597541809, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2820790112018585, + "step": 3914 + }, + { + "epoch": 0.07832, + "grad_norm": 2.40625, + "grad_norm_var": 0.036519368489583336, + "learning_rate": 0.0001, + "loss": 4.6967, + "loss/crossentropy": 2.142041563987732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25799722969532013, + "step": 3916 + }, + { + "epoch": 0.07836, + "grad_norm": 2.625, + "grad_norm_var": 0.0394683837890625, + "learning_rate": 0.0001, + "loss": 4.5415, + "loss/crossentropy": 2.010735809803009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24202881753444672, + "step": 3918 + }, + { + "epoch": 0.0784, + "grad_norm": 2.4375, + "grad_norm_var": 0.03970947265625, + "learning_rate": 0.0001, + "loss": 5.037, + "loss/crossentropy": 2.382808804512024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27038049697875977, + "step": 3920 + }, + { + "epoch": 0.07844, + "grad_norm": 2.375, + "grad_norm_var": 0.0372955322265625, + "learning_rate": 0.0001, + "loss": 4.7708, + "loss/crossentropy": 2.099658191204071, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2557126358151436, + "step": 3922 + }, + { + "epoch": 0.07848, + "grad_norm": 2.375, + "grad_norm_var": 0.03968098958333333, + "learning_rate": 0.0001, + "loss": 4.3775, + "loss/crossentropy": 1.7840275764465332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22102414071559906, + "step": 3924 + }, + { + "epoch": 0.07852, + "grad_norm": 2.5, + "grad_norm_var": 0.039159138997395836, + "learning_rate": 0.0001, + "loss": 4.6637, + "loss/crossentropy": 1.8730725049972534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23219943791627884, + "step": 3926 + }, + { + "epoch": 0.07856, + "grad_norm": 2.203125, + "grad_norm_var": 0.034520467122395836, + "learning_rate": 0.0001, + "loss": 4.4432, + "loss/crossentropy": 1.9218623638153076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23333143442869186, + "step": 3928 + }, + { + "epoch": 0.0786, + "grad_norm": 2.46875, + "grad_norm_var": 0.0152740478515625, + "learning_rate": 0.0001, + "loss": 4.8944, + "loss/crossentropy": 1.9885727763175964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2596626430749893, + "step": 3930 + }, + { + "epoch": 0.07864, + "grad_norm": 2.390625, + "grad_norm_var": 0.014989217122395834, + "learning_rate": 0.0001, + "loss": 4.9358, + "loss/crossentropy": 2.397018015384674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2790713906288147, + "step": 3932 + }, + { + "epoch": 0.07868, + "grad_norm": 2.1875, + "grad_norm_var": 0.014383951822916666, + "learning_rate": 0.0001, + "loss": 4.3062, + "loss/crossentropy": 1.7345170378684998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23668452352285385, + "step": 3934 + }, + { + "epoch": 0.07872, + "grad_norm": 2.25, + "grad_norm_var": 0.013309733072916666, + "learning_rate": 0.0001, + "loss": 4.6173, + "loss/crossentropy": 1.8630162477493286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2371346428990364, + "step": 3936 + }, + { + "epoch": 0.07876, + "grad_norm": 2.40625, + "grad_norm_var": 0.010367838541666667, + "learning_rate": 0.0001, + "loss": 4.5774, + "loss/crossentropy": 2.0738128423690796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24340355396270752, + "step": 3938 + }, + { + "epoch": 0.0788, + "grad_norm": 2.40625, + "grad_norm_var": 0.010835774739583333, + "learning_rate": 0.0001, + "loss": 5.0027, + "loss/crossentropy": 2.2932467460632324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2527567446231842, + "step": 3940 + }, + { + "epoch": 0.07884, + "grad_norm": 2.46875, + "grad_norm_var": 0.009956868489583333, + "learning_rate": 0.0001, + "loss": 4.6785, + "loss/crossentropy": 2.0000113248825073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23461253196001053, + "step": 3942 + }, + { + "epoch": 0.07888, + "grad_norm": 2.296875, + "grad_norm_var": 0.008430989583333333, + "learning_rate": 0.0001, + "loss": 4.7307, + "loss/crossentropy": 2.0753955841064453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2535083442926407, + "step": 3944 + }, + { + "epoch": 0.07892, + "grad_norm": 2.40625, + "grad_norm_var": 0.008153279622395834, + "learning_rate": 0.0001, + "loss": 4.6906, + "loss/crossentropy": 2.167261242866516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24734552949666977, + "step": 3946 + }, + { + "epoch": 0.07896, + "grad_norm": 2.578125, + "grad_norm_var": 0.0108795166015625, + "learning_rate": 0.0001, + "loss": 4.7347, + "loss/crossentropy": 1.9755831956863403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24861737340688705, + "step": 3948 + }, + { + "epoch": 0.079, + "grad_norm": 2.484375, + "grad_norm_var": 0.009261067708333333, + "learning_rate": 0.0001, + "loss": 4.6813, + "loss/crossentropy": 2.195417881011963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2518697530031204, + "step": 3950 + }, + { + "epoch": 0.07904, + "grad_norm": 2.828125, + "grad_norm_var": 0.01920166015625, + "learning_rate": 0.0001, + "loss": 5.0172, + "loss/crossentropy": 2.5771371126174927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28423887491226196, + "step": 3952 + }, + { + "epoch": 0.07908, + "grad_norm": 2.375, + "grad_norm_var": 0.017870076497395835, + "learning_rate": 0.0001, + "loss": 4.7071, + "loss/crossentropy": 1.683276355266571, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2075405865907669, + "step": 3954 + }, + { + "epoch": 0.07912, + "grad_norm": 2.515625, + "grad_norm_var": 0.018993123372395834, + "learning_rate": 0.0001, + "loss": 4.7128, + "loss/crossentropy": 2.2983756065368652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28010235726833344, + "step": 3956 + }, + { + "epoch": 0.07916, + "grad_norm": 2.421875, + "grad_norm_var": 0.020361328125, + "learning_rate": 0.0001, + "loss": 4.7896, + "loss/crossentropy": 2.3263272047042847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28625819087028503, + "step": 3958 + }, + { + "epoch": 0.0792, + "grad_norm": 2.40625, + "grad_norm_var": 0.0204742431640625, + "learning_rate": 0.0001, + "loss": 4.5201, + "loss/crossentropy": 1.9820871353149414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2594129145145416, + "step": 3960 + }, + { + "epoch": 0.07924, + "grad_norm": 2.171875, + "grad_norm_var": 0.025634765625, + "learning_rate": 0.0001, + "loss": 4.4754, + "loss/crossentropy": 1.8991515636444092, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23694587498903275, + "step": 3962 + }, + { + "epoch": 0.07928, + "grad_norm": 2.921875, + "grad_norm_var": 0.03843994140625, + "learning_rate": 0.0001, + "loss": 4.9865, + "loss/crossentropy": 2.485508918762207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2957809865474701, + "step": 3964 + }, + { + "epoch": 0.07932, + "grad_norm": 2.390625, + "grad_norm_var": 0.037262980143229166, + "learning_rate": 0.0001, + "loss": 4.8871, + "loss/crossentropy": 2.156081974506378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27474651485681534, + "step": 3966 + }, + { + "epoch": 0.07936, + "grad_norm": 2.5625, + "grad_norm_var": 0.028544108072916668, + "learning_rate": 0.0001, + "loss": 4.8694, + "loss/crossentropy": 2.0370571613311768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2732074484229088, + "step": 3968 + }, + { + "epoch": 0.0794, + "grad_norm": 2.421875, + "grad_norm_var": 0.028251139322916667, + "learning_rate": 0.0001, + "loss": 5.062, + "loss/crossentropy": 2.3039989471435547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31054411828517914, + "step": 3970 + }, + { + "epoch": 0.07944, + "grad_norm": 2.34375, + "grad_norm_var": 0.028641764322916666, + "learning_rate": 0.0001, + "loss": 4.7875, + "loss/crossentropy": 2.2280107736587524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2711277902126312, + "step": 3972 + }, + { + "epoch": 0.07948, + "grad_norm": 2.625, + "grad_norm_var": 0.02935791015625, + "learning_rate": 0.0001, + "loss": 4.8992, + "loss/crossentropy": 2.0609869956970215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24142058193683624, + "step": 3974 + }, + { + "epoch": 0.07952, + "grad_norm": 2.296875, + "grad_norm_var": 0.03277587890625, + "learning_rate": 0.0001, + "loss": 4.1391, + "loss/crossentropy": 2.0541876554489136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24537865817546844, + "step": 3976 + }, + { + "epoch": 0.07956, + "grad_norm": 2.578125, + "grad_norm_var": 0.6102701822916666, + "learning_rate": 0.0001, + "loss": 4.9591, + "loss/crossentropy": 2.344236969947815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2770863175392151, + "step": 3978 + }, + { + "epoch": 0.0796, + "grad_norm": 2.625, + "grad_norm_var": 0.65465087890625, + "learning_rate": 0.0001, + "loss": 4.7232, + "loss/crossentropy": 1.7899338603019714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23951984196901321, + "step": 3980 + }, + { + "epoch": 0.07964, + "grad_norm": 2.328125, + "grad_norm_var": 0.6591471354166667, + "learning_rate": 0.0001, + "loss": 4.6771, + "loss/crossentropy": 2.3253976106643677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2691802680492401, + "step": 3982 + }, + { + "epoch": 0.07968, + "grad_norm": 2.453125, + "grad_norm_var": 0.65885009765625, + "learning_rate": 0.0001, + "loss": 4.8223, + "loss/crossentropy": 2.14141583442688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2603989467024803, + "step": 3984 + }, + { + "epoch": 0.07972, + "grad_norm": 2.53125, + "grad_norm_var": 0.6512196858723959, + "learning_rate": 0.0001, + "loss": 4.9059, + "loss/crossentropy": 2.262465476989746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.291474387049675, + "step": 3986 + }, + { + "epoch": 0.07976, + "grad_norm": 2.328125, + "grad_norm_var": 0.66064453125, + "learning_rate": 0.0001, + "loss": 4.5626, + "loss/crossentropy": 2.1835561990737915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23615659773349762, + "step": 3988 + }, + { + "epoch": 0.0798, + "grad_norm": 2.359375, + "grad_norm_var": 0.6716145833333333, + "learning_rate": 0.0001, + "loss": 4.8482, + "loss/crossentropy": 2.020140767097473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2593151703476906, + "step": 3990 + }, + { + "epoch": 0.07984, + "grad_norm": 2.59375, + "grad_norm_var": 0.6518513997395833, + "learning_rate": 0.0001, + "loss": 4.7986, + "loss/crossentropy": 2.277661681175232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2721617519855499, + "step": 3992 + }, + { + "epoch": 0.07988, + "grad_norm": 2.34375, + "grad_norm_var": 0.09846089680989584, + "learning_rate": 0.0001, + "loss": 4.5113, + "loss/crossentropy": 2.1883193254470825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2610222101211548, + "step": 3994 + }, + { + "epoch": 0.07992, + "grad_norm": 2.421875, + "grad_norm_var": 0.01754150390625, + "learning_rate": 0.0001, + "loss": 4.5987, + "loss/crossentropy": 2.152850031852722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27418845891952515, + "step": 3996 + }, + { + "epoch": 0.07996, + "grad_norm": 2.546875, + "grad_norm_var": 0.017513020833333334, + "learning_rate": 0.0001, + "loss": 4.8214, + "loss/crossentropy": 2.3313716650009155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27529503405094147, + "step": 3998 + }, + { + "epoch": 0.08, + "grad_norm": 2.640625, + "grad_norm_var": 0.020319620768229168, + "learning_rate": 0.0001, + "loss": 5.086, + "loss/crossentropy": 2.250498414039612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26368021965026855, + "step": 4000 + }, + { + "epoch": 0.08004, + "grad_norm": 2.4375, + "grad_norm_var": 0.011188761393229166, + "learning_rate": 0.0001, + "loss": 4.8005, + "loss/crossentropy": 2.322459101676941, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2868216335773468, + "step": 4002 + }, + { + "epoch": 0.08008, + "grad_norm": 2.40625, + "grad_norm_var": 0.0142730712890625, + "learning_rate": 0.0001, + "loss": 4.8693, + "loss/crossentropy": 1.9340506792068481, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2316983863711357, + "step": 4004 + }, + { + "epoch": 0.08012, + "grad_norm": 2.359375, + "grad_norm_var": 0.0150054931640625, + "learning_rate": 0.0001, + "loss": 4.7395, + "loss/crossentropy": 1.8635645508766174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22694773972034454, + "step": 4006 + }, + { + "epoch": 0.08016, + "grad_norm": 10.375, + "grad_norm_var": 3.9615631103515625, + "learning_rate": 0.0001, + "loss": 4.8916, + "loss/crossentropy": 1.9252317547798157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24893560260534286, + "step": 4008 + }, + { + "epoch": 0.0802, + "grad_norm": 2.671875, + "grad_norm_var": 3.9093424479166665, + "learning_rate": 0.0001, + "loss": 5.2636, + "loss/crossentropy": 2.1964328289031982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2567693591117859, + "step": 4010 + }, + { + "epoch": 0.08024, + "grad_norm": 2.671875, + "grad_norm_var": 3.8960113525390625, + "learning_rate": 0.0001, + "loss": 4.9054, + "loss/crossentropy": 2.296012043952942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27360107749700546, + "step": 4012 + }, + { + "epoch": 0.08028, + "grad_norm": 2.453125, + "grad_norm_var": 3.9009724934895833, + "learning_rate": 0.0001, + "loss": 4.8894, + "loss/crossentropy": 2.360015869140625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27541905641555786, + "step": 4014 + }, + { + "epoch": 0.08032, + "grad_norm": 2.390625, + "grad_norm_var": 3.906591796875, + "learning_rate": 0.0001, + "loss": 4.8865, + "loss/crossentropy": 2.36370050907135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27881547808647156, + "step": 4016 + }, + { + "epoch": 0.08036, + "grad_norm": 2.25, + "grad_norm_var": 3.9068593343098956, + "learning_rate": 0.0001, + "loss": 4.6461, + "loss/crossentropy": 1.8704780340194702, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2512069493532181, + "step": 4018 + }, + { + "epoch": 0.0804, + "grad_norm": 2.4375, + "grad_norm_var": 3.9156483968098956, + "learning_rate": 0.0001, + "loss": 4.733, + "loss/crossentropy": 2.1989234685897827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2755106985569, + "step": 4020 + }, + { + "epoch": 0.08044, + "grad_norm": 2.21875, + "grad_norm_var": 3.9420237223307293, + "learning_rate": 0.0001, + "loss": 4.3471, + "loss/crossentropy": 1.9905433058738708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2579897418618202, + "step": 4022 + }, + { + "epoch": 0.08048, + "grad_norm": 2.515625, + "grad_norm_var": 0.08837788899739583, + "learning_rate": 0.0001, + "loss": 4.9025, + "loss/crossentropy": 2.270000696182251, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2693728804588318, + "step": 4024 + }, + { + "epoch": 0.08052, + "grad_norm": 2.375, + "grad_norm_var": 0.08504130045572916, + "learning_rate": 0.0001, + "loss": 4.7569, + "loss/crossentropy": 2.178301692008972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2617819905281067, + "step": 4026 + }, + { + "epoch": 0.08056, + "grad_norm": 2.25, + "grad_norm_var": 0.08346354166666667, + "learning_rate": 0.0001, + "loss": 4.687, + "loss/crossentropy": 2.518654465675354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29282887279987335, + "step": 4028 + }, + { + "epoch": 0.0806, + "grad_norm": 2.25, + "grad_norm_var": 0.08859049479166667, + "learning_rate": 0.0001, + "loss": 4.4825, + "loss/crossentropy": 1.9181422591209412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23210199177265167, + "step": 4030 + }, + { + "epoch": 0.08064, + "grad_norm": 2.359375, + "grad_norm_var": 0.08816630045572917, + "learning_rate": 0.0001, + "loss": 4.6407, + "loss/crossentropy": 2.343222141265869, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28908614814281464, + "step": 4032 + }, + { + "epoch": 0.08068, + "grad_norm": 2.390625, + "grad_norm_var": 0.0302886962890625, + "learning_rate": 0.0001, + "loss": 4.6879, + "loss/crossentropy": 2.0816845893859863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2571762502193451, + "step": 4034 + }, + { + "epoch": 0.08072, + "grad_norm": 2.296875, + "grad_norm_var": 0.029878743489583335, + "learning_rate": 0.0001, + "loss": 4.3324, + "loss/crossentropy": 1.9679544568061829, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2316160574555397, + "step": 4036 + }, + { + "epoch": 0.08076, + "grad_norm": 2.328125, + "grad_norm_var": 0.029195149739583332, + "learning_rate": 0.0001, + "loss": 4.7335, + "loss/crossentropy": 2.1553682684898376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25725623965263367, + "step": 4038 + }, + { + "epoch": 0.0808, + "grad_norm": 2.421875, + "grad_norm_var": 0.004964192708333333, + "learning_rate": 0.0001, + "loss": 4.6892, + "loss/crossentropy": 2.0269790291786194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26298412680625916, + "step": 4040 + }, + { + "epoch": 0.08084, + "grad_norm": 2.484375, + "grad_norm_var": 0.0060943603515625, + "learning_rate": 0.0001, + "loss": 4.6686, + "loss/crossentropy": 1.984773874282837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24288517236709595, + "step": 4042 + }, + { + "epoch": 0.08088, + "grad_norm": 2.3125, + "grad_norm_var": 0.0051910400390625, + "learning_rate": 0.0001, + "loss": 4.9282, + "loss/crossentropy": 2.178356111049652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2684163451194763, + "step": 4044 + }, + { + "epoch": 0.08092, + "grad_norm": 2.46875, + "grad_norm_var": 0.0053670247395833336, + "learning_rate": 0.0001, + "loss": 4.8191, + "loss/crossentropy": 2.235984683036804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26377667486667633, + "step": 4046 + }, + { + "epoch": 0.08096, + "grad_norm": 2.34375, + "grad_norm_var": 0.005826822916666667, + "learning_rate": 0.0001, + "loss": 4.7026, + "loss/crossentropy": 2.085321545600891, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23403701931238174, + "step": 4048 + }, + { + "epoch": 0.081, + "grad_norm": 2.3125, + "grad_norm_var": 0.0052642822265625, + "learning_rate": 0.0001, + "loss": 4.9932, + "loss/crossentropy": 2.419228672981262, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27236658334732056, + "step": 4050 + }, + { + "epoch": 0.08104, + "grad_norm": 2.421875, + "grad_norm_var": 0.0054972330729166664, + "learning_rate": 0.0001, + "loss": 4.6105, + "loss/crossentropy": 2.153620958328247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28071053326129913, + "step": 4052 + }, + { + "epoch": 0.08108, + "grad_norm": 2.71875, + "grad_norm_var": 0.012743123372395833, + "learning_rate": 0.0001, + "loss": 4.8775, + "loss/crossentropy": 2.1466477513313293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27491800487041473, + "step": 4054 + }, + { + "epoch": 0.08112, + "grad_norm": 2.265625, + "grad_norm_var": 0.013932291666666667, + "learning_rate": 0.0001, + "loss": 4.3707, + "loss/crossentropy": 2.2020710706710815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25149518996477127, + "step": 4056 + }, + { + "epoch": 0.08116, + "grad_norm": 2.34375, + "grad_norm_var": 0.013623046875, + "learning_rate": 0.0001, + "loss": 4.8458, + "loss/crossentropy": 2.264205574989319, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27612583339214325, + "step": 4058 + }, + { + "epoch": 0.0812, + "grad_norm": 2.4375, + "grad_norm_var": 0.016434733072916666, + "learning_rate": 0.0001, + "loss": 4.8644, + "loss/crossentropy": 2.269905209541321, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2692303955554962, + "step": 4060 + }, + { + "epoch": 0.08124, + "grad_norm": 2.34375, + "grad_norm_var": 0.018505859375, + "learning_rate": 0.0001, + "loss": 4.5057, + "loss/crossentropy": 1.920631766319275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22550494968891144, + "step": 4062 + }, + { + "epoch": 0.08128, + "grad_norm": 2.53125, + "grad_norm_var": 0.019749959309895832, + "learning_rate": 0.0001, + "loss": 5.0796, + "loss/crossentropy": 2.307617664337158, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22719035297632217, + "step": 4064 + }, + { + "epoch": 0.08132, + "grad_norm": 2.375, + "grad_norm_var": 0.022102864583333333, + "learning_rate": 0.0001, + "loss": 4.6167, + "loss/crossentropy": 2.113444685935974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.240354023873806, + "step": 4066 + }, + { + "epoch": 0.08136, + "grad_norm": 2.375, + "grad_norm_var": 0.02232666015625, + "learning_rate": 0.0001, + "loss": 4.9152, + "loss/crossentropy": 2.4516230821609497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27590544521808624, + "step": 4068 + }, + { + "epoch": 0.0814, + "grad_norm": 2.34375, + "grad_norm_var": 0.08772379557291667, + "learning_rate": 0.0001, + "loss": 4.5976, + "loss/crossentropy": 1.8287339806556702, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22503511607646942, + "step": 4070 + }, + { + "epoch": 0.08144, + "grad_norm": 2.5, + "grad_norm_var": 0.08479715983072916, + "learning_rate": 0.0001, + "loss": 5.1623, + "loss/crossentropy": 2.3468997478485107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2773839682340622, + "step": 4072 + }, + { + "epoch": 0.08148, + "grad_norm": 2.296875, + "grad_norm_var": 0.08782145182291666, + "learning_rate": 0.0001, + "loss": 4.5413, + "loss/crossentropy": 2.1307512521743774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24247504770755768, + "step": 4074 + }, + { + "epoch": 0.08152, + "grad_norm": 2.21875, + "grad_norm_var": 0.08982645670572917, + "learning_rate": 0.0001, + "loss": 4.7447, + "loss/crossentropy": 2.248755097389221, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2696859538555145, + "step": 4076 + }, + { + "epoch": 0.08156, + "grad_norm": 2.265625, + "grad_norm_var": 0.09045817057291666, + "learning_rate": 0.0001, + "loss": 4.2948, + "loss/crossentropy": 2.0233980417251587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2370016872882843, + "step": 4078 + }, + { + "epoch": 0.0816, + "grad_norm": 2.234375, + "grad_norm_var": 0.0923828125, + "learning_rate": 0.0001, + "loss": 4.432, + "loss/crossentropy": 1.9536627531051636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23242933303117752, + "step": 4080 + }, + { + "epoch": 0.08164, + "grad_norm": 2.375, + "grad_norm_var": 0.0889801025390625, + "learning_rate": 0.0001, + "loss": 4.5037, + "loss/crossentropy": 1.9631904363632202, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2337196245789528, + "step": 4082 + }, + { + "epoch": 0.08168, + "grad_norm": 2.546875, + "grad_norm_var": 0.08935139973958334, + "learning_rate": 0.0001, + "loss": 4.7406, + "loss/crossentropy": 2.193789482116699, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2687358558177948, + "step": 4084 + }, + { + "epoch": 0.08172, + "grad_norm": 2.40625, + "grad_norm_var": 0.020210774739583333, + "learning_rate": 0.0001, + "loss": 4.6454, + "loss/crossentropy": 2.308240056037903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2679348289966583, + "step": 4086 + }, + { + "epoch": 0.08176, + "grad_norm": 2.359375, + "grad_norm_var": 0.021903483072916667, + "learning_rate": 0.0001, + "loss": 4.9126, + "loss/crossentropy": 2.343047261238098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3056950569152832, + "step": 4088 + }, + { + "epoch": 0.0818, + "grad_norm": 2.234375, + "grad_norm_var": 0.022526041666666666, + "learning_rate": 0.0001, + "loss": 4.7315, + "loss/crossentropy": 1.9583085179328918, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23911744356155396, + "step": 4090 + }, + { + "epoch": 0.08184, + "grad_norm": 2.296875, + "grad_norm_var": 0.021198527018229166, + "learning_rate": 0.0001, + "loss": 4.6559, + "loss/crossentropy": 2.341569185256958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26053962111473083, + "step": 4092 + }, + { + "epoch": 0.08188, + "grad_norm": 2.921875, + "grad_norm_var": 0.034601847330729164, + "learning_rate": 0.0001, + "loss": 4.5773, + "loss/crossentropy": 2.068669080734253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25753986835479736, + "step": 4094 + }, + { + "epoch": 0.08192, + "grad_norm": 2.625, + "grad_norm_var": 0.04309488932291667, + "learning_rate": 0.0001, + "loss": 5.0253, + "loss/crossentropy": 2.1461241841316223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2628704681992531, + "step": 4096 + }, + { + "epoch": 0.08196, + "grad_norm": 2.3125, + "grad_norm_var": 0.04810791015625, + "learning_rate": 0.0001, + "loss": 4.5587, + "loss/crossentropy": 2.0718055963516235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24923217296600342, + "step": 4098 + }, + { + "epoch": 0.082, + "grad_norm": 2.359375, + "grad_norm_var": 0.0503082275390625, + "learning_rate": 0.0001, + "loss": 4.379, + "loss/crossentropy": 1.9812004566192627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2316955253481865, + "step": 4100 + }, + { + "epoch": 0.08204, + "grad_norm": 2.3125, + "grad_norm_var": 0.0465240478515625, + "learning_rate": 0.0001, + "loss": 4.7909, + "loss/crossentropy": 2.2669100761413574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2806926518678665, + "step": 4102 + }, + { + "epoch": 0.08208, + "grad_norm": 2.375, + "grad_norm_var": 0.043196614583333334, + "learning_rate": 0.0001, + "loss": 4.7502, + "loss/crossentropy": 2.0620261430740356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2610047310590744, + "step": 4104 + }, + { + "epoch": 0.08212, + "grad_norm": 3.046875, + "grad_norm_var": 0.06297098795572917, + "learning_rate": 0.0001, + "loss": 4.6672, + "loss/crossentropy": 2.249971866607666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2783341556787491, + "step": 4106 + }, + { + "epoch": 0.08216, + "grad_norm": 2.328125, + "grad_norm_var": 0.06320699055989583, + "learning_rate": 0.0001, + "loss": 4.834, + "loss/crossentropy": 2.1064823865890503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27534550428390503, + "step": 4108 + }, + { + "epoch": 0.0822, + "grad_norm": 2.28125, + "grad_norm_var": 0.05406901041666667, + "learning_rate": 0.0001, + "loss": 4.3944, + "loss/crossentropy": 1.886509656906128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2315160632133484, + "step": 4110 + }, + { + "epoch": 0.08224, + "grad_norm": 2.40625, + "grad_norm_var": 0.039290364583333334, + "learning_rate": 0.0001, + "loss": 4.2969, + "loss/crossentropy": 1.6429635286331177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20593100041151047, + "step": 4112 + }, + { + "epoch": 0.08228, + "grad_norm": 2.40625, + "grad_norm_var": 0.0370025634765625, + "learning_rate": 0.0001, + "loss": 4.4581, + "loss/crossentropy": 2.3236618041992188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2674623131752014, + "step": 4114 + }, + { + "epoch": 0.08232, + "grad_norm": 2.578125, + "grad_norm_var": 0.03658447265625, + "learning_rate": 0.0001, + "loss": 4.9734, + "loss/crossentropy": 2.1479567885398865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25003983825445175, + "step": 4116 + }, + { + "epoch": 0.08236, + "grad_norm": 2.578125, + "grad_norm_var": 0.03611551920572917, + "learning_rate": 0.0001, + "loss": 5.0477, + "loss/crossentropy": 2.140569031238556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24322029948234558, + "step": 4118 + }, + { + "epoch": 0.0824, + "grad_norm": 2.328125, + "grad_norm_var": 0.03762613932291667, + "learning_rate": 0.0001, + "loss": 4.6061, + "loss/crossentropy": 2.126375436782837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27629759907722473, + "step": 4120 + }, + { + "epoch": 0.08244, + "grad_norm": 2.28125, + "grad_norm_var": 0.015843709309895832, + "learning_rate": 0.0001, + "loss": 4.9143, + "loss/crossentropy": 2.3699214458465576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2692546397447586, + "step": 4122 + }, + { + "epoch": 0.08248, + "grad_norm": 2.296875, + "grad_norm_var": 0.010904947916666666, + "learning_rate": 0.0001, + "loss": 4.5672, + "loss/crossentropy": 2.013331353664398, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24442073702812195, + "step": 4124 + }, + { + "epoch": 0.08252, + "grad_norm": 2.359375, + "grad_norm_var": 0.010152180989583334, + "learning_rate": 0.0001, + "loss": 4.4841, + "loss/crossentropy": 2.1869460344314575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.250150203704834, + "step": 4126 + }, + { + "epoch": 0.08256, + "grad_norm": 2.203125, + "grad_norm_var": 0.0127838134765625, + "learning_rate": 0.0001, + "loss": 4.4564, + "loss/crossentropy": 2.2725884914398193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2810261696577072, + "step": 4128 + }, + { + "epoch": 0.0826, + "grad_norm": 2.328125, + "grad_norm_var": 0.013688151041666667, + "learning_rate": 0.0001, + "loss": 4.7311, + "loss/crossentropy": 1.9190022945404053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2693525403738022, + "step": 4130 + }, + { + "epoch": 0.08264, + "grad_norm": 2.65625, + "grad_norm_var": 0.016112263997395834, + "learning_rate": 0.0001, + "loss": 4.7967, + "loss/crossentropy": 2.5477795600891113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27040669322013855, + "step": 4132 + }, + { + "epoch": 0.08268, + "grad_norm": 2.40625, + "grad_norm_var": 0.0113677978515625, + "learning_rate": 0.0001, + "loss": 4.7617, + "loss/crossentropy": 2.231198728084564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28101250529289246, + "step": 4134 + }, + { + "epoch": 0.08272, + "grad_norm": 2.296875, + "grad_norm_var": 0.011693318684895834, + "learning_rate": 0.0001, + "loss": 4.6334, + "loss/crossentropy": 2.17776882648468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24743208289146423, + "step": 4136 + }, + { + "epoch": 0.08276, + "grad_norm": 2.46875, + "grad_norm_var": 0.011799112955729166, + "learning_rate": 0.0001, + "loss": 5.0233, + "loss/crossentropy": 2.418373703956604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2954525351524353, + "step": 4138 + }, + { + "epoch": 0.0828, + "grad_norm": 2.3125, + "grad_norm_var": 0.010969034830729167, + "learning_rate": 0.0001, + "loss": 4.5564, + "loss/crossentropy": 2.054605543613434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23863950371742249, + "step": 4140 + }, + { + "epoch": 0.08284, + "grad_norm": 2.5, + "grad_norm_var": 0.011872355143229167, + "learning_rate": 0.0001, + "loss": 4.8983, + "loss/crossentropy": 2.054013967514038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29851172864437103, + "step": 4142 + }, + { + "epoch": 0.08288, + "grad_norm": 2.46875, + "grad_norm_var": 0.008610026041666666, + "learning_rate": 0.0001, + "loss": 4.7425, + "loss/crossentropy": 2.193961024284363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2563214898109436, + "step": 4144 + }, + { + "epoch": 0.08292, + "grad_norm": 2.40625, + "grad_norm_var": 0.008382161458333334, + "learning_rate": 0.0001, + "loss": 4.7995, + "loss/crossentropy": 2.460008382797241, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26705074310302734, + "step": 4146 + }, + { + "epoch": 0.08296, + "grad_norm": 2.3125, + "grad_norm_var": 0.005060831705729167, + "learning_rate": 0.0001, + "loss": 4.894, + "loss/crossentropy": 2.508321523666382, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26217761635780334, + "step": 4148 + }, + { + "epoch": 0.083, + "grad_norm": 2.328125, + "grad_norm_var": 0.0065582275390625, + "learning_rate": 0.0001, + "loss": 4.6103, + "loss/crossentropy": 1.8445284366607666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23396535962820053, + "step": 4150 + }, + { + "epoch": 0.08304, + "grad_norm": 2.375, + "grad_norm_var": 0.005952962239583333, + "learning_rate": 0.0001, + "loss": 4.8048, + "loss/crossentropy": 2.433600902557373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2825329154729843, + "step": 4152 + }, + { + "epoch": 0.08308, + "grad_norm": 2.421875, + "grad_norm_var": 0.0053212483723958336, + "learning_rate": 0.0001, + "loss": 4.8632, + "loss/crossentropy": 2.386221170425415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2941686511039734, + "step": 4154 + }, + { + "epoch": 0.08312, + "grad_norm": 2.421875, + "grad_norm_var": 0.0052154541015625, + "learning_rate": 0.0001, + "loss": 4.7486, + "loss/crossentropy": 1.9578949809074402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24851053953170776, + "step": 4156 + }, + { + "epoch": 0.08316, + "grad_norm": 2.3125, + "grad_norm_var": 0.004150390625, + "learning_rate": 0.0001, + "loss": 4.6443, + "loss/crossentropy": 2.0034408569335938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25524984300136566, + "step": 4158 + }, + { + "epoch": 0.0832, + "grad_norm": 2.28125, + "grad_norm_var": 0.0069488525390625, + "learning_rate": 0.0001, + "loss": 4.6856, + "loss/crossentropy": 2.3342589139938354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31023962795734406, + "step": 4160 + }, + { + "epoch": 0.08324, + "grad_norm": 2.46875, + "grad_norm_var": 0.042867024739583336, + "learning_rate": 0.0001, + "loss": 4.6762, + "loss/crossentropy": 2.3941839933395386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2641746401786804, + "step": 4162 + }, + { + "epoch": 0.08328, + "grad_norm": 2.453125, + "grad_norm_var": 0.04168294270833333, + "learning_rate": 0.0001, + "loss": 4.7149, + "loss/crossentropy": 2.371219038963318, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24197939038276672, + "step": 4164 + }, + { + "epoch": 0.08332, + "grad_norm": 2.734375, + "grad_norm_var": 0.04442952473958333, + "learning_rate": 0.0001, + "loss": 4.7949, + "loss/crossentropy": 2.133378028869629, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25449906289577484, + "step": 4166 + }, + { + "epoch": 0.08336, + "grad_norm": 2.4375, + "grad_norm_var": 0.04496968587239583, + "learning_rate": 0.0001, + "loss": 4.5974, + "loss/crossentropy": 1.7460412979125977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2157151699066162, + "step": 4168 + }, + { + "epoch": 0.0834, + "grad_norm": 2.609375, + "grad_norm_var": 0.046873982747395834, + "learning_rate": 0.0001, + "loss": 4.7234, + "loss/crossentropy": 2.215083122253418, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2823774367570877, + "step": 4170 + }, + { + "epoch": 0.08344, + "grad_norm": 2.453125, + "grad_norm_var": 0.04820048014322917, + "learning_rate": 0.0001, + "loss": 4.5189, + "loss/crossentropy": 2.0528377890586853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2551003098487854, + "step": 4172 + }, + { + "epoch": 0.08348, + "grad_norm": 2.359375, + "grad_norm_var": 0.05054423014322917, + "learning_rate": 0.0001, + "loss": 4.4008, + "loss/crossentropy": 1.7953855395317078, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23185917735099792, + "step": 4174 + }, + { + "epoch": 0.08352, + "grad_norm": 2.34375, + "grad_norm_var": 0.058934529622395836, + "learning_rate": 0.0001, + "loss": 4.4879, + "loss/crossentropy": 2.0794734954833984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23386041820049286, + "step": 4176 + }, + { + "epoch": 0.08356, + "grad_norm": 2.234375, + "grad_norm_var": 0.031086222330729166, + "learning_rate": 0.0001, + "loss": 4.3802, + "loss/crossentropy": 2.1685845851898193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26074862480163574, + "step": 4178 + }, + { + "epoch": 0.0836, + "grad_norm": 2.484375, + "grad_norm_var": 0.031412760416666664, + "learning_rate": 0.0001, + "loss": 4.5507, + "loss/crossentropy": 2.1495825052261353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2689145505428314, + "step": 4180 + }, + { + "epoch": 0.08364, + "grad_norm": 2.328125, + "grad_norm_var": 0.024967447916666666, + "learning_rate": 0.0001, + "loss": 4.5258, + "loss/crossentropy": 2.043331503868103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2498578578233719, + "step": 4182 + }, + { + "epoch": 0.08368, + "grad_norm": 2.734375, + "grad_norm_var": 0.07787984212239583, + "learning_rate": 0.0001, + "loss": 5.0198, + "loss/crossentropy": 2.04026997089386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2873340845108032, + "step": 4184 + }, + { + "epoch": 0.08372, + "grad_norm": 2.515625, + "grad_norm_var": 0.09128316243489583, + "learning_rate": 0.0001, + "loss": 4.9537, + "loss/crossentropy": 2.4653968811035156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2911294251680374, + "step": 4186 + }, + { + "epoch": 0.08376, + "grad_norm": 2.421875, + "grad_norm_var": 0.09215494791666666, + "learning_rate": 0.0001, + "loss": 4.6589, + "loss/crossentropy": 2.2960848808288574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23984040319919586, + "step": 4188 + }, + { + "epoch": 0.0838, + "grad_norm": 2.515625, + "grad_norm_var": 0.09599202473958333, + "learning_rate": 0.0001, + "loss": 4.4324, + "loss/crossentropy": 2.011807084083557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2523125037550926, + "step": 4190 + }, + { + "epoch": 0.08384, + "grad_norm": 2.421875, + "grad_norm_var": 0.0886871337890625, + "learning_rate": 0.0001, + "loss": 4.8437, + "loss/crossentropy": 2.0016889572143555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2291206791996956, + "step": 4192 + }, + { + "epoch": 0.08388, + "grad_norm": 2.1875, + "grad_norm_var": 0.09378255208333333, + "learning_rate": 0.0001, + "loss": 4.3604, + "loss/crossentropy": 1.97197824716568, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2595779076218605, + "step": 4194 + }, + { + "epoch": 0.08392, + "grad_norm": 2.34375, + "grad_norm_var": 0.09763895670572917, + "learning_rate": 0.0001, + "loss": 4.5823, + "loss/crossentropy": 2.2910103797912598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24870596826076508, + "step": 4196 + }, + { + "epoch": 0.08396, + "grad_norm": 2.234375, + "grad_norm_var": 0.1001617431640625, + "learning_rate": 0.0001, + "loss": 4.562, + "loss/crossentropy": 2.1453208923339844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25001347810029984, + "step": 4198 + }, + { + "epoch": 0.084, + "grad_norm": 2.359375, + "grad_norm_var": 0.0398834228515625, + "learning_rate": 0.0001, + "loss": 4.8835, + "loss/crossentropy": 2.1935043334960938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26807525753974915, + "step": 4200 + }, + { + "epoch": 0.08404, + "grad_norm": 2.3125, + "grad_norm_var": 0.009765625, + "learning_rate": 0.0001, + "loss": 4.5912, + "loss/crossentropy": 2.039341926574707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2460380420088768, + "step": 4202 + }, + { + "epoch": 0.08408, + "grad_norm": 3.09375, + "grad_norm_var": 0.0478179931640625, + "learning_rate": 0.0001, + "loss": 4.8243, + "loss/crossentropy": 2.4660122394561768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3042101263999939, + "step": 4204 + }, + { + "epoch": 0.08412, + "grad_norm": 2.453125, + "grad_norm_var": 0.08088785807291667, + "learning_rate": 0.0001, + "loss": 4.8635, + "loss/crossentropy": 1.9346272349357605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25924334675073624, + "step": 4206 + }, + { + "epoch": 0.08416, + "grad_norm": 2.1875, + "grad_norm_var": 0.08311258951822917, + "learning_rate": 0.0001, + "loss": 4.5152, + "loss/crossentropy": 2.0120063424110413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24327433109283447, + "step": 4208 + }, + { + "epoch": 0.0842, + "grad_norm": 2.703125, + "grad_norm_var": 0.0852935791015625, + "learning_rate": 0.0001, + "loss": 4.6148, + "loss/crossentropy": 2.2359931468963623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25374244898557663, + "step": 4210 + }, + { + "epoch": 0.08424, + "grad_norm": 2.53125, + "grad_norm_var": 0.08185221354166666, + "learning_rate": 0.0001, + "loss": 4.5751, + "loss/crossentropy": 2.0038134455680847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22315364331007004, + "step": 4212 + }, + { + "epoch": 0.08428, + "grad_norm": 2.296875, + "grad_norm_var": 0.08567301432291667, + "learning_rate": 0.0001, + "loss": 4.721, + "loss/crossentropy": 2.2041471004486084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2655494213104248, + "step": 4214 + }, + { + "epoch": 0.08432, + "grad_norm": 2.390625, + "grad_norm_var": 0.08399149576822916, + "learning_rate": 0.0001, + "loss": 4.8091, + "loss/crossentropy": 2.344551682472229, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2743126451969147, + "step": 4216 + }, + { + "epoch": 0.08436, + "grad_norm": 2.453125, + "grad_norm_var": 0.08025614420572917, + "learning_rate": 0.0001, + "loss": 4.7162, + "loss/crossentropy": 1.9694250226020813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24425261467695236, + "step": 4218 + }, + { + "epoch": 0.0844, + "grad_norm": 2.40625, + "grad_norm_var": 0.05671284993489583, + "learning_rate": 0.0001, + "loss": 4.8526, + "loss/crossentropy": 2.164921760559082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.272469699382782, + "step": 4220 + }, + { + "epoch": 0.08444, + "grad_norm": 2.484375, + "grad_norm_var": 0.02642822265625, + "learning_rate": 0.0001, + "loss": 4.5513, + "loss/crossentropy": 1.944575309753418, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23848393559455872, + "step": 4222 + }, + { + "epoch": 0.08448, + "grad_norm": 2.359375, + "grad_norm_var": 0.025386555989583334, + "learning_rate": 0.0001, + "loss": 4.7416, + "loss/crossentropy": 2.278227686882019, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2558315545320511, + "step": 4224 + }, + { + "epoch": 0.08452, + "grad_norm": 2.453125, + "grad_norm_var": 0.01802978515625, + "learning_rate": 0.0001, + "loss": 4.7318, + "loss/crossentropy": 2.035117268562317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2585935667157173, + "step": 4226 + }, + { + "epoch": 0.08456, + "grad_norm": 2.234375, + "grad_norm_var": 0.020466105143229166, + "learning_rate": 0.0001, + "loss": 4.5674, + "loss/crossentropy": 2.0172035694122314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23832131922245026, + "step": 4228 + }, + { + "epoch": 0.0846, + "grad_norm": 2.484375, + "grad_norm_var": 0.011165364583333334, + "learning_rate": 0.0001, + "loss": 4.8113, + "loss/crossentropy": 2.0574535727500916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23829226195812225, + "step": 4230 + }, + { + "epoch": 0.08464, + "grad_norm": 2.3125, + "grad_norm_var": 0.010773722330729167, + "learning_rate": 0.0001, + "loss": 4.6776, + "loss/crossentropy": 2.5003366470336914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27905476093292236, + "step": 4232 + }, + { + "epoch": 0.08468, + "grad_norm": 2.578125, + "grad_norm_var": 0.012043253580729166, + "learning_rate": 0.0001, + "loss": 4.9137, + "loss/crossentropy": 2.207367777824402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27426937222480774, + "step": 4234 + }, + { + "epoch": 0.08472, + "grad_norm": 2.1875, + "grad_norm_var": 0.01627197265625, + "learning_rate": 0.0001, + "loss": 4.716, + "loss/crossentropy": 2.240189790725708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26208513230085373, + "step": 4236 + }, + { + "epoch": 0.08476, + "grad_norm": 2.3125, + "grad_norm_var": 0.016600545247395834, + "learning_rate": 0.0001, + "loss": 4.4569, + "loss/crossentropy": 2.1357412338256836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24108420312404633, + "step": 4238 + }, + { + "epoch": 0.0848, + "grad_norm": 2.296875, + "grad_norm_var": 0.013004557291666666, + "learning_rate": 0.0001, + "loss": 4.7249, + "loss/crossentropy": 2.1073816418647766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25213342159986496, + "step": 4240 + }, + { + "epoch": 0.08484, + "grad_norm": 2.40625, + "grad_norm_var": 0.013102213541666666, + "learning_rate": 0.0001, + "loss": 4.9558, + "loss/crossentropy": 2.158124566078186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2505848854780197, + "step": 4242 + }, + { + "epoch": 0.08488, + "grad_norm": 2.328125, + "grad_norm_var": 0.011555989583333334, + "learning_rate": 0.0001, + "loss": 4.7743, + "loss/crossentropy": 2.253539562225342, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2675466388463974, + "step": 4244 + }, + { + "epoch": 0.08492, + "grad_norm": 2.5, + "grad_norm_var": 0.01396484375, + "learning_rate": 0.0001, + "loss": 4.272, + "loss/crossentropy": 1.7170023918151855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20157357305288315, + "step": 4246 + }, + { + "epoch": 0.08496, + "grad_norm": 2.40625, + "grad_norm_var": 0.0188385009765625, + "learning_rate": 0.0001, + "loss": 4.4479, + "loss/crossentropy": 2.082640767097473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2600134015083313, + "step": 4248 + }, + { + "epoch": 0.085, + "grad_norm": 2.34375, + "grad_norm_var": 0.0165679931640625, + "learning_rate": 0.0001, + "loss": 4.4176, + "loss/crossentropy": 2.044301390647888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23220707476139069, + "step": 4250 + }, + { + "epoch": 0.08504, + "grad_norm": 2.453125, + "grad_norm_var": 0.0168121337890625, + "learning_rate": 0.0001, + "loss": 4.648, + "loss/crossentropy": 2.293405532836914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28027066588401794, + "step": 4252 + }, + { + "epoch": 0.08508, + "grad_norm": 2.265625, + "grad_norm_var": 0.016649373372395835, + "learning_rate": 0.0001, + "loss": 4.55, + "loss/crossentropy": 2.2604206800460815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25545646995306015, + "step": 4254 + }, + { + "epoch": 0.08512, + "grad_norm": 2.65625, + "grad_norm_var": 0.023949178059895833, + "learning_rate": 0.0001, + "loss": 4.6258, + "loss/crossentropy": 2.118361234664917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24799348413944244, + "step": 4256 + }, + { + "epoch": 0.08516, + "grad_norm": 2.390625, + "grad_norm_var": 0.022847493489583332, + "learning_rate": 0.0001, + "loss": 4.6751, + "loss/crossentropy": 1.9369969964027405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24676478654146194, + "step": 4258 + }, + { + "epoch": 0.0852, + "grad_norm": 2.40625, + "grad_norm_var": 0.021728515625, + "learning_rate": 0.0001, + "loss": 4.5197, + "loss/crossentropy": 2.075170874595642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21742676943540573, + "step": 4260 + }, + { + "epoch": 0.08524, + "grad_norm": 2.296875, + "grad_norm_var": 0.018973795572916667, + "learning_rate": 0.0001, + "loss": 4.4112, + "loss/crossentropy": 2.056099236011505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2490842342376709, + "step": 4262 + }, + { + "epoch": 0.08528, + "grad_norm": 2.328125, + "grad_norm_var": 0.014436848958333333, + "learning_rate": 0.0001, + "loss": 4.6169, + "loss/crossentropy": 2.2279993891716003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24964337795972824, + "step": 4264 + }, + { + "epoch": 0.08532, + "grad_norm": 2.375, + "grad_norm_var": 0.011393229166666666, + "learning_rate": 0.0001, + "loss": 4.6686, + "loss/crossentropy": 2.1645933389663696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24576786905527115, + "step": 4266 + }, + { + "epoch": 0.08536, + "grad_norm": 2.25, + "grad_norm_var": 0.010887654622395833, + "learning_rate": 0.0001, + "loss": 4.4458, + "loss/crossentropy": 1.9033920764923096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23393237590789795, + "step": 4268 + }, + { + "epoch": 0.0854, + "grad_norm": 2.46875, + "grad_norm_var": 0.011865234375, + "learning_rate": 0.0001, + "loss": 4.4022, + "loss/crossentropy": 2.153634190559387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2496839165687561, + "step": 4270 + }, + { + "epoch": 0.08544, + "grad_norm": 2.28125, + "grad_norm_var": 0.0045206705729166664, + "learning_rate": 0.0001, + "loss": 4.4781, + "loss/crossentropy": 1.9188589453697205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23544569313526154, + "step": 4272 + }, + { + "epoch": 0.08548, + "grad_norm": 2.328125, + "grad_norm_var": 0.004264322916666666, + "learning_rate": 0.0001, + "loss": 4.704, + "loss/crossentropy": 2.4337977170944214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2951700836420059, + "step": 4274 + }, + { + "epoch": 0.08552, + "grad_norm": 2.359375, + "grad_norm_var": 0.003902180989583333, + "learning_rate": 0.0001, + "loss": 4.7051, + "loss/crossentropy": 1.9108383059501648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24459081888198853, + "step": 4276 + }, + { + "epoch": 0.08556, + "grad_norm": 2.375, + "grad_norm_var": 0.003123982747395833, + "learning_rate": 0.0001, + "loss": 4.3751, + "loss/crossentropy": 1.6632736921310425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22495487332344055, + "step": 4278 + }, + { + "epoch": 0.0856, + "grad_norm": 2.296875, + "grad_norm_var": 0.021312459309895834, + "learning_rate": 0.0001, + "loss": 4.8144, + "loss/crossentropy": 2.519997477531433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2657178193330765, + "step": 4280 + }, + { + "epoch": 0.08564, + "grad_norm": 2.640625, + "grad_norm_var": 0.025275675455729167, + "learning_rate": 0.0001, + "loss": 4.6837, + "loss/crossentropy": 2.150822162628174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30192580074071884, + "step": 4282 + }, + { + "epoch": 0.08568, + "grad_norm": 2.421875, + "grad_norm_var": 0.023763020833333332, + "learning_rate": 0.0001, + "loss": 4.7411, + "loss/crossentropy": 1.9970062971115112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24695321917533875, + "step": 4284 + }, + { + "epoch": 0.08572, + "grad_norm": 2.25, + "grad_norm_var": 0.024372355143229166, + "learning_rate": 0.0001, + "loss": 4.5361, + "loss/crossentropy": 2.3136098384857178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25082121044397354, + "step": 4286 + }, + { + "epoch": 0.08576, + "grad_norm": 2.5625, + "grad_norm_var": 0.025992838541666667, + "learning_rate": 0.0001, + "loss": 4.9171, + "loss/crossentropy": 2.112035870552063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27019062638282776, + "step": 4288 + }, + { + "epoch": 0.0858, + "grad_norm": 2.328125, + "grad_norm_var": 0.025992838541666667, + "learning_rate": 0.0001, + "loss": 4.4985, + "loss/crossentropy": 2.068653643131256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24297921359539032, + "step": 4290 + }, + { + "epoch": 0.08584, + "grad_norm": 2.375, + "grad_norm_var": 0.025520833333333333, + "learning_rate": 0.0001, + "loss": 4.5182, + "loss/crossentropy": 1.9013578295707703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23058265447616577, + "step": 4292 + }, + { + "epoch": 0.08588, + "grad_norm": 2.15625, + "grad_norm_var": 0.0294830322265625, + "learning_rate": 0.0001, + "loss": 4.6825, + "loss/crossentropy": 2.149984359741211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25223904848098755, + "step": 4294 + }, + { + "epoch": 0.08592, + "grad_norm": 2.328125, + "grad_norm_var": 0.014469401041666666, + "learning_rate": 0.0001, + "loss": 4.4109, + "loss/crossentropy": 1.894010066986084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2323223054409027, + "step": 4296 + }, + { + "epoch": 0.08596, + "grad_norm": 2.421875, + "grad_norm_var": 0.010123697916666667, + "learning_rate": 0.0001, + "loss": 4.7653, + "loss/crossentropy": 2.3351621627807617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27194930613040924, + "step": 4298 + }, + { + "epoch": 0.086, + "grad_norm": 2.328125, + "grad_norm_var": 0.0097320556640625, + "learning_rate": 0.0001, + "loss": 4.741, + "loss/crossentropy": 2.224352180957794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24884501099586487, + "step": 4300 + }, + { + "epoch": 0.08604, + "grad_norm": 2.421875, + "grad_norm_var": 0.008854166666666666, + "learning_rate": 0.0001, + "loss": 4.6592, + "loss/crossentropy": 1.908318042755127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24747492372989655, + "step": 4302 + }, + { + "epoch": 0.08608, + "grad_norm": 2.296875, + "grad_norm_var": 0.00758056640625, + "learning_rate": 0.0001, + "loss": 4.8566, + "loss/crossentropy": 2.1990396976470947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27217794954776764, + "step": 4304 + }, + { + "epoch": 0.08612, + "grad_norm": 2.390625, + "grad_norm_var": 0.0098297119140625, + "learning_rate": 0.0001, + "loss": 4.6432, + "loss/crossentropy": 2.3146010637283325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26875850558280945, + "step": 4306 + }, + { + "epoch": 0.08616, + "grad_norm": 2.46875, + "grad_norm_var": 0.015208943684895834, + "learning_rate": 0.0001, + "loss": 4.8254, + "loss/crossentropy": 2.2507941722869873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27052539587020874, + "step": 4308 + }, + { + "epoch": 0.0862, + "grad_norm": 2.234375, + "grad_norm_var": 0.013199869791666667, + "learning_rate": 0.0001, + "loss": 4.4067, + "loss/crossentropy": 1.9077125787734985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22987178713083267, + "step": 4310 + }, + { + "epoch": 0.08624, + "grad_norm": 2.515625, + "grad_norm_var": 0.01353759765625, + "learning_rate": 0.0001, + "loss": 4.4822, + "loss/crossentropy": 1.951395332813263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2392890453338623, + "step": 4312 + }, + { + "epoch": 0.08628, + "grad_norm": 2.53125, + "grad_norm_var": 0.033854166666666664, + "learning_rate": 0.0001, + "loss": 4.5371, + "loss/crossentropy": 1.9426860213279724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24374966323375702, + "step": 4314 + }, + { + "epoch": 0.08632, + "grad_norm": 3.390625, + "grad_norm_var": 0.09062398274739583, + "learning_rate": 0.0001, + "loss": 5.253, + "loss/crossentropy": 2.2508288621902466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3862452507019043, + "step": 4316 + }, + { + "epoch": 0.08636, + "grad_norm": 2.359375, + "grad_norm_var": 0.09058329264322916, + "learning_rate": 0.0001, + "loss": 4.5288, + "loss/crossentropy": 2.1161463260650635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2557579278945923, + "step": 4318 + }, + { + "epoch": 0.0864, + "grad_norm": 2.3125, + "grad_norm_var": 0.09374593098958334, + "learning_rate": 0.0001, + "loss": 5.1146, + "loss/crossentropy": 2.2570544481277466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31447017192840576, + "step": 4320 + }, + { + "epoch": 0.08644, + "grad_norm": 2.328125, + "grad_norm_var": 0.08740234375, + "learning_rate": 0.0001, + "loss": 4.9724, + "loss/crossentropy": 2.3211100101470947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26107798516750336, + "step": 4322 + }, + { + "epoch": 0.08648, + "grad_norm": 2.25, + "grad_norm_var": 0.09231669108072917, + "learning_rate": 0.0001, + "loss": 4.5236, + "loss/crossentropy": 2.1451058387756348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23633568733930588, + "step": 4324 + }, + { + "epoch": 0.08652, + "grad_norm": 2.21875, + "grad_norm_var": 0.09463602701822917, + "learning_rate": 0.0001, + "loss": 4.5828, + "loss/crossentropy": 1.9880141615867615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23688867688179016, + "step": 4326 + }, + { + "epoch": 0.08656, + "grad_norm": 2.296875, + "grad_norm_var": 0.09724934895833333, + "learning_rate": 0.0001, + "loss": 4.7098, + "loss/crossentropy": 2.021056890487671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2543798238039017, + "step": 4328 + }, + { + "epoch": 0.0866, + "grad_norm": 2.359375, + "grad_norm_var": 0.0814453125, + "learning_rate": 0.0001, + "loss": 4.6439, + "loss/crossentropy": 2.1323755979537964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27410852909088135, + "step": 4330 + }, + { + "epoch": 0.08664, + "grad_norm": 2.296875, + "grad_norm_var": 0.014940388997395833, + "learning_rate": 0.0001, + "loss": 4.6945, + "loss/crossentropy": 1.9674875736236572, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24818265438079834, + "step": 4332 + }, + { + "epoch": 0.08668, + "grad_norm": 2.25, + "grad_norm_var": 0.01490478515625, + "learning_rate": 0.0001, + "loss": 4.1911, + "loss/crossentropy": 2.0466583967208862, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2567761391401291, + "step": 4334 + }, + { + "epoch": 0.08672, + "grad_norm": 2.375, + "grad_norm_var": 0.004423014322916667, + "learning_rate": 0.0001, + "loss": 4.5937, + "loss/crossentropy": 2.138857126235962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26430967450141907, + "step": 4336 + }, + { + "epoch": 0.08676, + "grad_norm": 2.390625, + "grad_norm_var": 0.003902180989583333, + "learning_rate": 0.0001, + "loss": 4.7168, + "loss/crossentropy": 2.164841413497925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24389629065990448, + "step": 4338 + }, + { + "epoch": 0.0868, + "grad_norm": 2.4375, + "grad_norm_var": 0.005322265625, + "learning_rate": 0.0001, + "loss": 4.6017, + "loss/crossentropy": 2.2220189571380615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24361558258533478, + "step": 4340 + }, + { + "epoch": 0.08684, + "grad_norm": 2.390625, + "grad_norm_var": 0.004792277018229167, + "learning_rate": 0.0001, + "loss": 4.3088, + "loss/crossentropy": 1.7106285095214844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21558403968811035, + "step": 4342 + }, + { + "epoch": 0.08688, + "grad_norm": 2.203125, + "grad_norm_var": 0.0053670247395833336, + "learning_rate": 0.0001, + "loss": 4.1647, + "loss/crossentropy": 1.9200173020362854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2502904310822487, + "step": 4344 + }, + { + "epoch": 0.08692, + "grad_norm": 2.828125, + "grad_norm_var": 0.020796712239583334, + "learning_rate": 0.0001, + "loss": 4.6817, + "loss/crossentropy": 1.883722960948944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24818243086338043, + "step": 4346 + }, + { + "epoch": 0.08696, + "grad_norm": 2.484375, + "grad_norm_var": 0.026146443684895833, + "learning_rate": 0.0001, + "loss": 4.7509, + "loss/crossentropy": 2.2069878578186035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2518744319677353, + "step": 4348 + }, + { + "epoch": 0.087, + "grad_norm": 2.484375, + "grad_norm_var": 0.026656087239583334, + "learning_rate": 0.0001, + "loss": 4.7235, + "loss/crossentropy": 2.2158325910568237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2551049590110779, + "step": 4350 + }, + { + "epoch": 0.08704, + "grad_norm": 2.359375, + "grad_norm_var": 0.02730712890625, + "learning_rate": 0.0001, + "loss": 4.8406, + "loss/crossentropy": 2.0580105781555176, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2742728739976883, + "step": 4352 + }, + { + "epoch": 0.08708, + "grad_norm": 2.34375, + "grad_norm_var": 0.02906494140625, + "learning_rate": 0.0001, + "loss": 4.5267, + "loss/crossentropy": 2.190276265144348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.274506613612175, + "step": 4354 + }, + { + "epoch": 0.08712, + "grad_norm": 2.3125, + "grad_norm_var": 0.027242024739583332, + "learning_rate": 0.0001, + "loss": 4.7499, + "loss/crossentropy": 2.2595328092575073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25295622646808624, + "step": 4356 + }, + { + "epoch": 0.08716, + "grad_norm": 2.4375, + "grad_norm_var": 0.027391560872395835, + "learning_rate": 0.0001, + "loss": 5.1247, + "loss/crossentropy": 2.322342872619629, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2490948587656021, + "step": 4358 + }, + { + "epoch": 0.0872, + "grad_norm": 2.328125, + "grad_norm_var": 0.024665323893229167, + "learning_rate": 0.0001, + "loss": 4.7045, + "loss/crossentropy": 2.108223795890808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25404803454875946, + "step": 4360 + }, + { + "epoch": 0.08724, + "grad_norm": 2.3125, + "grad_norm_var": 0.013825480143229167, + "learning_rate": 0.0001, + "loss": 4.5332, + "loss/crossentropy": 2.1363136768341064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2679157853126526, + "step": 4362 + }, + { + "epoch": 0.08728, + "grad_norm": 2.390625, + "grad_norm_var": 0.0076171875, + "learning_rate": 0.0001, + "loss": 4.6418, + "loss/crossentropy": 2.3207738399505615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2725762128829956, + "step": 4364 + }, + { + "epoch": 0.08732, + "grad_norm": 2.40625, + "grad_norm_var": 0.004813639322916666, + "learning_rate": 0.0001, + "loss": 4.7431, + "loss/crossentropy": 2.3179128170013428, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27275949716567993, + "step": 4366 + }, + { + "epoch": 0.08736, + "grad_norm": 2.328125, + "grad_norm_var": 0.005182902018229167, + "learning_rate": 0.0001, + "loss": 4.7355, + "loss/crossentropy": 2.2130206823349, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2787477523088455, + "step": 4368 + }, + { + "epoch": 0.0874, + "grad_norm": 2.40625, + "grad_norm_var": 0.004548136393229167, + "learning_rate": 0.0001, + "loss": 4.6193, + "loss/crossentropy": 2.3350926637649536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26483266800642014, + "step": 4370 + }, + { + "epoch": 0.08744, + "grad_norm": 2.4375, + "grad_norm_var": 0.005736287434895833, + "learning_rate": 0.0001, + "loss": 4.7329, + "loss/crossentropy": 1.9162638187408447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23748356848955154, + "step": 4372 + }, + { + "epoch": 0.08748, + "grad_norm": 2.484375, + "grad_norm_var": 0.006322224934895833, + "learning_rate": 0.0001, + "loss": 4.7045, + "loss/crossentropy": 2.2708429098129272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2709425985813141, + "step": 4374 + }, + { + "epoch": 0.08752, + "grad_norm": 2.234375, + "grad_norm_var": 0.011604817708333333, + "learning_rate": 0.0001, + "loss": 4.3481, + "loss/crossentropy": 1.7216318845748901, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22498781234025955, + "step": 4376 + }, + { + "epoch": 0.08756, + "grad_norm": 2.328125, + "grad_norm_var": 0.011937459309895834, + "learning_rate": 0.0001, + "loss": 4.4261, + "loss/crossentropy": 2.144331693649292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25400668382644653, + "step": 4378 + }, + { + "epoch": 0.0876, + "grad_norm": 2.328125, + "grad_norm_var": 0.011750284830729167, + "learning_rate": 0.0001, + "loss": 4.5617, + "loss/crossentropy": 2.305369734764099, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29137127101421356, + "step": 4380 + }, + { + "epoch": 0.08764, + "grad_norm": 2.65625, + "grad_norm_var": 0.018115234375, + "learning_rate": 0.0001, + "loss": 4.6861, + "loss/crossentropy": 2.1156765818595886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24203064292669296, + "step": 4382 + }, + { + "epoch": 0.08768, + "grad_norm": 2.28125, + "grad_norm_var": 0.016630045572916665, + "learning_rate": 0.0001, + "loss": 4.4544, + "loss/crossentropy": 1.9081769585609436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22396781295537949, + "step": 4384 + }, + { + "epoch": 0.08772, + "grad_norm": 2.265625, + "grad_norm_var": 0.017769368489583333, + "learning_rate": 0.0001, + "loss": 4.704, + "loss/crossentropy": 2.0938609838485718, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27330371737480164, + "step": 4386 + }, + { + "epoch": 0.08776, + "grad_norm": 2.34375, + "grad_norm_var": 0.017748006184895835, + "learning_rate": 0.0001, + "loss": 4.1179, + "loss/crossentropy": 1.9685207605361938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2287725731730461, + "step": 4388 + }, + { + "epoch": 0.0878, + "grad_norm": 2.3125, + "grad_norm_var": 0.016109212239583334, + "learning_rate": 0.0001, + "loss": 4.5233, + "loss/crossentropy": 1.7536925673484802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2345883920788765, + "step": 4390 + }, + { + "epoch": 0.08784, + "grad_norm": 2.359375, + "grad_norm_var": 0.029150390625, + "learning_rate": 0.0001, + "loss": 4.3897, + "loss/crossentropy": 2.145567834377289, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26126645505428314, + "step": 4392 + }, + { + "epoch": 0.08788, + "grad_norm": 2.390625, + "grad_norm_var": 0.028348795572916665, + "learning_rate": 0.0001, + "loss": 4.4677, + "loss/crossentropy": 2.2211687564849854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3332909345626831, + "step": 4394 + }, + { + "epoch": 0.08792, + "grad_norm": 2.375, + "grad_norm_var": 0.028539021809895832, + "learning_rate": 0.0001, + "loss": 4.5545, + "loss/crossentropy": 1.919084072113037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2480143904685974, + "step": 4396 + }, + { + "epoch": 0.08796, + "grad_norm": 2.421875, + "grad_norm_var": 0.025121053059895832, + "learning_rate": 0.0001, + "loss": 4.6408, + "loss/crossentropy": 2.1769548654556274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.269208699464798, + "step": 4398 + }, + { + "epoch": 0.088, + "grad_norm": 2.5, + "grad_norm_var": 0.028706868489583332, + "learning_rate": 0.0001, + "loss": 4.7372, + "loss/crossentropy": 1.8480086922645569, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2432136833667755, + "step": 4400 + }, + { + "epoch": 0.08804, + "grad_norm": 2.46875, + "grad_norm_var": 0.026838175455729165, + "learning_rate": 0.0001, + "loss": 4.7303, + "loss/crossentropy": 2.1948903799057007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28328536450862885, + "step": 4402 + }, + { + "epoch": 0.08808, + "grad_norm": 2.40625, + "grad_norm_var": 0.022554524739583335, + "learning_rate": 0.0001, + "loss": 4.6929, + "loss/crossentropy": 2.163570761680603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.282375693321228, + "step": 4404 + }, + { + "epoch": 0.08812, + "grad_norm": 2.34375, + "grad_norm_var": 0.022359212239583332, + "learning_rate": 0.0001, + "loss": 4.6882, + "loss/crossentropy": 2.3737696409225464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26703887432813644, + "step": 4406 + }, + { + "epoch": 0.08816, + "grad_norm": 2.71875, + "grad_norm_var": 0.015192667643229166, + "learning_rate": 0.0001, + "loss": 4.6959, + "loss/crossentropy": 2.2449779510498047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2744368612766266, + "step": 4408 + }, + { + "epoch": 0.0882, + "grad_norm": 2.515625, + "grad_norm_var": 0.015348307291666667, + "learning_rate": 0.0001, + "loss": 4.6602, + "loss/crossentropy": 2.1167399287223816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28688907623291016, + "step": 4410 + }, + { + "epoch": 0.08824, + "grad_norm": 2.3125, + "grad_norm_var": 0.017118326822916665, + "learning_rate": 0.0001, + "loss": 4.3777, + "loss/crossentropy": 2.2249929904937744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24534232914447784, + "step": 4412 + }, + { + "epoch": 0.08828, + "grad_norm": 2.421875, + "grad_norm_var": 0.019261678059895832, + "learning_rate": 0.0001, + "loss": 4.7013, + "loss/crossentropy": 2.172566771507263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2783561646938324, + "step": 4414 + }, + { + "epoch": 0.08832, + "grad_norm": 2.234375, + "grad_norm_var": 0.018684895833333333, + "learning_rate": 0.0001, + "loss": 4.3536, + "loss/crossentropy": 2.0709031224250793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2419627606868744, + "step": 4416 + }, + { + "epoch": 0.08836, + "grad_norm": 3.359375, + "grad_norm_var": 0.0735260009765625, + "learning_rate": 0.0001, + "loss": 4.8378, + "loss/crossentropy": 2.2390655279159546, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27328142523765564, + "step": 4418 + }, + { + "epoch": 0.0884, + "grad_norm": 2.59375, + "grad_norm_var": 0.07681884765625, + "learning_rate": 0.0001, + "loss": 4.6879, + "loss/crossentropy": 2.061118960380554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23206621408462524, + "step": 4420 + }, + { + "epoch": 0.08844, + "grad_norm": 2.140625, + "grad_norm_var": 0.09147847493489583, + "learning_rate": 0.0001, + "loss": 4.3028, + "loss/crossentropy": 1.4919558763504028, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17881463468074799, + "step": 4422 + }, + { + "epoch": 0.08848, + "grad_norm": 2.53125, + "grad_norm_var": 0.08662109375, + "learning_rate": 0.0001, + "loss": 4.7705, + "loss/crossentropy": 2.267301082611084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27718164026737213, + "step": 4424 + }, + { + "epoch": 0.08852, + "grad_norm": 2.375, + "grad_norm_var": 0.0862457275390625, + "learning_rate": 0.0001, + "loss": 4.635, + "loss/crossentropy": 1.9008094668388367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22865734994411469, + "step": 4426 + }, + { + "epoch": 0.08856, + "grad_norm": 2.421875, + "grad_norm_var": 0.090966796875, + "learning_rate": 0.0001, + "loss": 4.385, + "loss/crossentropy": 1.8788060545921326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23521529138088226, + "step": 4428 + }, + { + "epoch": 0.0886, + "grad_norm": 2.546875, + "grad_norm_var": 0.08844401041666666, + "learning_rate": 0.0001, + "loss": 4.6703, + "loss/crossentropy": 2.0775802731513977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3402235209941864, + "step": 4430 + }, + { + "epoch": 0.08864, + "grad_norm": 2.515625, + "grad_norm_var": 0.08642171223958334, + "learning_rate": 0.0001, + "loss": 4.5722, + "loss/crossentropy": 2.0950201749801636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2665044367313385, + "step": 4432 + }, + { + "epoch": 0.08868, + "grad_norm": 2.40625, + "grad_norm_var": 0.026167805989583334, + "learning_rate": 0.0001, + "loss": 4.9235, + "loss/crossentropy": 2.328200340270996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24911007285118103, + "step": 4434 + }, + { + "epoch": 0.08872, + "grad_norm": 2.515625, + "grad_norm_var": 0.023273722330729166, + "learning_rate": 0.0001, + "loss": 4.7462, + "loss/crossentropy": 2.1840142011642456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2855416387319565, + "step": 4436 + }, + { + "epoch": 0.08876, + "grad_norm": 2.25, + "grad_norm_var": 0.015559895833333334, + "learning_rate": 0.0001, + "loss": 4.1399, + "loss/crossentropy": 1.6167555451393127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21646380424499512, + "step": 4438 + }, + { + "epoch": 0.0888, + "grad_norm": 2.25, + "grad_norm_var": 0.015543619791666666, + "learning_rate": 0.0001, + "loss": 4.5274, + "loss/crossentropy": 1.9902858138084412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.237789124250412, + "step": 4440 + }, + { + "epoch": 0.08884, + "grad_norm": 2.359375, + "grad_norm_var": 0.016357421875, + "learning_rate": 0.0001, + "loss": 4.6682, + "loss/crossentropy": 2.4779287576675415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2726414203643799, + "step": 4442 + }, + { + "epoch": 0.08888, + "grad_norm": 2.46875, + "grad_norm_var": 0.0116119384765625, + "learning_rate": 0.0001, + "loss": 4.7415, + "loss/crossentropy": 2.0914896726608276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22887279838323593, + "step": 4444 + }, + { + "epoch": 0.08892, + "grad_norm": 2.265625, + "grad_norm_var": 0.010570271809895834, + "learning_rate": 0.0001, + "loss": 4.498, + "loss/crossentropy": 2.0526055693626404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2509382963180542, + "step": 4446 + }, + { + "epoch": 0.08896, + "grad_norm": 2.15625, + "grad_norm_var": 0.010347493489583333, + "learning_rate": 0.0001, + "loss": 4.4306, + "loss/crossentropy": 1.9779084920883179, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2320951297879219, + "step": 4448 + }, + { + "epoch": 0.089, + "grad_norm": 2.421875, + "grad_norm_var": 0.010399373372395833, + "learning_rate": 0.0001, + "loss": 4.7725, + "loss/crossentropy": 2.2081698179244995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29023079574108124, + "step": 4450 + }, + { + "epoch": 0.08904, + "grad_norm": 2.296875, + "grad_norm_var": 0.0074127197265625, + "learning_rate": 0.0001, + "loss": 4.542, + "loss/crossentropy": 1.834806501865387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23430980741977692, + "step": 4452 + }, + { + "epoch": 0.08908, + "grad_norm": 2.28125, + "grad_norm_var": 0.006884765625, + "learning_rate": 0.0001, + "loss": 4.7087, + "loss/crossentropy": 2.4750468730926514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27945323288440704, + "step": 4454 + }, + { + "epoch": 0.08912, + "grad_norm": 2.28125, + "grad_norm_var": 0.007255045572916666, + "learning_rate": 0.0001, + "loss": 4.6822, + "loss/crossentropy": 2.1766942739486694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28881968557834625, + "step": 4456 + }, + { + "epoch": 0.08916, + "grad_norm": 2.28125, + "grad_norm_var": 0.009065755208333333, + "learning_rate": 0.0001, + "loss": 4.9204, + "loss/crossentropy": 2.265195846557617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2616717368364334, + "step": 4458 + }, + { + "epoch": 0.0892, + "grad_norm": 2.171875, + "grad_norm_var": 0.011473592122395833, + "learning_rate": 0.0001, + "loss": 4.5747, + "loss/crossentropy": 2.2438716888427734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2630545049905777, + "step": 4460 + }, + { + "epoch": 0.08924, + "grad_norm": 2.28125, + "grad_norm_var": 0.012572224934895833, + "learning_rate": 0.0001, + "loss": 4.3404, + "loss/crossentropy": 2.060324013233185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23639824986457825, + "step": 4462 + }, + { + "epoch": 0.08928, + "grad_norm": 2.453125, + "grad_norm_var": 0.0331207275390625, + "learning_rate": 0.0001, + "loss": 4.733, + "loss/crossentropy": 1.8830525279045105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21016474813222885, + "step": 4464 + }, + { + "epoch": 0.08932, + "grad_norm": 2.484375, + "grad_norm_var": 0.03435872395833333, + "learning_rate": 0.0001, + "loss": 4.7571, + "loss/crossentropy": 2.3420846462249756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28108468651771545, + "step": 4466 + }, + { + "epoch": 0.08936, + "grad_norm": 2.515625, + "grad_norm_var": 0.0337310791015625, + "learning_rate": 0.0001, + "loss": 4.6851, + "loss/crossentropy": 1.9140342473983765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22587041556835175, + "step": 4468 + }, + { + "epoch": 0.0894, + "grad_norm": 2.390625, + "grad_norm_var": 0.032613118489583336, + "learning_rate": 0.0001, + "loss": 4.79, + "loss/crossentropy": 1.9753122925758362, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24506878852844238, + "step": 4470 + }, + { + "epoch": 0.08944, + "grad_norm": 2.3125, + "grad_norm_var": 0.032389322916666664, + "learning_rate": 0.0001, + "loss": 4.7769, + "loss/crossentropy": 2.0735195875167847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2763114273548126, + "step": 4472 + }, + { + "epoch": 0.08948, + "grad_norm": 2.28125, + "grad_norm_var": 0.034403483072916664, + "learning_rate": 0.0001, + "loss": 4.3593, + "loss/crossentropy": 1.7784460186958313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22829323261976242, + "step": 4474 + }, + { + "epoch": 0.08952, + "grad_norm": 2.625, + "grad_norm_var": 0.051041666666666666, + "learning_rate": 0.0001, + "loss": 4.9451, + "loss/crossentropy": 2.1188095808029175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27410270273685455, + "step": 4476 + }, + { + "epoch": 0.08956, + "grad_norm": 2.421875, + "grad_norm_var": 0.051634724934895834, + "learning_rate": 0.0001, + "loss": 4.3392, + "loss/crossentropy": 2.320235252380371, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27514997124671936, + "step": 4478 + }, + { + "epoch": 0.0896, + "grad_norm": 2.265625, + "grad_norm_var": 0.03766988118489583, + "learning_rate": 0.0001, + "loss": 4.8345, + "loss/crossentropy": 2.3023080825805664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2516366094350815, + "step": 4480 + }, + { + "epoch": 0.08964, + "grad_norm": 2.390625, + "grad_norm_var": 0.039567057291666666, + "learning_rate": 0.0001, + "loss": 4.6698, + "loss/crossentropy": 1.9677563905715942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2362382560968399, + "step": 4482 + }, + { + "epoch": 0.08968, + "grad_norm": 2.34375, + "grad_norm_var": 0.03951416015625, + "learning_rate": 0.0001, + "loss": 4.9159, + "loss/crossentropy": 2.2005198001861572, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2760336175560951, + "step": 4484 + }, + { + "epoch": 0.08972, + "grad_norm": 2.0625, + "grad_norm_var": 0.0478912353515625, + "learning_rate": 0.0001, + "loss": 4.4418, + "loss/crossentropy": 1.9799931049346924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25132423639297485, + "step": 4486 + }, + { + "epoch": 0.08976, + "grad_norm": 2.46875, + "grad_norm_var": 0.10212300618489584, + "learning_rate": 0.0001, + "loss": 4.9643, + "loss/crossentropy": 2.205371141433716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2695635259151459, + "step": 4488 + }, + { + "epoch": 0.0898, + "grad_norm": 2.421875, + "grad_norm_var": 0.09692281087239583, + "learning_rate": 0.0001, + "loss": 4.9695, + "loss/crossentropy": 2.3500062227249146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2736963629722595, + "step": 4490 + }, + { + "epoch": 0.08984, + "grad_norm": 2.59375, + "grad_norm_var": 0.07867431640625, + "learning_rate": 0.0001, + "loss": 4.9083, + "loss/crossentropy": 2.386352837085724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2472890019416809, + "step": 4492 + }, + { + "epoch": 0.08988, + "grad_norm": 2.265625, + "grad_norm_var": 0.07701416015625, + "learning_rate": 0.0001, + "loss": 4.3598, + "loss/crossentropy": 1.9863982200622559, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2288871705532074, + "step": 4494 + }, + { + "epoch": 0.08992, + "grad_norm": 2.34375, + "grad_norm_var": 0.07757059733072917, + "learning_rate": 0.0001, + "loss": 4.6157, + "loss/crossentropy": 2.4088594913482666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24304527044296265, + "step": 4496 + }, + { + "epoch": 0.08996, + "grad_norm": 2.5, + "grad_norm_var": 0.07517801920572917, + "learning_rate": 0.0001, + "loss": 4.7457, + "loss/crossentropy": 2.1663140058517456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2681227922439575, + "step": 4498 + }, + { + "epoch": 0.09, + "grad_norm": 2.171875, + "grad_norm_var": 0.0810455322265625, + "learning_rate": 0.0001, + "loss": 4.2199, + "loss/crossentropy": 1.9233570098876953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23169977217912674, + "step": 4500 + }, + { + "epoch": 0.09004, + "grad_norm": 2.40625, + "grad_norm_var": 0.07579752604166666, + "learning_rate": 0.0001, + "loss": 4.4677, + "loss/crossentropy": 2.2940425872802734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24809539318084717, + "step": 4502 + }, + { + "epoch": 0.09008, + "grad_norm": 2.21875, + "grad_norm_var": 0.015771484375, + "learning_rate": 0.0001, + "loss": 4.5333, + "loss/crossentropy": 2.1874176263809204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26286616921424866, + "step": 4504 + }, + { + "epoch": 0.09012, + "grad_norm": 2.28125, + "grad_norm_var": 0.015868123372395834, + "learning_rate": 0.0001, + "loss": 4.5029, + "loss/crossentropy": 2.190543472766876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26766398549079895, + "step": 4506 + }, + { + "epoch": 0.09016, + "grad_norm": 2.296875, + "grad_norm_var": 0.010823567708333334, + "learning_rate": 0.0001, + "loss": 4.464, + "loss/crossentropy": 2.132491707801819, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2514026165008545, + "step": 4508 + }, + { + "epoch": 0.0902, + "grad_norm": 2.296875, + "grad_norm_var": 0.0107818603515625, + "learning_rate": 0.0001, + "loss": 4.5032, + "loss/crossentropy": 2.1492353677749634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25663119554519653, + "step": 4510 + }, + { + "epoch": 0.09024, + "grad_norm": 2.203125, + "grad_norm_var": 0.011454264322916666, + "learning_rate": 0.0001, + "loss": 4.6166, + "loss/crossentropy": 2.2471927404403687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24812395125627518, + "step": 4512 + }, + { + "epoch": 0.09028, + "grad_norm": 2.34375, + "grad_norm_var": 0.007013956705729167, + "learning_rate": 0.0001, + "loss": 4.5651, + "loss/crossentropy": 2.1944304704666138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26567137241363525, + "step": 4514 + }, + { + "epoch": 0.09032, + "grad_norm": 2.203125, + "grad_norm_var": 0.006883748372395833, + "learning_rate": 0.0001, + "loss": 4.3054, + "loss/crossentropy": 1.7537739872932434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21784386038780212, + "step": 4516 + }, + { + "epoch": 0.09036, + "grad_norm": 2.46875, + "grad_norm_var": 0.006981404622395834, + "learning_rate": 0.0001, + "loss": 4.6961, + "loss/crossentropy": 2.133580207824707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27865441143512726, + "step": 4518 + }, + { + "epoch": 0.0904, + "grad_norm": 2.5, + "grad_norm_var": 0.008723958333333334, + "learning_rate": 0.0001, + "loss": 4.5486, + "loss/crossentropy": 2.0858315229415894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26390860974788666, + "step": 4520 + }, + { + "epoch": 0.09044, + "grad_norm": 2.3125, + "grad_norm_var": 0.008101399739583333, + "learning_rate": 0.0001, + "loss": 4.6833, + "loss/crossentropy": 1.9084516763687134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23013630509376526, + "step": 4522 + }, + { + "epoch": 0.09048, + "grad_norm": 2.234375, + "grad_norm_var": 0.00859375, + "learning_rate": 0.0001, + "loss": 4.5817, + "loss/crossentropy": 2.2123712301254272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2739466577768326, + "step": 4524 + }, + { + "epoch": 0.09052, + "grad_norm": 2.453125, + "grad_norm_var": 0.009691365559895833, + "learning_rate": 0.0001, + "loss": 4.9898, + "loss/crossentropy": 2.3532934188842773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25509175658226013, + "step": 4526 + }, + { + "epoch": 0.09056, + "grad_norm": 2.328125, + "grad_norm_var": 0.008137003580729166, + "learning_rate": 0.0001, + "loss": 4.7786, + "loss/crossentropy": 2.1543048620224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23639082163572311, + "step": 4528 + }, + { + "epoch": 0.0906, + "grad_norm": 2.4375, + "grad_norm_var": 0.0109527587890625, + "learning_rate": 0.0001, + "loss": 4.647, + "loss/crossentropy": 2.1322286128997803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23874164372682571, + "step": 4530 + }, + { + "epoch": 0.09064, + "grad_norm": 2.25, + "grad_norm_var": 0.01011962890625, + "learning_rate": 0.0001, + "loss": 4.7623, + "loss/crossentropy": 2.5552597045898438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24949757009744644, + "step": 4532 + }, + { + "epoch": 0.09068, + "grad_norm": 2.421875, + "grad_norm_var": 0.0117095947265625, + "learning_rate": 0.0001, + "loss": 4.6999, + "loss/crossentropy": 2.1248819231987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2795845717191696, + "step": 4534 + }, + { + "epoch": 0.09072, + "grad_norm": 2.390625, + "grad_norm_var": 0.014655558268229167, + "learning_rate": 0.0001, + "loss": 4.7739, + "loss/crossentropy": 1.985984206199646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22764238715171814, + "step": 4536 + }, + { + "epoch": 0.09076, + "grad_norm": 2.328125, + "grad_norm_var": 0.01519775390625, + "learning_rate": 0.0001, + "loss": 4.64, + "loss/crossentropy": 2.220720648765564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2693287283182144, + "step": 4538 + }, + { + "epoch": 0.0908, + "grad_norm": 2.3125, + "grad_norm_var": 0.01324462890625, + "learning_rate": 0.0001, + "loss": 4.486, + "loss/crossentropy": 1.9541595578193665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2419680804014206, + "step": 4540 + }, + { + "epoch": 0.09084, + "grad_norm": 2.53125, + "grad_norm_var": 0.013916015625, + "learning_rate": 0.0001, + "loss": 4.963, + "loss/crossentropy": 2.275113582611084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2562567666172981, + "step": 4542 + }, + { + "epoch": 0.09088, + "grad_norm": 2.3125, + "grad_norm_var": 0.017122395833333335, + "learning_rate": 0.0001, + "loss": 4.7535, + "loss/crossentropy": 2.4411803483963013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25798996537923813, + "step": 4544 + }, + { + "epoch": 0.09092, + "grad_norm": 2.34375, + "grad_norm_var": 0.019823201497395835, + "learning_rate": 0.0001, + "loss": 4.7339, + "loss/crossentropy": 2.035769820213318, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25313758105039597, + "step": 4546 + }, + { + "epoch": 0.09096, + "grad_norm": 2.171875, + "grad_norm_var": 0.018701171875, + "learning_rate": 0.0001, + "loss": 4.1331, + "loss/crossentropy": 1.9237529039382935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24456002563238144, + "step": 4548 + }, + { + "epoch": 0.091, + "grad_norm": 2.234375, + "grad_norm_var": 0.02281494140625, + "learning_rate": 0.0001, + "loss": 4.6762, + "loss/crossentropy": 2.179704189300537, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2681535929441452, + "step": 4550 + }, + { + "epoch": 0.09104, + "grad_norm": 2.46875, + "grad_norm_var": 0.024137369791666665, + "learning_rate": 0.0001, + "loss": 4.6899, + "loss/crossentropy": 2.013023316860199, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22643990069627762, + "step": 4552 + }, + { + "epoch": 0.09108, + "grad_norm": 2.453125, + "grad_norm_var": 0.024201456705729166, + "learning_rate": 0.0001, + "loss": 4.6527, + "loss/crossentropy": 2.173883557319641, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26495447754859924, + "step": 4554 + }, + { + "epoch": 0.09112, + "grad_norm": 2.296875, + "grad_norm_var": 0.025121053059895832, + "learning_rate": 0.0001, + "loss": 4.5323, + "loss/crossentropy": 1.9398870468139648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24307211488485336, + "step": 4556 + }, + { + "epoch": 0.09116, + "grad_norm": 2.28125, + "grad_norm_var": 0.025423177083333335, + "learning_rate": 0.0001, + "loss": 4.2028, + "loss/crossentropy": 1.8551223874092102, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21659143269062042, + "step": 4558 + }, + { + "epoch": 0.0912, + "grad_norm": 2.375, + "grad_norm_var": 0.0254058837890625, + "learning_rate": 0.0001, + "loss": 4.7813, + "loss/crossentropy": 2.104207456111908, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26515253633260727, + "step": 4560 + }, + { + "epoch": 0.09124, + "grad_norm": 2.484375, + "grad_norm_var": 0.023875935872395834, + "learning_rate": 0.0001, + "loss": 4.6241, + "loss/crossentropy": 2.31631863117218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25454505532979965, + "step": 4562 + }, + { + "epoch": 0.09128, + "grad_norm": 2.265625, + "grad_norm_var": 0.021800740559895834, + "learning_rate": 0.0001, + "loss": 4.5732, + "loss/crossentropy": 2.331356406211853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2667195200920105, + "step": 4564 + }, + { + "epoch": 0.09132, + "grad_norm": 2.515625, + "grad_norm_var": 0.0176910400390625, + "learning_rate": 0.0001, + "loss": 4.6714, + "loss/crossentropy": 2.2126184701919556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24132181704044342, + "step": 4566 + }, + { + "epoch": 0.09136, + "grad_norm": 2.25, + "grad_norm_var": 0.013167317708333333, + "learning_rate": 0.0001, + "loss": 4.5753, + "loss/crossentropy": 2.330659508705139, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2838585078716278, + "step": 4568 + }, + { + "epoch": 0.0914, + "grad_norm": 2.28125, + "grad_norm_var": 0.0132720947265625, + "learning_rate": 0.0001, + "loss": 4.4935, + "loss/crossentropy": 2.167214035987854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24021611362695694, + "step": 4570 + }, + { + "epoch": 0.09144, + "grad_norm": 2.15625, + "grad_norm_var": 0.014046223958333333, + "learning_rate": 0.0001, + "loss": 4.5847, + "loss/crossentropy": 1.770102322101593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2108836993575096, + "step": 4572 + }, + { + "epoch": 0.09148, + "grad_norm": 2.234375, + "grad_norm_var": 0.015425618489583333, + "learning_rate": 0.0001, + "loss": 4.2906, + "loss/crossentropy": 1.9293717741966248, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2186967208981514, + "step": 4574 + }, + { + "epoch": 0.09152, + "grad_norm": 2.484375, + "grad_norm_var": 0.014188639322916667, + "learning_rate": 0.0001, + "loss": 4.8128, + "loss/crossentropy": 2.4099135398864746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25587645173072815, + "step": 4576 + }, + { + "epoch": 0.09156, + "grad_norm": 2.359375, + "grad_norm_var": 0.06220296223958333, + "learning_rate": 0.0001, + "loss": 4.5727, + "loss/crossentropy": 1.7967870831489563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2521464377641678, + "step": 4578 + }, + { + "epoch": 0.0916, + "grad_norm": 2.46875, + "grad_norm_var": 0.0616851806640625, + "learning_rate": 0.0001, + "loss": 4.7604, + "loss/crossentropy": 1.989583432674408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22892683744430542, + "step": 4580 + }, + { + "epoch": 0.09164, + "grad_norm": 2.625, + "grad_norm_var": 0.0626953125, + "learning_rate": 0.0001, + "loss": 4.4986, + "loss/crossentropy": 2.167198657989502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24552703648805618, + "step": 4582 + }, + { + "epoch": 0.09168, + "grad_norm": 2.453125, + "grad_norm_var": 0.06129150390625, + "learning_rate": 0.0001, + "loss": 4.8237, + "loss/crossentropy": 2.213137984275818, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2614079788327217, + "step": 4584 + }, + { + "epoch": 0.09172, + "grad_norm": 2.265625, + "grad_norm_var": 0.06194661458333333, + "learning_rate": 0.0001, + "loss": 4.977, + "loss/crossentropy": 2.3586392998695374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2527218610048294, + "step": 4586 + }, + { + "epoch": 0.09176, + "grad_norm": 2.53125, + "grad_norm_var": 0.06054280598958333, + "learning_rate": 0.0001, + "loss": 4.5118, + "loss/crossentropy": 2.326598286628723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2684827446937561, + "step": 4588 + }, + { + "epoch": 0.0918, + "grad_norm": 2.234375, + "grad_norm_var": 0.055562337239583336, + "learning_rate": 0.0001, + "loss": 4.4296, + "loss/crossentropy": 2.1365907192230225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2705482095479965, + "step": 4590 + }, + { + "epoch": 0.09184, + "grad_norm": 2.328125, + "grad_norm_var": 0.056005859375, + "learning_rate": 0.0001, + "loss": 4.6895, + "loss/crossentropy": 1.816661775112152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2188858687877655, + "step": 4592 + }, + { + "epoch": 0.09188, + "grad_norm": 2.203125, + "grad_norm_var": 0.020731608072916668, + "learning_rate": 0.0001, + "loss": 4.5953, + "loss/crossentropy": 2.1533923149108887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2372177392244339, + "step": 4594 + }, + { + "epoch": 0.09192, + "grad_norm": 2.984375, + "grad_norm_var": 0.04504801432291667, + "learning_rate": 0.0001, + "loss": 4.9472, + "loss/crossentropy": 2.2175614833831787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27555912733078003, + "step": 4596 + }, + { + "epoch": 0.09196, + "grad_norm": 2.640625, + "grad_norm_var": 0.04644775390625, + "learning_rate": 0.0001, + "loss": 5.11, + "loss/crossentropy": 2.583792209625244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2970864772796631, + "step": 4598 + }, + { + "epoch": 0.092, + "grad_norm": 2.40625, + "grad_norm_var": 0.046751912434895834, + "learning_rate": 0.0001, + "loss": 4.5761, + "loss/crossentropy": 2.227868676185608, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.259210504591465, + "step": 4600 + }, + { + "epoch": 0.09204, + "grad_norm": 2.265625, + "grad_norm_var": 0.047118123372395834, + "learning_rate": 0.0001, + "loss": 4.4125, + "loss/crossentropy": 2.146941900253296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26443855464458466, + "step": 4602 + }, + { + "epoch": 0.09208, + "grad_norm": 2.296875, + "grad_norm_var": 0.04439697265625, + "learning_rate": 0.0001, + "loss": 4.8655, + "loss/crossentropy": 2.129795551300049, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2451881766319275, + "step": 4604 + }, + { + "epoch": 0.09212, + "grad_norm": 2.8125, + "grad_norm_var": 0.0506256103515625, + "learning_rate": 0.0001, + "loss": 4.895, + "loss/crossentropy": 2.2696213722229004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26037923991680145, + "step": 4606 + }, + { + "epoch": 0.09216, + "grad_norm": 3.03125, + "grad_norm_var": 0.07105712890625, + "learning_rate": 0.0001, + "loss": 4.7424, + "loss/crossentropy": 2.1916056871414185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2648691013455391, + "step": 4608 + }, + { + "epoch": 0.0922, + "grad_norm": 2.15625, + "grad_norm_var": 0.07119038899739584, + "learning_rate": 0.0001, + "loss": 4.5021, + "loss/crossentropy": 1.975761890411377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22248996049165726, + "step": 4610 + }, + { + "epoch": 0.09224, + "grad_norm": 2.265625, + "grad_norm_var": 0.053511555989583334, + "learning_rate": 0.0001, + "loss": 4.6596, + "loss/crossentropy": 2.403375506401062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24553030729293823, + "step": 4612 + }, + { + "epoch": 0.09228, + "grad_norm": 2.390625, + "grad_norm_var": 0.049637858072916666, + "learning_rate": 0.0001, + "loss": 4.4542, + "loss/crossentropy": 1.9468475580215454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24023501574993134, + "step": 4614 + }, + { + "epoch": 0.09232, + "grad_norm": 2.40625, + "grad_norm_var": 0.0534088134765625, + "learning_rate": 0.0001, + "loss": 4.8095, + "loss/crossentropy": 2.013557195663452, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23829826712608337, + "step": 4616 + }, + { + "epoch": 0.09236, + "grad_norm": 2.265625, + "grad_norm_var": 0.0519439697265625, + "learning_rate": 0.0001, + "loss": 4.5736, + "loss/crossentropy": 2.0552549958229065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2391991689801216, + "step": 4618 + }, + { + "epoch": 0.0924, + "grad_norm": 2.28125, + "grad_norm_var": 0.053694661458333334, + "learning_rate": 0.0001, + "loss": 4.4091, + "loss/crossentropy": 1.8311110734939575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23821169883012772, + "step": 4620 + }, + { + "epoch": 0.09244, + "grad_norm": 2.28125, + "grad_norm_var": 0.04431966145833333, + "learning_rate": 0.0001, + "loss": 4.5034, + "loss/crossentropy": 2.0346454977989197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23688799142837524, + "step": 4622 + }, + { + "epoch": 0.09248, + "grad_norm": 2.484375, + "grad_norm_var": 0.016304524739583333, + "learning_rate": 0.0001, + "loss": 4.5329, + "loss/crossentropy": 1.8475716710090637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23444947600364685, + "step": 4624 + }, + { + "epoch": 0.09252, + "grad_norm": 2.328125, + "grad_norm_var": 0.0136383056640625, + "learning_rate": 0.0001, + "loss": 4.4315, + "loss/crossentropy": 2.075626790523529, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2663211151957512, + "step": 4626 + }, + { + "epoch": 0.09256, + "grad_norm": 2.46875, + "grad_norm_var": 0.0130035400390625, + "learning_rate": 0.0001, + "loss": 4.9214, + "loss/crossentropy": 2.413028359413147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29516373574733734, + "step": 4628 + }, + { + "epoch": 0.0926, + "grad_norm": 2.53125, + "grad_norm_var": 0.014143880208333333, + "learning_rate": 0.0001, + "loss": 4.6878, + "loss/crossentropy": 1.8549358248710632, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23275860399007797, + "step": 4630 + }, + { + "epoch": 0.09264, + "grad_norm": 2.390625, + "grad_norm_var": 0.0076405843098958336, + "learning_rate": 0.0001, + "loss": 4.8594, + "loss/crossentropy": 2.273250460624695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2511187419295311, + "step": 4632 + }, + { + "epoch": 0.09268, + "grad_norm": 2.4375, + "grad_norm_var": 0.013093058268229167, + "learning_rate": 0.0001, + "loss": 4.336, + "loss/crossentropy": 1.929758369922638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25462278723716736, + "step": 4634 + }, + { + "epoch": 0.09272, + "grad_norm": 2.296875, + "grad_norm_var": 0.013158162434895834, + "learning_rate": 0.0001, + "loss": 4.5888, + "loss/crossentropy": 2.0929598212242126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23592843115329742, + "step": 4636 + }, + { + "epoch": 0.09276, + "grad_norm": 2.21875, + "grad_norm_var": 0.015282185872395833, + "learning_rate": 0.0001, + "loss": 4.4595, + "loss/crossentropy": 1.973824143409729, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24049720913171768, + "step": 4638 + }, + { + "epoch": 0.0928, + "grad_norm": 2.25, + "grad_norm_var": 0.014207967122395833, + "learning_rate": 0.0001, + "loss": 4.2606, + "loss/crossentropy": 1.8606626987457275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21698658913373947, + "step": 4640 + }, + { + "epoch": 0.09284, + "grad_norm": 2.5625, + "grad_norm_var": 0.01875, + "learning_rate": 0.0001, + "loss": 4.9827, + "loss/crossentropy": 2.241386890411377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2831972986459732, + "step": 4642 + }, + { + "epoch": 0.09288, + "grad_norm": 2.34375, + "grad_norm_var": 0.0166656494140625, + "learning_rate": 0.0001, + "loss": 4.7046, + "loss/crossentropy": 2.2538920640945435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2482161968946457, + "step": 4644 + }, + { + "epoch": 0.09292, + "grad_norm": 2.578125, + "grad_norm_var": 0.01812744140625, + "learning_rate": 0.0001, + "loss": 4.8557, + "loss/crossentropy": 2.067206382751465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26665763556957245, + "step": 4646 + }, + { + "epoch": 0.09296, + "grad_norm": 2.5625, + "grad_norm_var": 0.021751912434895833, + "learning_rate": 0.0001, + "loss": 4.9474, + "loss/crossentropy": 2.33401358127594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2752760946750641, + "step": 4648 + }, + { + "epoch": 0.093, + "grad_norm": 2.4375, + "grad_norm_var": 0.04914449055989583, + "learning_rate": 0.0001, + "loss": 4.8658, + "loss/crossentropy": 2.1510735750198364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24588150531053543, + "step": 4650 + }, + { + "epoch": 0.09304, + "grad_norm": 2.28125, + "grad_norm_var": 0.049397786458333336, + "learning_rate": 0.0001, + "loss": 4.579, + "loss/crossentropy": 2.086448848247528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.261296346783638, + "step": 4652 + }, + { + "epoch": 0.09308, + "grad_norm": 2.5625, + "grad_norm_var": 0.0446685791015625, + "learning_rate": 0.0001, + "loss": 4.7495, + "loss/crossentropy": 1.5951193571090698, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2312234491109848, + "step": 4654 + }, + { + "epoch": 0.09312, + "grad_norm": 2.328125, + "grad_norm_var": 0.04052734375, + "learning_rate": 0.0001, + "loss": 4.2273, + "loss/crossentropy": 2.0098360776901245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22354383766651154, + "step": 4656 + }, + { + "epoch": 0.09316, + "grad_norm": 2.375, + "grad_norm_var": 0.04079488118489583, + "learning_rate": 0.0001, + "loss": 4.6047, + "loss/crossentropy": 2.263219714164734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26900506019592285, + "step": 4658 + }, + { + "epoch": 0.0932, + "grad_norm": 2.828125, + "grad_norm_var": 0.0485504150390625, + "learning_rate": 0.0001, + "loss": 4.5575, + "loss/crossentropy": 1.9481555819511414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24665354192256927, + "step": 4660 + }, + { + "epoch": 0.09324, + "grad_norm": 2.265625, + "grad_norm_var": 0.04988505045572917, + "learning_rate": 0.0001, + "loss": 4.5895, + "loss/crossentropy": 1.8751367926597595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24356309324502945, + "step": 4662 + }, + { + "epoch": 0.09328, + "grad_norm": 2.28125, + "grad_norm_var": 0.06110026041666667, + "learning_rate": 0.0001, + "loss": 4.7739, + "loss/crossentropy": 2.343896746635437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2558425962924957, + "step": 4664 + }, + { + "epoch": 0.09332, + "grad_norm": 2.40625, + "grad_norm_var": 0.032811482747395836, + "learning_rate": 0.0001, + "loss": 4.779, + "loss/crossentropy": 2.067797303199768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25030098110437393, + "step": 4666 + }, + { + "epoch": 0.09336, + "grad_norm": 2.359375, + "grad_norm_var": 0.029816691080729166, + "learning_rate": 0.0001, + "loss": 4.7427, + "loss/crossentropy": 2.4221161603927612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26934675872325897, + "step": 4668 + }, + { + "epoch": 0.0934, + "grad_norm": 2.8125, + "grad_norm_var": 0.038182576497395836, + "learning_rate": 0.0001, + "loss": 4.845, + "loss/crossentropy": 2.2733672857284546, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2536270022392273, + "step": 4670 + }, + { + "epoch": 0.09344, + "grad_norm": 2.390625, + "grad_norm_var": 0.0369537353515625, + "learning_rate": 0.0001, + "loss": 4.7721, + "loss/crossentropy": 2.179157257080078, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24323320388793945, + "step": 4672 + }, + { + "epoch": 0.09348, + "grad_norm": 2.578125, + "grad_norm_var": 0.03875325520833333, + "learning_rate": 0.0001, + "loss": 4.8574, + "loss/crossentropy": 2.3613970279693604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2567686140537262, + "step": 4674 + }, + { + "epoch": 0.09352, + "grad_norm": 2.5, + "grad_norm_var": 0.029683430989583332, + "learning_rate": 0.0001, + "loss": 4.787, + "loss/crossentropy": 2.0755810141563416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2349473536014557, + "step": 4676 + }, + { + "epoch": 0.09356, + "grad_norm": 2.25, + "grad_norm_var": 0.03134358723958333, + "learning_rate": 0.0001, + "loss": 4.6235, + "loss/crossentropy": 1.9971619248390198, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22000454366207123, + "step": 4678 + }, + { + "epoch": 0.0936, + "grad_norm": 2.3125, + "grad_norm_var": 0.020068359375, + "learning_rate": 0.0001, + "loss": 4.5307, + "loss/crossentropy": 1.9419977068901062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21052303910255432, + "step": 4680 + }, + { + "epoch": 0.09364, + "grad_norm": 2.3125, + "grad_norm_var": 0.021122233072916666, + "learning_rate": 0.0001, + "loss": 4.6314, + "loss/crossentropy": 2.1175334453582764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23567666858434677, + "step": 4682 + }, + { + "epoch": 0.09368, + "grad_norm": 2.40625, + "grad_norm_var": 0.021122233072916666, + "learning_rate": 0.0001, + "loss": 4.5955, + "loss/crossentropy": 2.0300605297088623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29177258908748627, + "step": 4684 + }, + { + "epoch": 0.09372, + "grad_norm": 2.296875, + "grad_norm_var": 0.009228515625, + "learning_rate": 0.0001, + "loss": 4.5583, + "loss/crossentropy": 2.052473723888397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22480525821447372, + "step": 4686 + }, + { + "epoch": 0.09376, + "grad_norm": 2.359375, + "grad_norm_var": 0.008275349934895834, + "learning_rate": 0.0001, + "loss": 4.4925, + "loss/crossentropy": 2.2506834268569946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26876674592494965, + "step": 4688 + }, + { + "epoch": 0.0938, + "grad_norm": 2.125, + "grad_norm_var": 0.007515462239583334, + "learning_rate": 0.0001, + "loss": 4.291, + "loss/crossentropy": 1.930393099784851, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22246946394443512, + "step": 4690 + }, + { + "epoch": 0.09384, + "grad_norm": 2.5, + "grad_norm_var": 0.007616170247395833, + "learning_rate": 0.0001, + "loss": 4.9317, + "loss/crossentropy": 2.5477651357650757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27808643877506256, + "step": 4692 + }, + { + "epoch": 0.09388, + "grad_norm": 2.15625, + "grad_norm_var": 0.009033203125, + "learning_rate": 0.0001, + "loss": 4.5313, + "loss/crossentropy": 2.1059221625328064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.253792941570282, + "step": 4694 + }, + { + "epoch": 0.09392, + "grad_norm": 2.265625, + "grad_norm_var": 0.0091217041015625, + "learning_rate": 0.0001, + "loss": 4.607, + "loss/crossentropy": 2.0058358907699585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23942459374666214, + "step": 4696 + }, + { + "epoch": 0.09396, + "grad_norm": 2.25, + "grad_norm_var": 0.0103515625, + "learning_rate": 0.0001, + "loss": 4.2024, + "loss/crossentropy": 1.9118947982788086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2300974577665329, + "step": 4698 + }, + { + "epoch": 0.094, + "grad_norm": 2.5625, + "grad_norm_var": 0.013337198893229167, + "learning_rate": 0.0001, + "loss": 4.6541, + "loss/crossentropy": 2.1172733902931213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24785596132278442, + "step": 4700 + }, + { + "epoch": 0.09404, + "grad_norm": 2.5, + "grad_norm_var": 0.016044108072916667, + "learning_rate": 0.0001, + "loss": 4.9005, + "loss/crossentropy": 2.0064170956611633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25045372545719147, + "step": 4702 + }, + { + "epoch": 0.09408, + "grad_norm": 2.3125, + "grad_norm_var": 0.015738932291666667, + "learning_rate": 0.0001, + "loss": 4.8139, + "loss/crossentropy": 2.5816656351089478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27938568592071533, + "step": 4704 + }, + { + "epoch": 0.09412, + "grad_norm": 2.25, + "grad_norm_var": 0.016597493489583334, + "learning_rate": 0.0001, + "loss": 4.866, + "loss/crossentropy": 2.5768171548843384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27975398302078247, + "step": 4706 + }, + { + "epoch": 0.09416, + "grad_norm": 2.234375, + "grad_norm_var": 0.0153717041015625, + "learning_rate": 0.0001, + "loss": 4.6075, + "loss/crossentropy": 2.323893189430237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2608431279659271, + "step": 4708 + }, + { + "epoch": 0.0942, + "grad_norm": 2.265625, + "grad_norm_var": 0.01357421875, + "learning_rate": 0.0001, + "loss": 4.3564, + "loss/crossentropy": 1.7910810708999634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21380099654197693, + "step": 4710 + }, + { + "epoch": 0.09424, + "grad_norm": 2.21875, + "grad_norm_var": 0.015848795572916668, + "learning_rate": 0.0001, + "loss": 4.6374, + "loss/crossentropy": 2.214052677154541, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24833911657333374, + "step": 4712 + }, + { + "epoch": 0.09428, + "grad_norm": 2.40625, + "grad_norm_var": 0.0158355712890625, + "learning_rate": 0.0001, + "loss": 4.9987, + "loss/crossentropy": 2.0850380063056946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2419341504573822, + "step": 4714 + }, + { + "epoch": 0.09432, + "grad_norm": 2.359375, + "grad_norm_var": 0.014058430989583334, + "learning_rate": 0.0001, + "loss": 4.2961, + "loss/crossentropy": 2.0707927346229553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2499033510684967, + "step": 4716 + }, + { + "epoch": 0.09436, + "grad_norm": 2.3125, + "grad_norm_var": 0.0125396728515625, + "learning_rate": 0.0001, + "loss": 4.4057, + "loss/crossentropy": 2.11221444606781, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25916673243045807, + "step": 4718 + }, + { + "epoch": 0.0944, + "grad_norm": 2.390625, + "grad_norm_var": 0.012718709309895833, + "learning_rate": 0.0001, + "loss": 4.6057, + "loss/crossentropy": 1.8400230407714844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22771906107664108, + "step": 4720 + }, + { + "epoch": 0.09444, + "grad_norm": 2.28125, + "grad_norm_var": 0.008234659830729166, + "learning_rate": 0.0001, + "loss": 4.4929, + "loss/crossentropy": 2.2271865606307983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2574647441506386, + "step": 4722 + }, + { + "epoch": 0.09448, + "grad_norm": 2.203125, + "grad_norm_var": 0.008649698893229167, + "learning_rate": 0.0001, + "loss": 4.4629, + "loss/crossentropy": 2.4532920122146606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27140843868255615, + "step": 4724 + }, + { + "epoch": 0.09452, + "grad_norm": 2.375, + "grad_norm_var": 0.009504191080729167, + "learning_rate": 0.0001, + "loss": 4.731, + "loss/crossentropy": 1.951303780078888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2333502620458603, + "step": 4726 + }, + { + "epoch": 0.09456, + "grad_norm": 2.40625, + "grad_norm_var": 0.009137980143229167, + "learning_rate": 0.0001, + "loss": 4.6334, + "loss/crossentropy": 2.209821343421936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24776015430688858, + "step": 4728 + }, + { + "epoch": 0.0946, + "grad_norm": 2.3125, + "grad_norm_var": 0.010872395833333333, + "learning_rate": 0.0001, + "loss": 4.861, + "loss/crossentropy": 2.434941530227661, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2747645229101181, + "step": 4730 + }, + { + "epoch": 0.09464, + "grad_norm": 2.1875, + "grad_norm_var": 0.010602823893229167, + "learning_rate": 0.0001, + "loss": 4.6145, + "loss/crossentropy": 2.051652252674103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24515582621097565, + "step": 4732 + }, + { + "epoch": 0.09468, + "grad_norm": 2.265625, + "grad_norm_var": 0.009797159830729167, + "learning_rate": 0.0001, + "loss": 4.6804, + "loss/crossentropy": 2.0039377212524414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23073314130306244, + "step": 4734 + }, + { + "epoch": 0.09472, + "grad_norm": 2.4375, + "grad_norm_var": 0.19903971354166666, + "learning_rate": 0.0001, + "loss": 4.8333, + "loss/crossentropy": 2.166410982608795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25871724635362625, + "step": 4736 + }, + { + "epoch": 0.09476, + "grad_norm": 2.3125, + "grad_norm_var": 0.2066802978515625, + "learning_rate": 0.0001, + "loss": 4.2739, + "loss/crossentropy": 1.9289153218269348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23183659464120865, + "step": 4738 + }, + { + "epoch": 0.0948, + "grad_norm": 2.46875, + "grad_norm_var": 0.20640360514322917, + "learning_rate": 0.0001, + "loss": 4.5903, + "loss/crossentropy": 2.270771861076355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26335832476615906, + "step": 4740 + }, + { + "epoch": 0.09484, + "grad_norm": 2.4375, + "grad_norm_var": 0.206494140625, + "learning_rate": 0.0001, + "loss": 4.7142, + "loss/crossentropy": 2.395651936531067, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28527122735977173, + "step": 4742 + }, + { + "epoch": 0.09488, + "grad_norm": 2.390625, + "grad_norm_var": 0.20437723795572918, + "learning_rate": 0.0001, + "loss": 4.5083, + "loss/crossentropy": 1.8597867488861084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.224149189889431, + "step": 4744 + }, + { + "epoch": 0.09492, + "grad_norm": 2.265625, + "grad_norm_var": 0.20693257649739583, + "learning_rate": 0.0001, + "loss": 4.4415, + "loss/crossentropy": 1.7795116305351257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23233074694871902, + "step": 4746 + }, + { + "epoch": 0.09496, + "grad_norm": 2.4375, + "grad_norm_var": 0.20255533854166666, + "learning_rate": 0.0001, + "loss": 4.7749, + "loss/crossentropy": 1.9449282884597778, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2524118423461914, + "step": 4748 + }, + { + "epoch": 0.095, + "grad_norm": 2.53125, + "grad_norm_var": 0.2005035400390625, + "learning_rate": 0.0001, + "loss": 4.7047, + "loss/crossentropy": 2.195169448852539, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2349853590130806, + "step": 4750 + }, + { + "epoch": 0.09504, + "grad_norm": 2.671875, + "grad_norm_var": 0.022468058268229167, + "learning_rate": 0.0001, + "loss": 4.5268, + "loss/crossentropy": 1.6628928184509277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1975274682044983, + "step": 4752 + }, + { + "epoch": 0.09508, + "grad_norm": 2.25, + "grad_norm_var": 0.018488566080729168, + "learning_rate": 0.0001, + "loss": 4.5862, + "loss/crossentropy": 2.0991236567497253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23341374844312668, + "step": 4754 + }, + { + "epoch": 0.09512, + "grad_norm": 2.40625, + "grad_norm_var": 0.017024739583333334, + "learning_rate": 0.0001, + "loss": 4.8338, + "loss/crossentropy": 2.350286066532135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2648291736841202, + "step": 4756 + }, + { + "epoch": 0.09516, + "grad_norm": 2.375, + "grad_norm_var": 0.017430623372395832, + "learning_rate": 0.0001, + "loss": 4.5788, + "loss/crossentropy": 2.02384877204895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24653150886297226, + "step": 4758 + }, + { + "epoch": 0.0952, + "grad_norm": 2.203125, + "grad_norm_var": 0.01734619140625, + "learning_rate": 0.0001, + "loss": 4.7052, + "loss/crossentropy": 2.264349341392517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25881223380565643, + "step": 4760 + }, + { + "epoch": 0.09524, + "grad_norm": 2.890625, + "grad_norm_var": 0.03779296875, + "learning_rate": 0.0001, + "loss": 4.8216, + "loss/crossentropy": 2.197356939315796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2565019279718399, + "step": 4762 + }, + { + "epoch": 0.09528, + "grad_norm": 2.515625, + "grad_norm_var": 0.038655598958333336, + "learning_rate": 0.0001, + "loss": 4.3849, + "loss/crossentropy": 1.8794787526130676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21771979331970215, + "step": 4764 + }, + { + "epoch": 0.09532, + "grad_norm": 2.3125, + "grad_norm_var": 0.03795572916666667, + "learning_rate": 0.0001, + "loss": 4.7391, + "loss/crossentropy": 2.107520580291748, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2350333333015442, + "step": 4766 + }, + { + "epoch": 0.09536, + "grad_norm": 2.3125, + "grad_norm_var": 0.03297119140625, + "learning_rate": 0.0001, + "loss": 4.5194, + "loss/crossentropy": 2.082249402999878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24860269576311111, + "step": 4768 + }, + { + "epoch": 0.0954, + "grad_norm": 2.296875, + "grad_norm_var": 0.03062744140625, + "learning_rate": 0.0001, + "loss": 4.3985, + "loss/crossentropy": 1.9632289409637451, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24099770188331604, + "step": 4770 + }, + { + "epoch": 0.09544, + "grad_norm": 2.359375, + "grad_norm_var": 0.02994384765625, + "learning_rate": 0.0001, + "loss": 4.4526, + "loss/crossentropy": 2.3403327465057373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2573448717594147, + "step": 4772 + }, + { + "epoch": 0.09548, + "grad_norm": 2.40625, + "grad_norm_var": 0.03181050618489583, + "learning_rate": 0.0001, + "loss": 4.7711, + "loss/crossentropy": 2.302455425262451, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28050975501537323, + "step": 4774 + }, + { + "epoch": 0.09552, + "grad_norm": 2.28125, + "grad_norm_var": 0.031103515625, + "learning_rate": 0.0001, + "loss": 4.6092, + "loss/crossentropy": 2.2875213623046875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23910009860992432, + "step": 4776 + }, + { + "epoch": 0.09556, + "grad_norm": 2.765625, + "grad_norm_var": 0.022721354166666666, + "learning_rate": 0.0001, + "loss": 4.5237, + "loss/crossentropy": 2.0440263748168945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24640783667564392, + "step": 4778 + }, + { + "epoch": 0.0956, + "grad_norm": 2.53125, + "grad_norm_var": 0.02613525390625, + "learning_rate": 0.0001, + "loss": 4.6328, + "loss/crossentropy": 2.1597142219543457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24566050618886948, + "step": 4780 + }, + { + "epoch": 0.09564, + "grad_norm": 2.203125, + "grad_norm_var": 0.027765909830729168, + "learning_rate": 0.0001, + "loss": 4.7138, + "loss/crossentropy": 2.2846235036849976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2598261833190918, + "step": 4782 + }, + { + "epoch": 0.09568, + "grad_norm": 2.1875, + "grad_norm_var": 0.029683430989583332, + "learning_rate": 0.0001, + "loss": 4.4527, + "loss/crossentropy": 2.032243251800537, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2488306611776352, + "step": 4784 + }, + { + "epoch": 0.09572, + "grad_norm": 2.5, + "grad_norm_var": 0.037230428059895834, + "learning_rate": 0.0001, + "loss": 4.7651, + "loss/crossentropy": 2.4999172687530518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25949685275554657, + "step": 4786 + }, + { + "epoch": 0.09576, + "grad_norm": 2.296875, + "grad_norm_var": 0.0375152587890625, + "learning_rate": 0.0001, + "loss": 4.6068, + "loss/crossentropy": 2.1610575914382935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2414923906326294, + "step": 4788 + }, + { + "epoch": 0.0958, + "grad_norm": 2.390625, + "grad_norm_var": 0.03135477701822917, + "learning_rate": 0.0001, + "loss": 4.5309, + "loss/crossentropy": 1.8679919838905334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21853071451187134, + "step": 4790 + }, + { + "epoch": 0.09584, + "grad_norm": 2.390625, + "grad_norm_var": 0.031281534830729166, + "learning_rate": 0.0001, + "loss": 4.697, + "loss/crossentropy": 2.072615623474121, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24943216145038605, + "step": 4792 + }, + { + "epoch": 0.09588, + "grad_norm": 2.765625, + "grad_norm_var": 0.032624308268229166, + "learning_rate": 0.0001, + "loss": 4.5555, + "loss/crossentropy": 2.420115113258362, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2840816229581833, + "step": 4794 + }, + { + "epoch": 0.09592, + "grad_norm": 2.6875, + "grad_norm_var": 0.0455078125, + "learning_rate": 0.0001, + "loss": 5.102, + "loss/crossentropy": 2.2809360027313232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2739071249961853, + "step": 4796 + }, + { + "epoch": 0.09596, + "grad_norm": 2.328125, + "grad_norm_var": 0.04319559733072917, + "learning_rate": 0.0001, + "loss": 4.5418, + "loss/crossentropy": 2.1809465289115906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24680403620004654, + "step": 4798 + }, + { + "epoch": 0.096, + "grad_norm": 2.15625, + "grad_norm_var": 0.044611612955729164, + "learning_rate": 0.0001, + "loss": 4.6994, + "loss/crossentropy": 1.9710316061973572, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2598233222961426, + "step": 4800 + }, + { + "epoch": 0.09604, + "grad_norm": 2.40625, + "grad_norm_var": 0.035477701822916666, + "learning_rate": 0.0001, + "loss": 4.9811, + "loss/crossentropy": 2.4378572702407837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2665309011936188, + "step": 4802 + }, + { + "epoch": 0.09608, + "grad_norm": 2.390625, + "grad_norm_var": 0.03759358723958333, + "learning_rate": 0.0001, + "loss": 4.604, + "loss/crossentropy": 1.8578800559043884, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24051420390605927, + "step": 4804 + }, + { + "epoch": 0.09612, + "grad_norm": 2.40625, + "grad_norm_var": 0.034520467122395836, + "learning_rate": 0.0001, + "loss": 4.4567, + "loss/crossentropy": 2.191601037979126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25017624348402023, + "step": 4806 + }, + { + "epoch": 0.09616, + "grad_norm": 2.296875, + "grad_norm_var": 0.038374837239583334, + "learning_rate": 0.0001, + "loss": 4.5777, + "loss/crossentropy": 2.2077550888061523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31426818668842316, + "step": 4808 + }, + { + "epoch": 0.0962, + "grad_norm": 2.390625, + "grad_norm_var": 0.030659993489583332, + "learning_rate": 0.0001, + "loss": 4.917, + "loss/crossentropy": 2.2983503341674805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26998060941696167, + "step": 4810 + }, + { + "epoch": 0.09624, + "grad_norm": 2.703125, + "grad_norm_var": 0.020824178059895834, + "learning_rate": 0.0001, + "loss": 4.8691, + "loss/crossentropy": 2.0875505208969116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2673826217651367, + "step": 4812 + }, + { + "epoch": 0.09628, + "grad_norm": 2.03125, + "grad_norm_var": 0.031403605143229166, + "learning_rate": 0.0001, + "loss": 4.0736, + "loss/crossentropy": 1.9439318776130676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2379719465970993, + "step": 4814 + }, + { + "epoch": 0.09632, + "grad_norm": 2.359375, + "grad_norm_var": 0.03611551920572917, + "learning_rate": 0.0001, + "loss": 4.4698, + "loss/crossentropy": 1.9822518229484558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24425340443849564, + "step": 4816 + }, + { + "epoch": 0.09636, + "grad_norm": 2.296875, + "grad_norm_var": 0.03902994791666667, + "learning_rate": 0.0001, + "loss": 4.475, + "loss/crossentropy": 2.0238336324691772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2409110590815544, + "step": 4818 + }, + { + "epoch": 0.0964, + "grad_norm": 2.34375, + "grad_norm_var": 0.03707275390625, + "learning_rate": 0.0001, + "loss": 4.5401, + "loss/crossentropy": 2.454450249671936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26127950847148895, + "step": 4820 + }, + { + "epoch": 0.09644, + "grad_norm": 2.296875, + "grad_norm_var": 0.038407389322916666, + "learning_rate": 0.0001, + "loss": 4.5682, + "loss/crossentropy": 2.210579752922058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23662539571523666, + "step": 4822 + }, + { + "epoch": 0.09648, + "grad_norm": 2.28125, + "grad_norm_var": 0.034601847330729164, + "learning_rate": 0.0001, + "loss": 4.5527, + "loss/crossentropy": 1.8737664222717285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22884630411863327, + "step": 4824 + }, + { + "epoch": 0.09652, + "grad_norm": 2.484375, + "grad_norm_var": 0.0335357666015625, + "learning_rate": 0.0001, + "loss": 4.6221, + "loss/crossentropy": 2.1939562559127808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25145241618156433, + "step": 4826 + }, + { + "epoch": 0.09656, + "grad_norm": 2.328125, + "grad_norm_var": 0.0278472900390625, + "learning_rate": 0.0001, + "loss": 4.6685, + "loss/crossentropy": 2.133625030517578, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25248992443084717, + "step": 4828 + }, + { + "epoch": 0.0966, + "grad_norm": 2.328125, + "grad_norm_var": 0.0197906494140625, + "learning_rate": 0.0001, + "loss": 4.2854, + "loss/crossentropy": 2.259738326072693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24727293848991394, + "step": 4830 + }, + { + "epoch": 0.09664, + "grad_norm": 2.984375, + "grad_norm_var": 0.043635050455729164, + "learning_rate": 0.0001, + "loss": 4.7497, + "loss/crossentropy": 2.0186268091201782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27940231561660767, + "step": 4832 + }, + { + "epoch": 0.09668, + "grad_norm": 2.234375, + "grad_norm_var": 0.04185282389322917, + "learning_rate": 0.0001, + "loss": 4.5867, + "loss/crossentropy": 1.9686395525932312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23021705448627472, + "step": 4834 + }, + { + "epoch": 0.09672, + "grad_norm": 2.296875, + "grad_norm_var": 0.0424224853515625, + "learning_rate": 0.0001, + "loss": 4.6157, + "loss/crossentropy": 2.1677820682525635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2766892910003662, + "step": 4836 + }, + { + "epoch": 0.09676, + "grad_norm": 2.1875, + "grad_norm_var": 0.04670308430989583, + "learning_rate": 0.0001, + "loss": 4.3751, + "loss/crossentropy": 1.8458788990974426, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23294126987457275, + "step": 4838 + }, + { + "epoch": 0.0968, + "grad_norm": 2.640625, + "grad_norm_var": 0.0495269775390625, + "learning_rate": 0.0001, + "loss": 4.8931, + "loss/crossentropy": 1.7898097038269043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23436476290225983, + "step": 4840 + }, + { + "epoch": 0.09684, + "grad_norm": 2.375, + "grad_norm_var": 0.04843343098958333, + "learning_rate": 0.0001, + "loss": 4.5169, + "loss/crossentropy": 2.4594497680664062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2618473023176193, + "step": 4842 + }, + { + "epoch": 0.09688, + "grad_norm": 2.4375, + "grad_norm_var": 0.04752197265625, + "learning_rate": 0.0001, + "loss": 4.4841, + "loss/crossentropy": 2.061558425426483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2523345798254013, + "step": 4844 + }, + { + "epoch": 0.09692, + "grad_norm": 2.484375, + "grad_norm_var": 0.04462890625, + "learning_rate": 0.0001, + "loss": 4.5606, + "loss/crossentropy": 2.060658574104309, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2493506520986557, + "step": 4846 + }, + { + "epoch": 0.09696, + "grad_norm": 2.25, + "grad_norm_var": 0.016169230143229168, + "learning_rate": 0.0001, + "loss": 4.8028, + "loss/crossentropy": 2.201029062271118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2629493921995163, + "step": 4848 + }, + { + "epoch": 0.097, + "grad_norm": 2.34375, + "grad_norm_var": 0.01519775390625, + "learning_rate": 0.0001, + "loss": 4.7404, + "loss/crossentropy": 2.199273705482483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27054519951343536, + "step": 4850 + }, + { + "epoch": 0.09704, + "grad_norm": 2.109375, + "grad_norm_var": 0.0189453125, + "learning_rate": 0.0001, + "loss": 4.3008, + "loss/crossentropy": 2.2466784715652466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2678111642599106, + "step": 4852 + }, + { + "epoch": 0.09708, + "grad_norm": 2.71875, + "grad_norm_var": 0.022054036458333332, + "learning_rate": 0.0001, + "loss": 4.6762, + "loss/crossentropy": 1.9152815341949463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2441672906279564, + "step": 4854 + }, + { + "epoch": 0.09712, + "grad_norm": 2.21875, + "grad_norm_var": 0.017577107747395834, + "learning_rate": 0.0001, + "loss": 4.3818, + "loss/crossentropy": 2.092438578605652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24061349034309387, + "step": 4856 + }, + { + "epoch": 0.09716, + "grad_norm": 2.4375, + "grad_norm_var": 0.01802978515625, + "learning_rate": 0.0001, + "loss": 4.3224, + "loss/crossentropy": 1.8620384335517883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23738765716552734, + "step": 4858 + }, + { + "epoch": 0.0972, + "grad_norm": 2.953125, + "grad_norm_var": 0.04138997395833333, + "learning_rate": 0.0001, + "loss": 4.8798, + "loss/crossentropy": 2.199701428413391, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26521213352680206, + "step": 4860 + }, + { + "epoch": 0.09724, + "grad_norm": 2.53125, + "grad_norm_var": 0.042170206705729164, + "learning_rate": 0.0001, + "loss": 4.7598, + "loss/crossentropy": 2.3637821674346924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2707534506917, + "step": 4862 + }, + { + "epoch": 0.09728, + "grad_norm": 2.21875, + "grad_norm_var": 0.045491536458333336, + "learning_rate": 0.0001, + "loss": 4.5184, + "loss/crossentropy": 2.207235813140869, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28080131858587265, + "step": 4864 + }, + { + "epoch": 0.09732, + "grad_norm": 2.265625, + "grad_norm_var": 0.0466461181640625, + "learning_rate": 0.0001, + "loss": 4.3798, + "loss/crossentropy": 1.901595950126648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23459318280220032, + "step": 4866 + }, + { + "epoch": 0.09736, + "grad_norm": 2.3125, + "grad_norm_var": 0.042769368489583334, + "learning_rate": 0.0001, + "loss": 4.4289, + "loss/crossentropy": 2.298312544822693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2580094337463379, + "step": 4868 + }, + { + "epoch": 0.0974, + "grad_norm": 2.3125, + "grad_norm_var": 0.03658854166666667, + "learning_rate": 0.0001, + "loss": 4.7576, + "loss/crossentropy": 2.29680597782135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24749226868152618, + "step": 4870 + }, + { + "epoch": 0.09744, + "grad_norm": 2.25, + "grad_norm_var": 0.0361968994140625, + "learning_rate": 0.0001, + "loss": 4.5839, + "loss/crossentropy": 2.1169378757476807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2565944790840149, + "step": 4872 + }, + { + "epoch": 0.09748, + "grad_norm": 2.140625, + "grad_norm_var": 0.04035542805989583, + "learning_rate": 0.0001, + "loss": 4.5364, + "loss/crossentropy": 2.0863184928894043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2527791038155556, + "step": 4874 + }, + { + "epoch": 0.09752, + "grad_norm": 2.296875, + "grad_norm_var": 0.015625, + "learning_rate": 0.0001, + "loss": 4.6176, + "loss/crossentropy": 2.146193563938141, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23968011140823364, + "step": 4876 + }, + { + "epoch": 0.09756, + "grad_norm": 2.375, + "grad_norm_var": 0.01060791015625, + "learning_rate": 0.0001, + "loss": 4.5598, + "loss/crossentropy": 2.1125508546829224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2498009204864502, + "step": 4878 + }, + { + "epoch": 0.0976, + "grad_norm": 2.375, + "grad_norm_var": 0.004130045572916667, + "learning_rate": 0.0001, + "loss": 4.772, + "loss/crossentropy": 2.13166081905365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2640175372362137, + "step": 4880 + }, + { + "epoch": 0.09764, + "grad_norm": 2.15625, + "grad_norm_var": 0.0051910400390625, + "learning_rate": 0.0001, + "loss": 4.5276, + "loss/crossentropy": 2.047860622406006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23980721086263657, + "step": 4882 + }, + { + "epoch": 0.09768, + "grad_norm": 2.515625, + "grad_norm_var": 0.010477701822916666, + "learning_rate": 0.0001, + "loss": 4.8519, + "loss/crossentropy": 2.362569808959961, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27908293902873993, + "step": 4884 + }, + { + "epoch": 0.09772, + "grad_norm": 2.109375, + "grad_norm_var": 0.012398274739583333, + "learning_rate": 0.0001, + "loss": 4.2709, + "loss/crossentropy": 1.8781500458717346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22341010719537735, + "step": 4886 + }, + { + "epoch": 0.09776, + "grad_norm": 2.390625, + "grad_norm_var": 0.019807942708333335, + "learning_rate": 0.0001, + "loss": 4.9408, + "loss/crossentropy": 2.1138893365859985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2533891350030899, + "step": 4888 + }, + { + "epoch": 0.0978, + "grad_norm": 2.484375, + "grad_norm_var": 0.0188140869140625, + "learning_rate": 0.0001, + "loss": 4.5981, + "loss/crossentropy": 2.2212090492248535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.279159352183342, + "step": 4890 + }, + { + "epoch": 0.09784, + "grad_norm": 2.453125, + "grad_norm_var": 0.0177886962890625, + "learning_rate": 0.0001, + "loss": 4.7245, + "loss/crossentropy": 1.9747707843780518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24078013002872467, + "step": 4892 + }, + { + "epoch": 0.09788, + "grad_norm": 2.53125, + "grad_norm_var": 0.020015462239583334, + "learning_rate": 0.0001, + "loss": 4.4051, + "loss/crossentropy": 1.8607316613197327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25342857837677, + "step": 4894 + }, + { + "epoch": 0.09792, + "grad_norm": 2.359375, + "grad_norm_var": 0.020963541666666665, + "learning_rate": 0.0001, + "loss": 4.6759, + "loss/crossentropy": 2.245271682739258, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24260863661766052, + "step": 4896 + }, + { + "epoch": 0.09796, + "grad_norm": 2.390625, + "grad_norm_var": 0.017560831705729165, + "learning_rate": 0.0001, + "loss": 4.5006, + "loss/crossentropy": 2.0503702759742737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25429168343544006, + "step": 4898 + }, + { + "epoch": 0.098, + "grad_norm": 2.3125, + "grad_norm_var": 0.017236328125, + "learning_rate": 0.0001, + "loss": 4.421, + "loss/crossentropy": 1.7784077525138855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2249627709388733, + "step": 4900 + }, + { + "epoch": 0.09804, + "grad_norm": 2.390625, + "grad_norm_var": 0.0114898681640625, + "learning_rate": 0.0001, + "loss": 4.5398, + "loss/crossentropy": 1.9827336072921753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22001181542873383, + "step": 4902 + }, + { + "epoch": 0.09808, + "grad_norm": 2.53125, + "grad_norm_var": 1.638996378580729, + "learning_rate": 0.0001, + "loss": 4.8149, + "loss/crossentropy": 2.1003565788269043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2550586014986038, + "step": 4904 + }, + { + "epoch": 0.09812, + "grad_norm": 2.484375, + "grad_norm_var": 1.6366933186848958, + "learning_rate": 0.0001, + "loss": 4.5099, + "loss/crossentropy": 1.9565055966377258, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25481177121400833, + "step": 4906 + }, + { + "epoch": 0.09816, + "grad_norm": 2.234375, + "grad_norm_var": 1.6479777018229167, + "learning_rate": 0.0001, + "loss": 4.6214, + "loss/crossentropy": 2.1693456172943115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23336851596832275, + "step": 4908 + }, + { + "epoch": 0.0982, + "grad_norm": 2.40625, + "grad_norm_var": 1.6428995768229167, + "learning_rate": 0.0001, + "loss": 4.7637, + "loss/crossentropy": 2.050541341304779, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25821831077337265, + "step": 4910 + }, + { + "epoch": 0.09824, + "grad_norm": 2.40625, + "grad_norm_var": 1.62906494140625, + "learning_rate": 0.0001, + "loss": 4.8515, + "loss/crossentropy": 2.168497681617737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2674206495285034, + "step": 4912 + }, + { + "epoch": 0.09828, + "grad_norm": 2.609375, + "grad_norm_var": 1.6097320556640624, + "learning_rate": 0.0001, + "loss": 5.1667, + "loss/crossentropy": 2.147577404975891, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26900771260261536, + "step": 4914 + }, + { + "epoch": 0.09832, + "grad_norm": 2.09375, + "grad_norm_var": 1.6371734619140625, + "learning_rate": 0.0001, + "loss": 4.3958, + "loss/crossentropy": 2.4436198472976685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2504816800355911, + "step": 4916 + }, + { + "epoch": 0.09836, + "grad_norm": 2.25, + "grad_norm_var": 1.6447265625, + "learning_rate": 0.0001, + "loss": 4.7244, + "loss/crossentropy": 2.097359538078308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24005448818206787, + "step": 4918 + }, + { + "epoch": 0.0984, + "grad_norm": 2.078125, + "grad_norm_var": 0.0267974853515625, + "learning_rate": 0.0001, + "loss": 4.2305, + "loss/crossentropy": 1.8467384576797485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22932368516921997, + "step": 4920 + }, + { + "epoch": 0.09844, + "grad_norm": 2.859375, + "grad_norm_var": 0.04263916015625, + "learning_rate": 0.0001, + "loss": 4.4407, + "loss/crossentropy": 2.1454135179519653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2634875178337097, + "step": 4922 + }, + { + "epoch": 0.09848, + "grad_norm": 2.765625, + "grad_norm_var": 0.05164286295572917, + "learning_rate": 0.0001, + "loss": 4.5775, + "loss/crossentropy": 1.9837967157363892, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23980768024921417, + "step": 4924 + }, + { + "epoch": 0.09852, + "grad_norm": 2.265625, + "grad_norm_var": 0.054032389322916666, + "learning_rate": 0.0001, + "loss": 4.6888, + "loss/crossentropy": 2.099667489528656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23482084274291992, + "step": 4926 + }, + { + "epoch": 0.09856, + "grad_norm": 3.125, + "grad_norm_var": 0.08787333170572917, + "learning_rate": 0.0001, + "loss": 4.5685, + "loss/crossentropy": 1.8467332124710083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.256004735827446, + "step": 4928 + }, + { + "epoch": 0.0986, + "grad_norm": 2.53125, + "grad_norm_var": 0.08548075358072917, + "learning_rate": 0.0001, + "loss": 4.4474, + "loss/crossentropy": 2.036003887653351, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2377806007862091, + "step": 4930 + }, + { + "epoch": 0.09864, + "grad_norm": 2.484375, + "grad_norm_var": 0.07834370930989583, + "learning_rate": 0.0001, + "loss": 4.8258, + "loss/crossentropy": 2.091457724571228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24746537953615189, + "step": 4932 + }, + { + "epoch": 0.09868, + "grad_norm": 2.296875, + "grad_norm_var": 0.07929280598958334, + "learning_rate": 0.0001, + "loss": 4.6583, + "loss/crossentropy": 2.245227336883545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24141517281532288, + "step": 4934 + }, + { + "epoch": 0.09872, + "grad_norm": 2.40625, + "grad_norm_var": 0.0661285400390625, + "learning_rate": 0.0001, + "loss": 4.6514, + "loss/crossentropy": 2.295682668685913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28035247325897217, + "step": 4936 + }, + { + "epoch": 0.09876, + "grad_norm": 2.234375, + "grad_norm_var": 0.05524800618489583, + "learning_rate": 0.0001, + "loss": 4.5525, + "loss/crossentropy": 1.9251704812049866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23609354346990585, + "step": 4938 + }, + { + "epoch": 0.0988, + "grad_norm": 2.5, + "grad_norm_var": 0.048314412434895836, + "learning_rate": 0.0001, + "loss": 5.1583, + "loss/crossentropy": 2.289652466773987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30698196589946747, + "step": 4940 + }, + { + "epoch": 0.09884, + "grad_norm": 2.09375, + "grad_norm_var": 0.0534820556640625, + "learning_rate": 0.0001, + "loss": 4.7759, + "loss/crossentropy": 2.3339043855667114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2449272722005844, + "step": 4942 + }, + { + "epoch": 0.09888, + "grad_norm": 2.203125, + "grad_norm_var": 0.01949462890625, + "learning_rate": 0.0001, + "loss": 4.6678, + "loss/crossentropy": 2.0610195994377136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2553517669439316, + "step": 4944 + }, + { + "epoch": 0.09892, + "grad_norm": 2.203125, + "grad_norm_var": 0.018122355143229168, + "learning_rate": 0.0001, + "loss": 4.5849, + "loss/crossentropy": 2.1093825101852417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24767974764108658, + "step": 4946 + }, + { + "epoch": 0.09896, + "grad_norm": 2.328125, + "grad_norm_var": 0.01416015625, + "learning_rate": 0.0001, + "loss": 4.3631, + "loss/crossentropy": 2.1742878556251526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24640457332134247, + "step": 4948 + }, + { + "epoch": 0.099, + "grad_norm": 2.390625, + "grad_norm_var": 0.0144195556640625, + "learning_rate": 0.0001, + "loss": 4.8319, + "loss/crossentropy": 2.4237486124038696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2610916793346405, + "step": 4950 + }, + { + "epoch": 0.09904, + "grad_norm": 2.15625, + "grad_norm_var": 0.015445963541666666, + "learning_rate": 0.0001, + "loss": 4.3397, + "loss/crossentropy": 2.197754144668579, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2275979220867157, + "step": 4952 + }, + { + "epoch": 0.09908, + "grad_norm": 2.21875, + "grad_norm_var": 0.01529541015625, + "learning_rate": 0.0001, + "loss": 4.4533, + "loss/crossentropy": 2.267225503921509, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23391032963991165, + "step": 4954 + }, + { + "epoch": 0.09912, + "grad_norm": 2.265625, + "grad_norm_var": 0.009504191080729167, + "learning_rate": 0.0001, + "loss": 4.47, + "loss/crossentropy": 2.04893159866333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23763196170330048, + "step": 4956 + }, + { + "epoch": 0.09916, + "grad_norm": 2.4375, + "grad_norm_var": 0.010716756184895834, + "learning_rate": 0.0001, + "loss": 4.97, + "loss/crossentropy": 2.4489223957061768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27698180079460144, + "step": 4958 + }, + { + "epoch": 0.0992, + "grad_norm": 2.328125, + "grad_norm_var": 0.011930338541666667, + "learning_rate": 0.0001, + "loss": 4.8604, + "loss/crossentropy": 2.3654375076293945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2495984137058258, + "step": 4960 + }, + { + "epoch": 0.09924, + "grad_norm": 2.328125, + "grad_norm_var": 0.011812337239583333, + "learning_rate": 0.0001, + "loss": 4.7671, + "loss/crossentropy": 1.8583308458328247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21832667291164398, + "step": 4962 + }, + { + "epoch": 0.09928, + "grad_norm": 2.265625, + "grad_norm_var": 0.01177978515625, + "learning_rate": 0.0001, + "loss": 4.4951, + "loss/crossentropy": 2.0762425661087036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2280900850892067, + "step": 4964 + }, + { + "epoch": 0.09932, + "grad_norm": 2.15625, + "grad_norm_var": 0.012532552083333334, + "learning_rate": 0.0001, + "loss": 4.7461, + "loss/crossentropy": 2.228640913963318, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26595622301101685, + "step": 4966 + }, + { + "epoch": 0.09936, + "grad_norm": 2.234375, + "grad_norm_var": 0.0111236572265625, + "learning_rate": 0.0001, + "loss": 4.521, + "loss/crossentropy": 2.1447466611862183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2542327791452408, + "step": 4968 + }, + { + "epoch": 0.0994, + "grad_norm": 2.09375, + "grad_norm_var": 0.01226806640625, + "learning_rate": 0.0001, + "loss": 4.3959, + "loss/crossentropy": 2.0750887989997864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2362435683608055, + "step": 4970 + }, + { + "epoch": 0.09944, + "grad_norm": 2.328125, + "grad_norm_var": 0.009666951497395833, + "learning_rate": 0.0001, + "loss": 4.6705, + "loss/crossentropy": 1.9413353204727173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24418477714061737, + "step": 4972 + }, + { + "epoch": 0.09948, + "grad_norm": 2.265625, + "grad_norm_var": 0.00904541015625, + "learning_rate": 0.0001, + "loss": 4.609, + "loss/crossentropy": 2.102766752243042, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24170882254838943, + "step": 4974 + }, + { + "epoch": 0.09952, + "grad_norm": 2.359375, + "grad_norm_var": 0.0090240478515625, + "learning_rate": 0.0001, + "loss": 4.7568, + "loss/crossentropy": 2.431061267852783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2855495512485504, + "step": 4976 + }, + { + "epoch": 0.09956, + "grad_norm": 2.453125, + "grad_norm_var": 0.010448201497395834, + "learning_rate": 0.0001, + "loss": 4.7449, + "loss/crossentropy": 1.9656312465667725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26410341262817383, + "step": 4978 + }, + { + "epoch": 0.0996, + "grad_norm": 2.578125, + "grad_norm_var": 0.01627197265625, + "learning_rate": 0.0001, + "loss": 5.0286, + "loss/crossentropy": 2.2365923523902893, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27946531772613525, + "step": 4980 + }, + { + "epoch": 0.09964, + "grad_norm": 2.4375, + "grad_norm_var": 0.015184529622395833, + "learning_rate": 0.0001, + "loss": 4.5791, + "loss/crossentropy": 2.203595757484436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25247688591480255, + "step": 4982 + }, + { + "epoch": 0.09968, + "grad_norm": 2.296875, + "grad_norm_var": 0.0163482666015625, + "learning_rate": 0.0001, + "loss": 4.6824, + "loss/crossentropy": 2.1462446451187134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2539386674761772, + "step": 4984 + }, + { + "epoch": 0.09972, + "grad_norm": 2.25, + "grad_norm_var": 0.014127604166666667, + "learning_rate": 0.0001, + "loss": 4.7121, + "loss/crossentropy": 2.4518587589263916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24366125464439392, + "step": 4986 + }, + { + "epoch": 0.09976, + "grad_norm": 2.25, + "grad_norm_var": 0.01402587890625, + "learning_rate": 0.0001, + "loss": 4.5131, + "loss/crossentropy": 2.003869950771332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22874485701322556, + "step": 4988 + }, + { + "epoch": 0.0998, + "grad_norm": 2.328125, + "grad_norm_var": 0.017476399739583332, + "learning_rate": 0.0001, + "loss": 4.3221, + "loss/crossentropy": 2.0251912474632263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2240670546889305, + "step": 4990 + }, + { + "epoch": 0.09984, + "grad_norm": 2.140625, + "grad_norm_var": 0.020166015625, + "learning_rate": 0.0001, + "loss": 4.5297, + "loss/crossentropy": 2.199779748916626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22916750609874725, + "step": 4992 + }, + { + "epoch": 0.09988, + "grad_norm": 2.40625, + "grad_norm_var": 0.0207672119140625, + "learning_rate": 0.0001, + "loss": 4.8066, + "loss/crossentropy": 2.3852288722991943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26489999890327454, + "step": 4994 + }, + { + "epoch": 0.09992, + "grad_norm": 2.359375, + "grad_norm_var": 0.03874409993489583, + "learning_rate": 0.0001, + "loss": 4.7245, + "loss/crossentropy": 1.9446094632148743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24112944304943085, + "step": 4996 + }, + { + "epoch": 0.09996, + "grad_norm": 2.234375, + "grad_norm_var": 0.04269205729166667, + "learning_rate": 0.0001, + "loss": 4.7063, + "loss/crossentropy": 2.585115671157837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26535044610500336, + "step": 4998 + }, + { + "epoch": 0.1, + "grad_norm": 2.8125, + "grad_norm_var": 1.14400634765625, + "learning_rate": 0.0001, + "loss": 4.6848, + "loss/crossentropy": 1.9871427416801453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24747569859027863, + "step": 5000 + }, + { + "epoch": 0.10004, + "grad_norm": 2.359375, + "grad_norm_var": 1.1424967447916667, + "learning_rate": 0.0001, + "loss": 4.6058, + "loss/crossentropy": 1.8981972336769104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24131165444850922, + "step": 5002 + }, + { + "epoch": 0.10008, + "grad_norm": 2.28125, + "grad_norm_var": 1.1522786458333334, + "learning_rate": 0.0001, + "loss": 4.5731, + "loss/crossentropy": 2.323825240135193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2584778293967247, + "step": 5004 + }, + { + "epoch": 0.10012, + "grad_norm": 2.390625, + "grad_norm_var": 1.1363118489583333, + "learning_rate": 0.0001, + "loss": 4.7065, + "loss/crossentropy": 1.728028118610382, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2997850477695465, + "step": 5006 + }, + { + "epoch": 0.10016, + "grad_norm": 2.28125, + "grad_norm_var": 1.12437744140625, + "learning_rate": 0.0001, + "loss": 4.8001, + "loss/crossentropy": 2.1486289501190186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25335805118083954, + "step": 5008 + }, + { + "epoch": 0.1002, + "grad_norm": 2.5, + "grad_norm_var": 1.1099761962890624, + "learning_rate": 0.0001, + "loss": 4.936, + "loss/crossentropy": 2.3132145404815674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26033517718315125, + "step": 5010 + }, + { + "epoch": 0.10024, + "grad_norm": 2.671875, + "grad_norm_var": 1.1134928385416667, + "learning_rate": 0.0001, + "loss": 4.493, + "loss/crossentropy": 1.9233656525611877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2471245899796486, + "step": 5012 + }, + { + "epoch": 0.10028, + "grad_norm": 2.21875, + "grad_norm_var": 1.1301747639973958, + "learning_rate": 0.0001, + "loss": 4.5221, + "loss/crossentropy": 1.9435511827468872, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24730819463729858, + "step": 5014 + }, + { + "epoch": 0.10032, + "grad_norm": 2.1875, + "grad_norm_var": 0.018928019205729167, + "learning_rate": 0.0001, + "loss": 4.3895, + "loss/crossentropy": 2.2031294107437134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23213820159435272, + "step": 5016 + }, + { + "epoch": 0.10036, + "grad_norm": 2.703125, + "grad_norm_var": 0.0276763916015625, + "learning_rate": 0.0001, + "loss": 4.7247, + "loss/crossentropy": 2.285850405693054, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25809091329574585, + "step": 5018 + }, + { + "epoch": 0.1004, + "grad_norm": 2.328125, + "grad_norm_var": 0.025211588541666666, + "learning_rate": 0.0001, + "loss": 4.7697, + "loss/crossentropy": 1.8660435676574707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2436714917421341, + "step": 5020 + }, + { + "epoch": 0.10044, + "grad_norm": 2.421875, + "grad_norm_var": 0.025877888997395834, + "learning_rate": 0.0001, + "loss": 4.3716, + "loss/crossentropy": 2.0659420490264893, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24248096346855164, + "step": 5022 + }, + { + "epoch": 0.10048, + "grad_norm": 2.328125, + "grad_norm_var": 0.030516560872395834, + "learning_rate": 0.0001, + "loss": 4.7093, + "loss/crossentropy": 2.213133215904236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2423061951994896, + "step": 5024 + }, + { + "epoch": 0.10052, + "grad_norm": 2.234375, + "grad_norm_var": 0.0313385009765625, + "learning_rate": 0.0001, + "loss": 4.5288, + "loss/crossentropy": 2.3052343130111694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24757324904203415, + "step": 5026 + }, + { + "epoch": 0.10056, + "grad_norm": 2.796875, + "grad_norm_var": 0.039094034830729166, + "learning_rate": 0.0001, + "loss": 4.8709, + "loss/crossentropy": 2.226990580558777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25892695784568787, + "step": 5028 + }, + { + "epoch": 0.1006, + "grad_norm": 2.71875, + "grad_norm_var": 0.04163004557291667, + "learning_rate": 0.0001, + "loss": 4.9444, + "loss/crossentropy": 2.3460742235183716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27760128676891327, + "step": 5030 + }, + { + "epoch": 0.10064, + "grad_norm": 2.125, + "grad_norm_var": 0.04345703125, + "learning_rate": 0.0001, + "loss": 4.3597, + "loss/crossentropy": 1.9782095551490784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2349972277879715, + "step": 5032 + }, + { + "epoch": 0.10068, + "grad_norm": 2.96875, + "grad_norm_var": 0.06148681640625, + "learning_rate": 0.0001, + "loss": 4.5602, + "loss/crossentropy": 1.847929298877716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22401423752307892, + "step": 5034 + }, + { + "epoch": 0.10072, + "grad_norm": 2.46875, + "grad_norm_var": 0.06073811848958333, + "learning_rate": 0.0001, + "loss": 4.4989, + "loss/crossentropy": 2.172071158885956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2656550332903862, + "step": 5036 + }, + { + "epoch": 0.10076, + "grad_norm": 2.234375, + "grad_norm_var": 0.07026265462239584, + "learning_rate": 0.0001, + "loss": 4.3892, + "loss/crossentropy": 2.3497499227523804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26085225492715836, + "step": 5038 + }, + { + "epoch": 0.1008, + "grad_norm": 2.328125, + "grad_norm_var": 0.07284749348958333, + "learning_rate": 0.0001, + "loss": 4.2583, + "loss/crossentropy": 2.0916348695755005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21129868924617767, + "step": 5040 + }, + { + "epoch": 0.10084, + "grad_norm": 2.59375, + "grad_norm_var": 0.07073160807291666, + "learning_rate": 0.0001, + "loss": 4.8931, + "loss/crossentropy": 2.243077278137207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2657035142183304, + "step": 5042 + }, + { + "epoch": 0.10088, + "grad_norm": 10.0, + "grad_norm_var": 3.600194295247396, + "learning_rate": 0.0001, + "loss": 4.8887, + "loss/crossentropy": 1.9361066222190857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.41255413740873337, + "step": 5044 + }, + { + "epoch": 0.10092, + "grad_norm": 3.703125, + "grad_norm_var": 3.6162760416666666, + "learning_rate": 0.0001, + "loss": 4.7632, + "loss/crossentropy": 2.0139313340187073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2591235190629959, + "step": 5046 + }, + { + "epoch": 0.10096, + "grad_norm": 2.203125, + "grad_norm_var": 3.589704386393229, + "learning_rate": 0.0001, + "loss": 4.6642, + "loss/crossentropy": 2.335710287094116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26457205414772034, + "step": 5048 + }, + { + "epoch": 0.101, + "grad_norm": 2.34375, + "grad_norm_var": 3.604325358072917, + "learning_rate": 0.0001, + "loss": 4.3329, + "loss/crossentropy": 1.9269848465919495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23867928236722946, + "step": 5050 + }, + { + "epoch": 0.10104, + "grad_norm": 2.34375, + "grad_norm_var": 3.627415974934896, + "learning_rate": 0.0001, + "loss": 4.4982, + "loss/crossentropy": 1.9732608795166016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24961821734905243, + "step": 5052 + }, + { + "epoch": 0.10108, + "grad_norm": 2.359375, + "grad_norm_var": 3.6287506103515623, + "learning_rate": 0.0001, + "loss": 4.7218, + "loss/crossentropy": 2.2659696340560913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24580512940883636, + "step": 5054 + }, + { + "epoch": 0.10112, + "grad_norm": 2.84375, + "grad_norm_var": 3.57880859375, + "learning_rate": 0.0001, + "loss": 4.4318, + "loss/crossentropy": 1.6600720882415771, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20283473283052444, + "step": 5056 + }, + { + "epoch": 0.10116, + "grad_norm": 2.375, + "grad_norm_var": 3.602855428059896, + "learning_rate": 0.0001, + "loss": 4.8249, + "loss/crossentropy": 2.0175185799598694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23329314589500427, + "step": 5058 + }, + { + "epoch": 0.1012, + "grad_norm": 2.4375, + "grad_norm_var": 0.146484375, + "learning_rate": 0.0001, + "loss": 4.5132, + "loss/crossentropy": 2.259668231010437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2692077234387398, + "step": 5060 + }, + { + "epoch": 0.10124, + "grad_norm": 2.46875, + "grad_norm_var": 0.02144775390625, + "learning_rate": 0.0001, + "loss": 4.6392, + "loss/crossentropy": 2.260953903198242, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28225430846214294, + "step": 5062 + }, + { + "epoch": 0.10128, + "grad_norm": 2.171875, + "grad_norm_var": 0.021686808268229166, + "learning_rate": 0.0001, + "loss": 4.3624, + "loss/crossentropy": 1.9721892476081848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2336483597755432, + "step": 5064 + }, + { + "epoch": 0.10132, + "grad_norm": 2.40625, + "grad_norm_var": 0.021512858072916665, + "learning_rate": 0.0001, + "loss": 4.6257, + "loss/crossentropy": 2.188947319984436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23959602415561676, + "step": 5066 + }, + { + "epoch": 0.10136, + "grad_norm": 3.078125, + "grad_norm_var": 0.04843343098958333, + "learning_rate": 0.0001, + "loss": 4.5601, + "loss/crossentropy": 1.7914190292358398, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2234276980161667, + "step": 5068 + }, + { + "epoch": 0.1014, + "grad_norm": 2.265625, + "grad_norm_var": 0.049925740559895834, + "learning_rate": 0.0001, + "loss": 4.5806, + "loss/crossentropy": 2.1124663949012756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22523616254329681, + "step": 5070 + }, + { + "epoch": 0.10144, + "grad_norm": 2.359375, + "grad_norm_var": 0.03764546712239583, + "learning_rate": 0.0001, + "loss": 4.7098, + "loss/crossentropy": 2.1146361231803894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22483345866203308, + "step": 5072 + }, + { + "epoch": 0.10148, + "grad_norm": 2.171875, + "grad_norm_var": 0.04121805826822917, + "learning_rate": 0.0001, + "loss": 4.5076, + "loss/crossentropy": 2.335755705833435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.256914846599102, + "step": 5074 + }, + { + "epoch": 0.10152, + "grad_norm": 2.34375, + "grad_norm_var": 0.04346415201822917, + "learning_rate": 0.0001, + "loss": 4.8665, + "loss/crossentropy": 2.2911819219589233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26038119196891785, + "step": 5076 + }, + { + "epoch": 0.10156, + "grad_norm": 2.171875, + "grad_norm_var": 0.047587076822916664, + "learning_rate": 0.0001, + "loss": 4.592, + "loss/crossentropy": 2.059769034385681, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24004006385803223, + "step": 5078 + }, + { + "epoch": 0.1016, + "grad_norm": 2.109375, + "grad_norm_var": 0.04888916015625, + "learning_rate": 0.0001, + "loss": 4.5028, + "loss/crossentropy": 2.141201138496399, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22454539686441422, + "step": 5080 + }, + { + "epoch": 0.10164, + "grad_norm": 2.421875, + "grad_norm_var": 0.050146484375, + "learning_rate": 0.0001, + "loss": 4.7527, + "loss/crossentropy": 1.9538633823394775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24477297067642212, + "step": 5082 + }, + { + "epoch": 0.10168, + "grad_norm": 2.1875, + "grad_norm_var": 0.00992431640625, + "learning_rate": 0.0001, + "loss": 4.4799, + "loss/crossentropy": 2.1555078625679016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2425938919186592, + "step": 5084 + }, + { + "epoch": 0.10172, + "grad_norm": 2.296875, + "grad_norm_var": 0.009235636393229166, + "learning_rate": 0.0001, + "loss": 4.5583, + "loss/crossentropy": 2.2306214570999146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2445632517337799, + "step": 5086 + }, + { + "epoch": 0.10176, + "grad_norm": 2.03125, + "grad_norm_var": 0.010179646809895833, + "learning_rate": 0.0001, + "loss": 4.1075, + "loss/crossentropy": 1.9713392853736877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22239256650209427, + "step": 5088 + }, + { + "epoch": 0.1018, + "grad_norm": 2.234375, + "grad_norm_var": 0.011799112955729166, + "learning_rate": 0.0001, + "loss": 4.5181, + "loss/crossentropy": 1.951128602027893, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24310563504695892, + "step": 5090 + }, + { + "epoch": 0.10184, + "grad_norm": 2.28125, + "grad_norm_var": 0.011051432291666666, + "learning_rate": 0.0001, + "loss": 4.3617, + "loss/crossentropy": 2.0100057125091553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23054375499486923, + "step": 5092 + }, + { + "epoch": 0.10188, + "grad_norm": 2.140625, + "grad_norm_var": 0.011042277018229166, + "learning_rate": 0.0001, + "loss": 4.4573, + "loss/crossentropy": 2.1898789405822754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24956130981445312, + "step": 5094 + }, + { + "epoch": 0.10192, + "grad_norm": 2.21875, + "grad_norm_var": 0.021512858072916665, + "learning_rate": 0.0001, + "loss": 4.6576, + "loss/crossentropy": 1.5666239857673645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20010025054216385, + "step": 5096 + }, + { + "epoch": 0.10196, + "grad_norm": 2.296875, + "grad_norm_var": 0.022468058268229167, + "learning_rate": 0.0001, + "loss": 4.2762, + "loss/crossentropy": 1.884951651096344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2484818547964096, + "step": 5098 + }, + { + "epoch": 0.102, + "grad_norm": 2.34375, + "grad_norm_var": 0.022554524739583335, + "learning_rate": 0.0001, + "loss": 4.5207, + "loss/crossentropy": 1.976080298423767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23380043357610703, + "step": 5100 + }, + { + "epoch": 0.10204, + "grad_norm": 2.375, + "grad_norm_var": 0.0229156494140625, + "learning_rate": 0.0001, + "loss": 4.592, + "loss/crossentropy": 2.1262658834457397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2441401332616806, + "step": 5102 + }, + { + "epoch": 0.10208, + "grad_norm": 2.515625, + "grad_norm_var": 0.08055013020833333, + "learning_rate": 0.0001, + "loss": 4.2735, + "loss/crossentropy": 1.7588757276535034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2101762592792511, + "step": 5104 + }, + { + "epoch": 0.10212, + "grad_norm": 2.578125, + "grad_norm_var": 0.08059488932291667, + "learning_rate": 0.0001, + "loss": 4.6721, + "loss/crossentropy": 2.292783260345459, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26928654313087463, + "step": 5106 + }, + { + "epoch": 0.10216, + "grad_norm": 2.65625, + "grad_norm_var": 0.4834218343098958, + "learning_rate": 0.0001, + "loss": 4.7475, + "loss/crossentropy": 2.0670888423919678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2644767463207245, + "step": 5108 + }, + { + "epoch": 0.1022, + "grad_norm": 2.1875, + "grad_norm_var": 0.4729563395182292, + "learning_rate": 0.0001, + "loss": 4.4113, + "loss/crossentropy": 2.0522598028182983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2410094290971756, + "step": 5110 + }, + { + "epoch": 0.10224, + "grad_norm": 2.359375, + "grad_norm_var": 0.4729075113932292, + "learning_rate": 0.0001, + "loss": 4.4914, + "loss/crossentropy": 2.0756974816322327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27077721059322357, + "step": 5112 + }, + { + "epoch": 0.10228, + "grad_norm": 2.3125, + "grad_norm_var": 0.47226155598958336, + "learning_rate": 0.0001, + "loss": 4.6524, + "loss/crossentropy": 2.1569767594337463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23582034558057785, + "step": 5114 + }, + { + "epoch": 0.10232, + "grad_norm": 2.21875, + "grad_norm_var": 0.4847819010416667, + "learning_rate": 0.0001, + "loss": 4.2821, + "loss/crossentropy": 1.9736077785491943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23984025418758392, + "step": 5116 + }, + { + "epoch": 0.10236, + "grad_norm": 2.3125, + "grad_norm_var": 0.4942698160807292, + "learning_rate": 0.0001, + "loss": 4.3047, + "loss/crossentropy": 2.1400066614151, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2347380667924881, + "step": 5118 + }, + { + "epoch": 0.1024, + "grad_norm": 2.171875, + "grad_norm_var": 0.46923726399739585, + "learning_rate": 0.0001, + "loss": 4.3531, + "loss/crossentropy": 1.989999234676361, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22508147358894348, + "step": 5120 + }, + { + "epoch": 0.10244, + "grad_norm": 2.5, + "grad_norm_var": 0.4704498291015625, + "learning_rate": 0.0001, + "loss": 4.8817, + "loss/crossentropy": 2.132390856742859, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3074522316455841, + "step": 5122 + }, + { + "epoch": 0.10248, + "grad_norm": 2.296875, + "grad_norm_var": 0.011002604166666667, + "learning_rate": 0.0001, + "loss": 4.3606, + "loss/crossentropy": 1.7906856536865234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2126675397157669, + "step": 5124 + }, + { + "epoch": 0.10252, + "grad_norm": 2.265625, + "grad_norm_var": 0.0112457275390625, + "learning_rate": 0.0001, + "loss": 4.7045, + "loss/crossentropy": 2.0576369762420654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27211636304855347, + "step": 5126 + }, + { + "epoch": 0.10256, + "grad_norm": 2.265625, + "grad_norm_var": 0.01099853515625, + "learning_rate": 0.0001, + "loss": 4.5402, + "loss/crossentropy": 2.1174184679985046, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2344469577074051, + "step": 5128 + }, + { + "epoch": 0.1026, + "grad_norm": 2.28125, + "grad_norm_var": 0.0090972900390625, + "learning_rate": 0.0001, + "loss": 4.7227, + "loss/crossentropy": 1.9139717817306519, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2362786829471588, + "step": 5130 + }, + { + "epoch": 0.10264, + "grad_norm": 2.3125, + "grad_norm_var": 0.008495076497395834, + "learning_rate": 0.0001, + "loss": 4.4801, + "loss/crossentropy": 2.022357940673828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2307513877749443, + "step": 5132 + }, + { + "epoch": 0.10268, + "grad_norm": 2.203125, + "grad_norm_var": 0.007721964518229167, + "learning_rate": 0.0001, + "loss": 4.3963, + "loss/crossentropy": 2.038477897644043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22871223092079163, + "step": 5134 + }, + { + "epoch": 0.10272, + "grad_norm": 2.421875, + "grad_norm_var": 0.007079060872395833, + "learning_rate": 0.0001, + "loss": 4.7283, + "loss/crossentropy": 2.0895442962646484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23719681799411774, + "step": 5136 + }, + { + "epoch": 0.10276, + "grad_norm": 2.5625, + "grad_norm_var": 0.008820597330729167, + "learning_rate": 0.0001, + "loss": 4.7059, + "loss/crossentropy": 2.17978835105896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2519204765558243, + "step": 5138 + }, + { + "epoch": 0.1028, + "grad_norm": 2.46875, + "grad_norm_var": 0.00953369140625, + "learning_rate": 0.0001, + "loss": 4.7318, + "loss/crossentropy": 2.19089937210083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24376338720321655, + "step": 5140 + }, + { + "epoch": 0.10284, + "grad_norm": 2.515625, + "grad_norm_var": 0.011002604166666667, + "learning_rate": 0.0001, + "loss": 4.3961, + "loss/crossentropy": 2.018259823322296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23707401752471924, + "step": 5142 + }, + { + "epoch": 0.10288, + "grad_norm": 2.484375, + "grad_norm_var": 0.012287394205729166, + "learning_rate": 0.0001, + "loss": 4.7556, + "loss/crossentropy": 2.110401153564453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23409561812877655, + "step": 5144 + }, + { + "epoch": 0.10292, + "grad_norm": 2.390625, + "grad_norm_var": 0.012239583333333333, + "learning_rate": 0.0001, + "loss": 4.615, + "loss/crossentropy": 2.2096832990646362, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2580249160528183, + "step": 5146 + }, + { + "epoch": 0.10296, + "grad_norm": 2.09375, + "grad_norm_var": 0.015803019205729168, + "learning_rate": 0.0001, + "loss": 4.5218, + "loss/crossentropy": 2.1825822591781616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23592937737703323, + "step": 5148 + }, + { + "epoch": 0.103, + "grad_norm": 2.359375, + "grad_norm_var": 0.01451416015625, + "learning_rate": 0.0001, + "loss": 4.6945, + "loss/crossentropy": 2.2440234422683716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23741164803504944, + "step": 5150 + }, + { + "epoch": 0.10304, + "grad_norm": 2.4375, + "grad_norm_var": 0.015071614583333334, + "learning_rate": 0.0001, + "loss": 4.962, + "loss/crossentropy": 2.3818799257278442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2642097622156143, + "step": 5152 + }, + { + "epoch": 0.10308, + "grad_norm": 2.296875, + "grad_norm_var": 0.014378865559895834, + "learning_rate": 0.0001, + "loss": 4.5706, + "loss/crossentropy": 2.1905806064605713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26945509016513824, + "step": 5154 + }, + { + "epoch": 0.10312, + "grad_norm": 2.140625, + "grad_norm_var": 0.017671712239583335, + "learning_rate": 0.0001, + "loss": 4.5678, + "loss/crossentropy": 2.096913695335388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2540033459663391, + "step": 5156 + }, + { + "epoch": 0.10316, + "grad_norm": 2.375, + "grad_norm_var": 0.015721638997395832, + "learning_rate": 0.0001, + "loss": 4.5286, + "loss/crossentropy": 1.7916489243507385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22504088282585144, + "step": 5158 + }, + { + "epoch": 0.1032, + "grad_norm": 2.34375, + "grad_norm_var": 0.014240519205729166, + "learning_rate": 0.0001, + "loss": 4.6256, + "loss/crossentropy": 1.9366079568862915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25474467873573303, + "step": 5160 + }, + { + "epoch": 0.10324, + "grad_norm": 2.40625, + "grad_norm_var": 0.014533487955729167, + "learning_rate": 0.0001, + "loss": 4.5054, + "loss/crossentropy": 2.0233771800994873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23394355177879333, + "step": 5162 + }, + { + "epoch": 0.10328, + "grad_norm": 2.234375, + "grad_norm_var": 0.010693359375, + "learning_rate": 0.0001, + "loss": 4.7803, + "loss/crossentropy": 2.442312717437744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28390438854694366, + "step": 5164 + }, + { + "epoch": 0.10332, + "grad_norm": 2.3125, + "grad_norm_var": 0.014435831705729167, + "learning_rate": 0.0001, + "loss": 4.7849, + "loss/crossentropy": 1.9547526836395264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2298036813735962, + "step": 5166 + }, + { + "epoch": 0.10336, + "grad_norm": 2.25, + "grad_norm_var": 0.018766276041666665, + "learning_rate": 0.0001, + "loss": 4.3225, + "loss/crossentropy": 1.7974739074707031, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21959447860717773, + "step": 5168 + }, + { + "epoch": 0.1034, + "grad_norm": 2.328125, + "grad_norm_var": 0.020075480143229168, + "learning_rate": 0.0001, + "loss": 4.5438, + "loss/crossentropy": 1.9860564470291138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24661653488874435, + "step": 5170 + }, + { + "epoch": 0.10344, + "grad_norm": 2.34375, + "grad_norm_var": 0.019050089518229167, + "learning_rate": 0.0001, + "loss": 4.5084, + "loss/crossentropy": 1.6198940873146057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20592768490314484, + "step": 5172 + }, + { + "epoch": 0.10348, + "grad_norm": 2.25, + "grad_norm_var": 0.019554646809895833, + "learning_rate": 0.0001, + "loss": 4.4831, + "loss/crossentropy": 2.193474531173706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24437790364027023, + "step": 5174 + }, + { + "epoch": 0.10352, + "grad_norm": 2.5, + "grad_norm_var": 0.022557576497395832, + "learning_rate": 0.0001, + "loss": 4.9209, + "loss/crossentropy": 2.221992254257202, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2931511402130127, + "step": 5176 + }, + { + "epoch": 0.10356, + "grad_norm": 2.28125, + "grad_norm_var": 0.022272745768229168, + "learning_rate": 0.0001, + "loss": 4.4611, + "loss/crossentropy": 2.006419837474823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2335921749472618, + "step": 5178 + }, + { + "epoch": 0.1036, + "grad_norm": 2.171875, + "grad_norm_var": 0.022175089518229166, + "learning_rate": 0.0001, + "loss": 4.4477, + "loss/crossentropy": 2.2861804962158203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2549041658639908, + "step": 5180 + }, + { + "epoch": 0.10364, + "grad_norm": 2.171875, + "grad_norm_var": 0.014378865559895834, + "learning_rate": 0.0001, + "loss": 4.4079, + "loss/crossentropy": 2.217998743057251, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24741299450397491, + "step": 5182 + }, + { + "epoch": 0.10368, + "grad_norm": 2.328125, + "grad_norm_var": 0.0123931884765625, + "learning_rate": 0.0001, + "loss": 4.7344, + "loss/crossentropy": 2.1875526905059814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23636415600776672, + "step": 5184 + }, + { + "epoch": 0.10372, + "grad_norm": 2.078125, + "grad_norm_var": 0.014850870768229166, + "learning_rate": 0.0001, + "loss": 4.4629, + "loss/crossentropy": 1.8408135175704956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23278112709522247, + "step": 5186 + }, + { + "epoch": 0.10376, + "grad_norm": 2.359375, + "grad_norm_var": 0.014188639322916667, + "learning_rate": 0.0001, + "loss": 4.6848, + "loss/crossentropy": 1.7936646342277527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23859456181526184, + "step": 5188 + }, + { + "epoch": 0.1038, + "grad_norm": 2.21875, + "grad_norm_var": 0.014058430989583334, + "learning_rate": 0.0001, + "loss": 4.3825, + "loss/crossentropy": 2.0800318717956543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23155340552330017, + "step": 5190 + }, + { + "epoch": 0.10384, + "grad_norm": 2.265625, + "grad_norm_var": 0.0066802978515625, + "learning_rate": 0.0001, + "loss": 4.9645, + "loss/crossentropy": 2.277778387069702, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25029345601797104, + "step": 5192 + }, + { + "epoch": 0.10388, + "grad_norm": 2.09375, + "grad_norm_var": 0.0081695556640625, + "learning_rate": 0.0001, + "loss": 4.2486, + "loss/crossentropy": 1.9658478498458862, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22655482590198517, + "step": 5194 + }, + { + "epoch": 0.10392, + "grad_norm": 2.21875, + "grad_norm_var": 0.01060791015625, + "learning_rate": 0.0001, + "loss": 4.7503, + "loss/crossentropy": 2.214509129524231, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.254493810236454, + "step": 5196 + }, + { + "epoch": 0.10396, + "grad_norm": 2.21875, + "grad_norm_var": 0.0102203369140625, + "learning_rate": 0.0001, + "loss": 4.3648, + "loss/crossentropy": 1.9465742707252502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23288530111312866, + "step": 5198 + }, + { + "epoch": 0.104, + "grad_norm": 2.21875, + "grad_norm_var": 0.010277303059895833, + "learning_rate": 0.0001, + "loss": 4.3216, + "loss/crossentropy": 2.062779188156128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21423730999231339, + "step": 5200 + }, + { + "epoch": 0.10404, + "grad_norm": 2.390625, + "grad_norm_var": 0.010350545247395834, + "learning_rate": 0.0001, + "loss": 4.1645, + "loss/crossentropy": 1.777470588684082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22998665273189545, + "step": 5202 + }, + { + "epoch": 0.10408, + "grad_norm": 2.46875, + "grad_norm_var": 0.05396728515625, + "learning_rate": 0.0001, + "loss": 4.8979, + "loss/crossentropy": 2.2505980730056763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30352361500263214, + "step": 5204 + }, + { + "epoch": 0.10412, + "grad_norm": 3.359375, + "grad_norm_var": 0.12148335774739584, + "learning_rate": 0.0001, + "loss": 4.4093, + "loss/crossentropy": 1.8989517092704773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2210940569639206, + "step": 5206 + }, + { + "epoch": 0.10416, + "grad_norm": 2.734375, + "grad_norm_var": 0.12923177083333334, + "learning_rate": 0.0001, + "loss": 4.5535, + "loss/crossentropy": 2.378798723220825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25575730204582214, + "step": 5208 + }, + { + "epoch": 0.1042, + "grad_norm": 2.34375, + "grad_norm_var": 0.11901041666666666, + "learning_rate": 0.0001, + "loss": 4.6763, + "loss/crossentropy": 1.7642006278038025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22105304896831512, + "step": 5210 + }, + { + "epoch": 0.10424, + "grad_norm": 2.375, + "grad_norm_var": 0.11607666015625, + "learning_rate": 0.0001, + "loss": 4.8945, + "loss/crossentropy": 2.188746988773346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24192160367965698, + "step": 5212 + }, + { + "epoch": 0.10428, + "grad_norm": 2.3125, + "grad_norm_var": 0.11298421223958334, + "learning_rate": 0.0001, + "loss": 4.6001, + "loss/crossentropy": 2.116630494594574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2879187613725662, + "step": 5214 + }, + { + "epoch": 0.10432, + "grad_norm": 2.15625, + "grad_norm_var": 0.11013895670572917, + "learning_rate": 0.0001, + "loss": 4.4932, + "loss/crossentropy": 1.8329599499702454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21425092220306396, + "step": 5216 + }, + { + "epoch": 0.10436, + "grad_norm": 2.296875, + "grad_norm_var": 0.1064849853515625, + "learning_rate": 0.0001, + "loss": 4.3966, + "loss/crossentropy": 2.1063259840011597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23704984784126282, + "step": 5218 + }, + { + "epoch": 0.1044, + "grad_norm": 2.25, + "grad_norm_var": 0.08238525390625, + "learning_rate": 0.0001, + "loss": 4.4821, + "loss/crossentropy": 1.7994996309280396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21408653259277344, + "step": 5220 + }, + { + "epoch": 0.10444, + "grad_norm": 2.65625, + "grad_norm_var": 0.0281158447265625, + "learning_rate": 0.0001, + "loss": 4.2437, + "loss/crossentropy": 1.880006492137909, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24362730979919434, + "step": 5222 + }, + { + "epoch": 0.10448, + "grad_norm": 2.25, + "grad_norm_var": 0.017853800455729166, + "learning_rate": 0.0001, + "loss": 4.2726, + "loss/crossentropy": 2.010268449783325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22580985724925995, + "step": 5224 + }, + { + "epoch": 0.10452, + "grad_norm": 2.3125, + "grad_norm_var": 0.0177886962890625, + "learning_rate": 0.0001, + "loss": 4.6839, + "loss/crossentropy": 2.057162046432495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23878254741430283, + "step": 5226 + }, + { + "epoch": 0.10456, + "grad_norm": 2.21875, + "grad_norm_var": 0.019603474934895834, + "learning_rate": 0.0001, + "loss": 4.3774, + "loss/crossentropy": 2.0615930557250977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22890077531337738, + "step": 5228 + }, + { + "epoch": 0.1046, + "grad_norm": 2.40625, + "grad_norm_var": 0.020817057291666666, + "learning_rate": 0.0001, + "loss": 4.7693, + "loss/crossentropy": 1.9013472199440002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22539222240447998, + "step": 5230 + }, + { + "epoch": 0.10464, + "grad_norm": 2.296875, + "grad_norm_var": 0.019482421875, + "learning_rate": 0.0001, + "loss": 4.5531, + "loss/crossentropy": 2.239185929298401, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24706681817770004, + "step": 5232 + }, + { + "epoch": 0.10468, + "grad_norm": 2.234375, + "grad_norm_var": 0.020540364583333335, + "learning_rate": 0.0001, + "loss": 4.3902, + "loss/crossentropy": 1.9881523251533508, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24434638023376465, + "step": 5234 + }, + { + "epoch": 0.10472, + "grad_norm": 2.3125, + "grad_norm_var": 0.020524088541666666, + "learning_rate": 0.0001, + "loss": 4.7908, + "loss/crossentropy": 1.9529212713241577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2321469634771347, + "step": 5236 + }, + { + "epoch": 0.10476, + "grad_norm": 2.40625, + "grad_norm_var": 0.01041259765625, + "learning_rate": 0.0001, + "loss": 4.8045, + "loss/crossentropy": 2.196424722671509, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24054434895515442, + "step": 5238 + }, + { + "epoch": 0.1048, + "grad_norm": 2.234375, + "grad_norm_var": 0.009781901041666667, + "learning_rate": 0.0001, + "loss": 4.5895, + "loss/crossentropy": 2.082987070083618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2760557308793068, + "step": 5240 + }, + { + "epoch": 0.10484, + "grad_norm": 2.359375, + "grad_norm_var": 0.011652628580729166, + "learning_rate": 0.0001, + "loss": 4.5961, + "loss/crossentropy": 2.2369720935821533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25305214524269104, + "step": 5242 + }, + { + "epoch": 0.10488, + "grad_norm": 2.1875, + "grad_norm_var": 0.0119537353515625, + "learning_rate": 0.0001, + "loss": 4.7268, + "loss/crossentropy": 2.3372031450271606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26765232533216476, + "step": 5244 + }, + { + "epoch": 0.10492, + "grad_norm": 2.3125, + "grad_norm_var": 0.0113677978515625, + "learning_rate": 0.0001, + "loss": 4.6148, + "loss/crossentropy": 2.23944628238678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26508912444114685, + "step": 5246 + }, + { + "epoch": 0.10496, + "grad_norm": 2.21875, + "grad_norm_var": 0.012105305989583334, + "learning_rate": 0.0001, + "loss": 4.3453, + "loss/crossentropy": 2.2701858282089233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2551077604293823, + "step": 5248 + }, + { + "epoch": 0.105, + "grad_norm": 2.234375, + "grad_norm_var": 0.010887654622395833, + "learning_rate": 0.0001, + "loss": 4.6455, + "loss/crossentropy": 2.293464779853821, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25016437470912933, + "step": 5250 + }, + { + "epoch": 0.10504, + "grad_norm": 2.46875, + "grad_norm_var": 0.0109527587890625, + "learning_rate": 0.0001, + "loss": 4.5798, + "loss/crossentropy": 2.3072171211242676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27824972569942474, + "step": 5252 + }, + { + "epoch": 0.10508, + "grad_norm": 2.453125, + "grad_norm_var": 0.010498046875, + "learning_rate": 0.0001, + "loss": 4.5948, + "loss/crossentropy": 1.9855756759643555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2312464788556099, + "step": 5254 + }, + { + "epoch": 0.10512, + "grad_norm": 2.3125, + "grad_norm_var": 0.009129842122395834, + "learning_rate": 0.0001, + "loss": 4.8104, + "loss/crossentropy": 1.9584077596664429, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2386016771197319, + "step": 5256 + }, + { + "epoch": 0.10516, + "grad_norm": 2.265625, + "grad_norm_var": 0.010741170247395833, + "learning_rate": 0.0001, + "loss": 4.6103, + "loss/crossentropy": 2.2184669375419617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25914129614830017, + "step": 5258 + }, + { + "epoch": 0.1052, + "grad_norm": 2.1875, + "grad_norm_var": 0.0122222900390625, + "learning_rate": 0.0001, + "loss": 4.1221, + "loss/crossentropy": 1.6798554062843323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2037271112203598, + "step": 5260 + }, + { + "epoch": 0.10524, + "grad_norm": 2.203125, + "grad_norm_var": 0.011995442708333333, + "learning_rate": 0.0001, + "loss": 4.2718, + "loss/crossentropy": 1.8675006031990051, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21148693561553955, + "step": 5262 + }, + { + "epoch": 0.10528, + "grad_norm": 2.390625, + "grad_norm_var": 0.012336222330729167, + "learning_rate": 0.0001, + "loss": 4.5136, + "loss/crossentropy": 1.9439855813980103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24611759185791016, + "step": 5264 + }, + { + "epoch": 0.10532, + "grad_norm": 2.171875, + "grad_norm_var": 0.013923136393229167, + "learning_rate": 0.0001, + "loss": 4.5298, + "loss/crossentropy": 2.1550748348236084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2428945228457451, + "step": 5266 + }, + { + "epoch": 0.10536, + "grad_norm": 2.34375, + "grad_norm_var": 0.0117584228515625, + "learning_rate": 0.0001, + "loss": 4.6777, + "loss/crossentropy": 1.9924054741859436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21866007149219513, + "step": 5268 + }, + { + "epoch": 0.1054, + "grad_norm": 2.40625, + "grad_norm_var": 0.011571248372395834, + "learning_rate": 0.0001, + "loss": 4.5359, + "loss/crossentropy": 2.0413920879364014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2579498365521431, + "step": 5270 + }, + { + "epoch": 0.10544, + "grad_norm": 2.578125, + "grad_norm_var": 0.016380818684895833, + "learning_rate": 0.0001, + "loss": 4.6406, + "loss/crossentropy": 2.062632381916046, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24655035883188248, + "step": 5272 + }, + { + "epoch": 0.10548, + "grad_norm": 2.421875, + "grad_norm_var": 0.021833292643229165, + "learning_rate": 0.0001, + "loss": 4.8133, + "loss/crossentropy": 2.0620386600494385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3041655272245407, + "step": 5274 + }, + { + "epoch": 0.10552, + "grad_norm": 2.234375, + "grad_norm_var": 0.016657511393229168, + "learning_rate": 0.0001, + "loss": 4.4775, + "loss/crossentropy": 1.966421365737915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22333864122629166, + "step": 5276 + }, + { + "epoch": 0.10556, + "grad_norm": 2.25, + "grad_norm_var": 0.017650349934895834, + "learning_rate": 0.0001, + "loss": 4.4323, + "loss/crossentropy": 1.9120238423347473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22857370972633362, + "step": 5278 + }, + { + "epoch": 0.1056, + "grad_norm": 2.40625, + "grad_norm_var": 0.019701131184895835, + "learning_rate": 0.0001, + "loss": 4.5179, + "loss/crossentropy": 2.084389805793762, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2556762397289276, + "step": 5280 + }, + { + "epoch": 0.10564, + "grad_norm": 2.328125, + "grad_norm_var": 0.019758097330729165, + "learning_rate": 0.0001, + "loss": 4.4552, + "loss/crossentropy": 1.8707188367843628, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22955116629600525, + "step": 5282 + }, + { + "epoch": 0.10568, + "grad_norm": 2.265625, + "grad_norm_var": 0.019775390625, + "learning_rate": 0.0001, + "loss": 4.3538, + "loss/crossentropy": 1.8243692517280579, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21088172495365143, + "step": 5284 + }, + { + "epoch": 0.10572, + "grad_norm": 2.359375, + "grad_norm_var": 0.0192535400390625, + "learning_rate": 0.0001, + "loss": 4.5046, + "loss/crossentropy": 2.111305356025696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2595943957567215, + "step": 5286 + }, + { + "epoch": 0.10576, + "grad_norm": 2.15625, + "grad_norm_var": 0.015697224934895834, + "learning_rate": 0.0001, + "loss": 4.4598, + "loss/crossentropy": 2.3729283809661865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2632211595773697, + "step": 5288 + }, + { + "epoch": 0.1058, + "grad_norm": 2.140625, + "grad_norm_var": 0.007249959309895833, + "learning_rate": 0.0001, + "loss": 4.6256, + "loss/crossentropy": 2.3542696237564087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25645585358142853, + "step": 5290 + }, + { + "epoch": 0.10584, + "grad_norm": 2.4375, + "grad_norm_var": 0.01002197265625, + "learning_rate": 0.0001, + "loss": 4.4863, + "loss/crossentropy": 2.0140068531036377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23033145815134048, + "step": 5292 + }, + { + "epoch": 0.10588, + "grad_norm": 2.375, + "grad_norm_var": 0.0099761962890625, + "learning_rate": 0.0001, + "loss": 4.8126, + "loss/crossentropy": 1.9499077796936035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21458172798156738, + "step": 5294 + }, + { + "epoch": 0.10592, + "grad_norm": 2.25, + "grad_norm_var": 0.0084625244140625, + "learning_rate": 0.0001, + "loss": 4.479, + "loss/crossentropy": 2.1434344053268433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2552379444241524, + "step": 5296 + }, + { + "epoch": 0.10596, + "grad_norm": 2.359375, + "grad_norm_var": 0.007819620768229167, + "learning_rate": 0.0001, + "loss": 4.7149, + "loss/crossentropy": 1.9951340556144714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21683495491743088, + "step": 5298 + }, + { + "epoch": 0.106, + "grad_norm": 2.5625, + "grad_norm_var": 0.02330322265625, + "learning_rate": 0.0001, + "loss": 4.7855, + "loss/crossentropy": 2.380235195159912, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2756577730178833, + "step": 5300 + }, + { + "epoch": 0.10604, + "grad_norm": 2.3125, + "grad_norm_var": 0.02330322265625, + "learning_rate": 0.0001, + "loss": 4.8596, + "loss/crossentropy": 2.298241972923279, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2598598450422287, + "step": 5302 + }, + { + "epoch": 0.10608, + "grad_norm": 2.5, + "grad_norm_var": 0.032136027018229166, + "learning_rate": 0.0001, + "loss": 4.8615, + "loss/crossentropy": 2.093233823776245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23809552192687988, + "step": 5304 + }, + { + "epoch": 0.10612, + "grad_norm": 2.15625, + "grad_norm_var": 0.031493123372395834, + "learning_rate": 0.0001, + "loss": 4.5271, + "loss/crossentropy": 2.0901564955711365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24874016642570496, + "step": 5306 + }, + { + "epoch": 0.10616, + "grad_norm": 2.453125, + "grad_norm_var": 0.0304107666015625, + "learning_rate": 0.0001, + "loss": 4.3193, + "loss/crossentropy": 1.8029736280441284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20973137766122818, + "step": 5308 + }, + { + "epoch": 0.1062, + "grad_norm": 2.25, + "grad_norm_var": 0.03316650390625, + "learning_rate": 0.0001, + "loss": 4.4255, + "loss/crossentropy": 2.4068437814712524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25310443341732025, + "step": 5310 + }, + { + "epoch": 0.10624, + "grad_norm": 2.265625, + "grad_norm_var": 0.031029256184895833, + "learning_rate": 0.0001, + "loss": 4.4499, + "loss/crossentropy": 2.1125503182411194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2455870360136032, + "step": 5312 + }, + { + "epoch": 0.10628, + "grad_norm": 2.234375, + "grad_norm_var": 0.0323150634765625, + "learning_rate": 0.0001, + "loss": 4.4752, + "loss/crossentropy": 2.0995737314224243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2418016716837883, + "step": 5314 + }, + { + "epoch": 0.10632, + "grad_norm": 2.234375, + "grad_norm_var": 0.0212890625, + "learning_rate": 0.0001, + "loss": 4.5873, + "loss/crossentropy": 1.8753212690353394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24125799536705017, + "step": 5316 + }, + { + "epoch": 0.10636, + "grad_norm": 2.390625, + "grad_norm_var": 0.021451822916666665, + "learning_rate": 0.0001, + "loss": 4.7269, + "loss/crossentropy": 2.0175408720970154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23516137897968292, + "step": 5318 + }, + { + "epoch": 0.1064, + "grad_norm": 2.109375, + "grad_norm_var": 0.00914306640625, + "learning_rate": 0.0001, + "loss": 4.4953, + "loss/crossentropy": 2.2671592235565186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2356845736503601, + "step": 5320 + }, + { + "epoch": 0.10644, + "grad_norm": 2.140625, + "grad_norm_var": 0.009468587239583333, + "learning_rate": 0.0001, + "loss": 4.5328, + "loss/crossentropy": 2.142452359199524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26422591507434845, + "step": 5322 + }, + { + "epoch": 0.10648, + "grad_norm": 2.3125, + "grad_norm_var": 0.007225545247395834, + "learning_rate": 0.0001, + "loss": 4.4328, + "loss/crossentropy": 1.9664896726608276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25856246054172516, + "step": 5324 + }, + { + "epoch": 0.10652, + "grad_norm": 2.203125, + "grad_norm_var": 0.007222493489583333, + "learning_rate": 0.0001, + "loss": 4.5531, + "loss/crossentropy": 2.168110191822052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23274105042219162, + "step": 5326 + }, + { + "epoch": 0.10656, + "grad_norm": 2.21875, + "grad_norm_var": 0.00552978515625, + "learning_rate": 0.0001, + "loss": 4.5242, + "loss/crossentropy": 2.006514251232147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2532104551792145, + "step": 5328 + }, + { + "epoch": 0.1066, + "grad_norm": 2.296875, + "grad_norm_var": 0.0070220947265625, + "learning_rate": 0.0001, + "loss": 4.5593, + "loss/crossentropy": 2.462701439857483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2771739661693573, + "step": 5330 + }, + { + "epoch": 0.10664, + "grad_norm": 2.203125, + "grad_norm_var": 0.0076324462890625, + "learning_rate": 0.0001, + "loss": 4.4076, + "loss/crossentropy": 2.0889209508895874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2479858472943306, + "step": 5332 + }, + { + "epoch": 0.10668, + "grad_norm": 2.25, + "grad_norm_var": 0.008356730143229166, + "learning_rate": 0.0001, + "loss": 4.329, + "loss/crossentropy": 1.8056100606918335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22194840013980865, + "step": 5334 + }, + { + "epoch": 0.10672, + "grad_norm": 2.171875, + "grad_norm_var": 0.0075185139973958336, + "learning_rate": 0.0001, + "loss": 4.703, + "loss/crossentropy": 2.32460880279541, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28185322880744934, + "step": 5336 + }, + { + "epoch": 0.10676, + "grad_norm": 2.140625, + "grad_norm_var": 0.0075185139973958336, + "learning_rate": 0.0001, + "loss": 4.6388, + "loss/crossentropy": 2.238978862762451, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24314726889133453, + "step": 5338 + }, + { + "epoch": 0.1068, + "grad_norm": 2.0625, + "grad_norm_var": 0.00914306640625, + "learning_rate": 0.0001, + "loss": 4.4161, + "loss/crossentropy": 1.8914734721183777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20767460763454437, + "step": 5340 + }, + { + "epoch": 0.10684, + "grad_norm": 2.328125, + "grad_norm_var": 0.010445149739583333, + "learning_rate": 0.0001, + "loss": 4.5628, + "loss/crossentropy": 1.9704068899154663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23617815226316452, + "step": 5342 + }, + { + "epoch": 0.10688, + "grad_norm": 2.40625, + "grad_norm_var": 0.013374837239583333, + "learning_rate": 0.0001, + "loss": 4.214, + "loss/crossentropy": 1.8539690971374512, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2218247577548027, + "step": 5344 + }, + { + "epoch": 0.10692, + "grad_norm": 2.28125, + "grad_norm_var": 0.012596638997395833, + "learning_rate": 0.0001, + "loss": 4.6077, + "loss/crossentropy": 1.982038140296936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23828908801078796, + "step": 5346 + }, + { + "epoch": 0.10696, + "grad_norm": 2.28125, + "grad_norm_var": 0.013313802083333333, + "learning_rate": 0.0001, + "loss": 4.2879, + "loss/crossentropy": 1.8247870802879333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22487633675336838, + "step": 5348 + }, + { + "epoch": 0.107, + "grad_norm": 2.1875, + "grad_norm_var": 0.011253865559895833, + "learning_rate": 0.0001, + "loss": 4.382, + "loss/crossentropy": 2.0704214572906494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24332743138074875, + "step": 5350 + }, + { + "epoch": 0.10704, + "grad_norm": 2.5, + "grad_norm_var": 0.013509114583333334, + "learning_rate": 0.0001, + "loss": 4.709, + "loss/crossentropy": 2.037345290184021, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2515557184815407, + "step": 5352 + }, + { + "epoch": 0.10708, + "grad_norm": 2.609375, + "grad_norm_var": 0.020702107747395834, + "learning_rate": 0.0001, + "loss": 4.5427, + "loss/crossentropy": 2.0561426877975464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24030926823616028, + "step": 5354 + }, + { + "epoch": 0.10712, + "grad_norm": 2.3125, + "grad_norm_var": 0.017943318684895834, + "learning_rate": 0.0001, + "loss": 4.3108, + "loss/crossentropy": 1.6871100068092346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2526541203260422, + "step": 5356 + }, + { + "epoch": 0.10716, + "grad_norm": 2.390625, + "grad_norm_var": 0.01978759765625, + "learning_rate": 0.0001, + "loss": 4.5238, + "loss/crossentropy": 2.0133201479911804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23663413524627686, + "step": 5358 + }, + { + "epoch": 0.1072, + "grad_norm": 2.34375, + "grad_norm_var": 0.0170562744140625, + "learning_rate": 0.0001, + "loss": 4.557, + "loss/crossentropy": 2.0627574920654297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24395380914211273, + "step": 5360 + }, + { + "epoch": 0.10724, + "grad_norm": 2.140625, + "grad_norm_var": 0.020929972330729168, + "learning_rate": 0.0001, + "loss": 4.5495, + "loss/crossentropy": 2.280818462371826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2806383967399597, + "step": 5362 + }, + { + "epoch": 0.10728, + "grad_norm": 2.5, + "grad_norm_var": 0.022684733072916668, + "learning_rate": 0.0001, + "loss": 4.4743, + "loss/crossentropy": 2.002636671066284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23986798524856567, + "step": 5364 + }, + { + "epoch": 0.10732, + "grad_norm": 2.453125, + "grad_norm_var": 0.022298177083333332, + "learning_rate": 0.0001, + "loss": 4.6816, + "loss/crossentropy": 2.042721927165985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23199205100536346, + "step": 5366 + }, + { + "epoch": 0.10736, + "grad_norm": 2.25, + "grad_norm_var": 0.021540323893229168, + "learning_rate": 0.0001, + "loss": 4.3225, + "loss/crossentropy": 2.1047908663749695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23987916857004166, + "step": 5368 + }, + { + "epoch": 0.1074, + "grad_norm": 2.1875, + "grad_norm_var": 0.014774576822916666, + "learning_rate": 0.0001, + "loss": 4.4827, + "loss/crossentropy": 2.0513075590133667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.242512047290802, + "step": 5370 + }, + { + "epoch": 0.10744, + "grad_norm": 2.4375, + "grad_norm_var": 0.015913899739583334, + "learning_rate": 0.0001, + "loss": 4.4452, + "loss/crossentropy": 1.9151215553283691, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23007020354270935, + "step": 5372 + }, + { + "epoch": 0.10748, + "grad_norm": 2.125, + "grad_norm_var": 0.019303385416666666, + "learning_rate": 0.0001, + "loss": 4.2344, + "loss/crossentropy": 1.8759313821792603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2068658247590065, + "step": 5374 + }, + { + "epoch": 0.10752, + "grad_norm": 2.46875, + "grad_norm_var": 0.020807902018229168, + "learning_rate": 0.0001, + "loss": 4.4667, + "loss/crossentropy": 2.1500572562217712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24392293393611908, + "step": 5376 + }, + { + "epoch": 0.10756, + "grad_norm": 2.109375, + "grad_norm_var": 0.018355305989583334, + "learning_rate": 0.0001, + "loss": 4.0917, + "loss/crossentropy": 1.6089633703231812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20190145075321198, + "step": 5378 + }, + { + "epoch": 0.1076, + "grad_norm": 2.1875, + "grad_norm_var": 0.014452107747395833, + "learning_rate": 0.0001, + "loss": 4.6521, + "loss/crossentropy": 2.1967561841011047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2749984338879585, + "step": 5380 + }, + { + "epoch": 0.10764, + "grad_norm": 2.171875, + "grad_norm_var": 0.011311848958333334, + "learning_rate": 0.0001, + "loss": 4.0843, + "loss/crossentropy": 1.8293656706809998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20650158822536469, + "step": 5382 + }, + { + "epoch": 0.10768, + "grad_norm": 2.125, + "grad_norm_var": 0.011847941080729167, + "learning_rate": 0.0001, + "loss": 4.3441, + "loss/crossentropy": 2.3964673280715942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2814205437898636, + "step": 5384 + }, + { + "epoch": 0.10772, + "grad_norm": 2.4375, + "grad_norm_var": 0.021842447916666667, + "learning_rate": 0.0001, + "loss": 4.8872, + "loss/crossentropy": 2.4995274543762207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28704003244638443, + "step": 5386 + }, + { + "epoch": 0.10776, + "grad_norm": 2.296875, + "grad_norm_var": 0.019462076822916667, + "learning_rate": 0.0001, + "loss": 4.6066, + "loss/crossentropy": 2.0650060176849365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23606212437152863, + "step": 5388 + }, + { + "epoch": 0.1078, + "grad_norm": 2.203125, + "grad_norm_var": 0.01640625, + "learning_rate": 0.0001, + "loss": 4.3723, + "loss/crossentropy": 2.3049341440200806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23550046980381012, + "step": 5390 + }, + { + "epoch": 0.10784, + "grad_norm": 2.21875, + "grad_norm_var": 0.013505045572916667, + "learning_rate": 0.0001, + "loss": 4.813, + "loss/crossentropy": 2.2687143087387085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2854095697402954, + "step": 5392 + }, + { + "epoch": 0.10788, + "grad_norm": 2.28125, + "grad_norm_var": 0.014557902018229167, + "learning_rate": 0.0001, + "loss": 4.6267, + "loss/crossentropy": 2.029325544834137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23444775491952896, + "step": 5394 + }, + { + "epoch": 0.10792, + "grad_norm": 2.3125, + "grad_norm_var": 0.013895670572916666, + "learning_rate": 0.0001, + "loss": 4.5214, + "loss/crossentropy": 2.2012031078338623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27986764907836914, + "step": 5396 + }, + { + "epoch": 0.10796, + "grad_norm": 2.3125, + "grad_norm_var": 0.012626139322916667, + "learning_rate": 0.0001, + "loss": 4.65, + "loss/crossentropy": 2.2396020889282227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24839117377996445, + "step": 5398 + }, + { + "epoch": 0.108, + "grad_norm": 2.3125, + "grad_norm_var": 0.009566243489583333, + "learning_rate": 0.0001, + "loss": 4.4634, + "loss/crossentropy": 2.1481886506080627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2545444592833519, + "step": 5400 + }, + { + "epoch": 0.10804, + "grad_norm": 2.21875, + "grad_norm_var": 0.006005859375, + "learning_rate": 0.0001, + "loss": 4.6109, + "loss/crossentropy": 1.9799351692199707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23296385258436203, + "step": 5402 + }, + { + "epoch": 0.10808, + "grad_norm": 2.34375, + "grad_norm_var": 0.006180826822916667, + "learning_rate": 0.0001, + "loss": 4.5612, + "loss/crossentropy": 1.845237910747528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2316332384943962, + "step": 5404 + }, + { + "epoch": 0.10812, + "grad_norm": 2.234375, + "grad_norm_var": 0.005692545572916667, + "learning_rate": 0.0001, + "loss": 4.4825, + "loss/crossentropy": 2.078865647315979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.251836359500885, + "step": 5406 + }, + { + "epoch": 0.10816, + "grad_norm": 2.171875, + "grad_norm_var": 0.006245930989583333, + "learning_rate": 0.0001, + "loss": 4.4409, + "loss/crossentropy": 2.031971752643585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23658733069896698, + "step": 5408 + }, + { + "epoch": 0.1082, + "grad_norm": 2.21875, + "grad_norm_var": 0.00504150390625, + "learning_rate": 0.0001, + "loss": 4.3034, + "loss/crossentropy": 1.8173908591270447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2169174626469612, + "step": 5410 + }, + { + "epoch": 0.10824, + "grad_norm": 2.25, + "grad_norm_var": 0.0086090087890625, + "learning_rate": 0.0001, + "loss": 4.838, + "loss/crossentropy": 2.2501285672187805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.240458145737648, + "step": 5412 + }, + { + "epoch": 0.10828, + "grad_norm": 2.484375, + "grad_norm_var": 0.015168253580729167, + "learning_rate": 0.0001, + "loss": 4.5449, + "loss/crossentropy": 2.256573438644409, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26129382848739624, + "step": 5414 + }, + { + "epoch": 0.10832, + "grad_norm": 2.359375, + "grad_norm_var": 0.0153717041015625, + "learning_rate": 0.0001, + "loss": 4.7704, + "loss/crossentropy": 2.2014705538749695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2334136888384819, + "step": 5416 + }, + { + "epoch": 0.10836, + "grad_norm": 2.34375, + "grad_norm_var": 0.013016764322916667, + "learning_rate": 0.0001, + "loss": 4.4046, + "loss/crossentropy": 1.8590609431266785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2271011471748352, + "step": 5418 + }, + { + "epoch": 0.1084, + "grad_norm": 2.40625, + "grad_norm_var": 0.017378743489583334, + "learning_rate": 0.0001, + "loss": 4.9419, + "loss/crossentropy": 2.2923961877822876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25833888351917267, + "step": 5420 + }, + { + "epoch": 0.10844, + "grad_norm": 2.21875, + "grad_norm_var": 0.017723592122395833, + "learning_rate": 0.0001, + "loss": 4.4535, + "loss/crossentropy": 2.1932299733161926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26209479570388794, + "step": 5422 + }, + { + "epoch": 0.10848, + "grad_norm": 2.390625, + "grad_norm_var": 0.016185506184895834, + "learning_rate": 0.0001, + "loss": 4.7057, + "loss/crossentropy": 2.3909924030303955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2525275945663452, + "step": 5424 + }, + { + "epoch": 0.10852, + "grad_norm": 2.296875, + "grad_norm_var": 0.011324055989583333, + "learning_rate": 0.0001, + "loss": 4.7509, + "loss/crossentropy": 2.423817992210388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2635423541069031, + "step": 5426 + }, + { + "epoch": 0.10856, + "grad_norm": 2.34375, + "grad_norm_var": 0.009227498372395834, + "learning_rate": 0.0001, + "loss": 4.7082, + "loss/crossentropy": 1.9641632437705994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2460889369249344, + "step": 5428 + }, + { + "epoch": 0.1086, + "grad_norm": 2.34375, + "grad_norm_var": 0.0077707926432291664, + "learning_rate": 0.0001, + "loss": 4.6347, + "loss/crossentropy": 2.027769148349762, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2357894703745842, + "step": 5430 + }, + { + "epoch": 0.10864, + "grad_norm": 2.53125, + "grad_norm_var": 0.0128570556640625, + "learning_rate": 0.0001, + "loss": 4.4833, + "loss/crossentropy": 2.122319996356964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2445123866200447, + "step": 5432 + }, + { + "epoch": 0.10868, + "grad_norm": 3.53125, + "grad_norm_var": 0.0993072509765625, + "learning_rate": 0.0001, + "loss": 4.6332, + "loss/crossentropy": 1.8631052374839783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23456327617168427, + "step": 5434 + }, + { + "epoch": 0.10872, + "grad_norm": 2.59375, + "grad_norm_var": 0.1000885009765625, + "learning_rate": 0.0001, + "loss": 4.6022, + "loss/crossentropy": 2.184281885623932, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24073782563209534, + "step": 5436 + }, + { + "epoch": 0.10876, + "grad_norm": 2.453125, + "grad_norm_var": 0.09521077473958334, + "learning_rate": 0.0001, + "loss": 4.7912, + "loss/crossentropy": 1.9587833881378174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22247321158647537, + "step": 5438 + }, + { + "epoch": 0.1088, + "grad_norm": 2.296875, + "grad_norm_var": 0.09562886555989583, + "learning_rate": 0.0001, + "loss": 4.5185, + "loss/crossentropy": 2.334655284881592, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24817728251218796, + "step": 5440 + }, + { + "epoch": 0.10884, + "grad_norm": 2.109375, + "grad_norm_var": 0.10161031087239583, + "learning_rate": 0.0001, + "loss": 4.3817, + "loss/crossentropy": 2.1424371004104614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2622353136539459, + "step": 5442 + }, + { + "epoch": 0.10888, + "grad_norm": 2.203125, + "grad_norm_var": 0.10598958333333333, + "learning_rate": 0.0001, + "loss": 4.5876, + "loss/crossentropy": 2.0363662242889404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2450125440955162, + "step": 5444 + }, + { + "epoch": 0.10892, + "grad_norm": 2.1875, + "grad_norm_var": 0.110791015625, + "learning_rate": 0.0001, + "loss": 4.4015, + "loss/crossentropy": 2.0536006689071655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23283181339502335, + "step": 5446 + }, + { + "epoch": 0.10896, + "grad_norm": 2.234375, + "grad_norm_var": 0.10741780598958334, + "learning_rate": 0.0001, + "loss": 4.5194, + "loss/crossentropy": 2.2678059339523315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24513862282037735, + "step": 5448 + }, + { + "epoch": 0.109, + "grad_norm": 2.203125, + "grad_norm_var": 0.02115478515625, + "learning_rate": 0.0001, + "loss": 4.7404, + "loss/crossentropy": 2.406686782836914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2901146113872528, + "step": 5450 + }, + { + "epoch": 0.10904, + "grad_norm": 2.296875, + "grad_norm_var": 0.017801920572916668, + "learning_rate": 0.0001, + "loss": 4.4724, + "loss/crossentropy": 2.352605938911438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25215400755405426, + "step": 5452 + }, + { + "epoch": 0.10908, + "grad_norm": 2.203125, + "grad_norm_var": 0.014546712239583334, + "learning_rate": 0.0001, + "loss": 4.5593, + "loss/crossentropy": 1.9139850735664368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22682765871286392, + "step": 5454 + }, + { + "epoch": 0.10912, + "grad_norm": 2.171875, + "grad_norm_var": 0.01441650390625, + "learning_rate": 0.0001, + "loss": 4.6163, + "loss/crossentropy": 2.3240445852279663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24574412405490875, + "step": 5456 + }, + { + "epoch": 0.10916, + "grad_norm": 2.21875, + "grad_norm_var": 0.0123687744140625, + "learning_rate": 0.0001, + "loss": 4.3356, + "loss/crossentropy": 1.9347040057182312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2159302830696106, + "step": 5458 + }, + { + "epoch": 0.1092, + "grad_norm": 2.21875, + "grad_norm_var": 0.012360636393229167, + "learning_rate": 0.0001, + "loss": 4.6539, + "loss/crossentropy": 1.8933109641075134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.284725621342659, + "step": 5460 + }, + { + "epoch": 0.10924, + "grad_norm": 2.15625, + "grad_norm_var": 0.0129058837890625, + "learning_rate": 0.0001, + "loss": 4.2488, + "loss/crossentropy": 2.3611297607421875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2651517689228058, + "step": 5462 + }, + { + "epoch": 0.10928, + "grad_norm": 2.140625, + "grad_norm_var": 0.0235504150390625, + "learning_rate": 0.0001, + "loss": 4.4564, + "loss/crossentropy": 1.828608751296997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2337760180234909, + "step": 5464 + }, + { + "epoch": 0.10932, + "grad_norm": 2.140625, + "grad_norm_var": 0.015412394205729167, + "learning_rate": 0.0001, + "loss": 4.321, + "loss/crossentropy": 2.1374374628067017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2525453567504883, + "step": 5466 + }, + { + "epoch": 0.10936, + "grad_norm": 2.125, + "grad_norm_var": 0.0150543212890625, + "learning_rate": 0.0001, + "loss": 4.5307, + "loss/crossentropy": 1.8054441213607788, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2252344712615013, + "step": 5468 + }, + { + "epoch": 0.1094, + "grad_norm": 2.421875, + "grad_norm_var": 0.022972615559895833, + "learning_rate": 0.0001, + "loss": 4.616, + "loss/crossentropy": 2.1468498706817627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2604861631989479, + "step": 5470 + }, + { + "epoch": 0.10944, + "grad_norm": 2.359375, + "grad_norm_var": 0.022684733072916668, + "learning_rate": 0.0001, + "loss": 4.7298, + "loss/crossentropy": 2.2180548906326294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2603686898946762, + "step": 5472 + }, + { + "epoch": 0.10948, + "grad_norm": 2.265625, + "grad_norm_var": 0.022261555989583334, + "learning_rate": 0.0001, + "loss": 4.5263, + "loss/crossentropy": 1.9773708581924438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23676057159900665, + "step": 5474 + }, + { + "epoch": 0.10952, + "grad_norm": 2.40625, + "grad_norm_var": 0.023485310872395835, + "learning_rate": 0.0001, + "loss": 4.6156, + "loss/crossentropy": 1.9277283549308777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22822558879852295, + "step": 5476 + }, + { + "epoch": 0.10956, + "grad_norm": 2.34375, + "grad_norm_var": 0.022484334309895833, + "learning_rate": 0.0001, + "loss": 4.5529, + "loss/crossentropy": 2.0625431537628174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2565220817923546, + "step": 5478 + }, + { + "epoch": 0.1096, + "grad_norm": 2.25, + "grad_norm_var": 0.01441650390625, + "learning_rate": 0.0001, + "loss": 4.7956, + "loss/crossentropy": 2.383894443511963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2563806623220444, + "step": 5480 + }, + { + "epoch": 0.10964, + "grad_norm": 2.171875, + "grad_norm_var": 0.0120758056640625, + "learning_rate": 0.0001, + "loss": 4.5226, + "loss/crossentropy": 2.409442663192749, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2637571170926094, + "step": 5482 + }, + { + "epoch": 0.10968, + "grad_norm": 2.203125, + "grad_norm_var": 0.0151763916015625, + "learning_rate": 0.0001, + "loss": 4.743, + "loss/crossentropy": 2.1789854764938354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23846548050642014, + "step": 5484 + }, + { + "epoch": 0.10972, + "grad_norm": 2.265625, + "grad_norm_var": 0.011839803059895833, + "learning_rate": 0.0001, + "loss": 4.4108, + "loss/crossentropy": 2.127842903137207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2622772455215454, + "step": 5486 + }, + { + "epoch": 0.10976, + "grad_norm": 2.1875, + "grad_norm_var": 0.01412353515625, + "learning_rate": 0.0001, + "loss": 4.6032, + "loss/crossentropy": 2.107556462287903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24714312702417374, + "step": 5488 + }, + { + "epoch": 0.1098, + "grad_norm": 2.578125, + "grad_norm_var": 0.022459920247395834, + "learning_rate": 0.0001, + "loss": 4.6525, + "loss/crossentropy": 2.1959601640701294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23568396270275116, + "step": 5490 + }, + { + "epoch": 0.10984, + "grad_norm": 2.171875, + "grad_norm_var": 0.021826171875, + "learning_rate": 0.0001, + "loss": 4.5584, + "loss/crossentropy": 2.1246761083602905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24083472788333893, + "step": 5492 + }, + { + "epoch": 0.10988, + "grad_norm": 2.15625, + "grad_norm_var": 0.025031534830729167, + "learning_rate": 0.0001, + "loss": 4.6436, + "loss/crossentropy": 2.091724157333374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2527346611022949, + "step": 5494 + }, + { + "epoch": 0.10992, + "grad_norm": 2.109375, + "grad_norm_var": 0.026056925455729168, + "learning_rate": 0.0001, + "loss": 4.3207, + "loss/crossentropy": 1.8898470997810364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20916878432035446, + "step": 5496 + }, + { + "epoch": 0.10996, + "grad_norm": 2.21875, + "grad_norm_var": 0.025951131184895834, + "learning_rate": 0.0001, + "loss": 4.399, + "loss/crossentropy": 2.1901716589927673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2409309297800064, + "step": 5498 + }, + { + "epoch": 0.11, + "grad_norm": 2.234375, + "grad_norm_var": 0.017626953125, + "learning_rate": 0.0001, + "loss": 4.6897, + "loss/crossentropy": 2.1018574237823486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24559657275676727, + "step": 5500 + }, + { + "epoch": 0.11004, + "grad_norm": 2.1875, + "grad_norm_var": 0.019449869791666668, + "learning_rate": 0.0001, + "loss": 3.8159, + "loss/crossentropy": 2.0575350522994995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23721858859062195, + "step": 5502 + }, + { + "epoch": 0.11008, + "grad_norm": 2.15625, + "grad_norm_var": 0.019710286458333334, + "learning_rate": 0.0001, + "loss": 4.6347, + "loss/crossentropy": 2.1846336126327515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2378462702035904, + "step": 5504 + }, + { + "epoch": 0.11012, + "grad_norm": 2.4375, + "grad_norm_var": 0.014574178059895833, + "learning_rate": 0.0001, + "loss": 4.4028, + "loss/crossentropy": 2.1359363198280334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26211391389369965, + "step": 5506 + }, + { + "epoch": 0.11016, + "grad_norm": 2.265625, + "grad_norm_var": 0.01416015625, + "learning_rate": 0.0001, + "loss": 4.5621, + "loss/crossentropy": 2.236825942993164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2557792589068413, + "step": 5508 + }, + { + "epoch": 0.1102, + "grad_norm": 2.25, + "grad_norm_var": 0.009521484375, + "learning_rate": 0.0001, + "loss": 4.3234, + "loss/crossentropy": 2.3140580654144287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2554958164691925, + "step": 5510 + }, + { + "epoch": 0.11024, + "grad_norm": 2.140625, + "grad_norm_var": 0.00904541015625, + "learning_rate": 0.0001, + "loss": 4.4382, + "loss/crossentropy": 1.7190355062484741, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19964009523391724, + "step": 5512 + }, + { + "epoch": 0.11028, + "grad_norm": 2.203125, + "grad_norm_var": 0.014631144205729167, + "learning_rate": 0.0001, + "loss": 4.3831, + "loss/crossentropy": 1.8326427340507507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2199154868721962, + "step": 5514 + }, + { + "epoch": 0.11032, + "grad_norm": 2.109375, + "grad_norm_var": 0.01549072265625, + "learning_rate": 0.0001, + "loss": 4.4494, + "loss/crossentropy": 1.9013121724128723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20293578505516052, + "step": 5516 + }, + { + "epoch": 0.11036, + "grad_norm": 2.1875, + "grad_norm_var": 0.0138671875, + "learning_rate": 0.0001, + "loss": 4.7056, + "loss/crossentropy": 2.0221983790397644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23435519635677338, + "step": 5518 + }, + { + "epoch": 0.1104, + "grad_norm": 2.359375, + "grad_norm_var": 0.0140533447265625, + "learning_rate": 0.0001, + "loss": 4.6157, + "loss/crossentropy": 2.153970956802368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22127598524093628, + "step": 5520 + }, + { + "epoch": 0.11044, + "grad_norm": 2.125, + "grad_norm_var": 0.010835774739583333, + "learning_rate": 0.0001, + "loss": 4.4516, + "loss/crossentropy": 1.8674496412277222, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.232261061668396, + "step": 5522 + }, + { + "epoch": 0.11048, + "grad_norm": 2.078125, + "grad_norm_var": 0.012495930989583333, + "learning_rate": 0.0001, + "loss": 4.6528, + "loss/crossentropy": 2.1575759649276733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22218701988458633, + "step": 5524 + }, + { + "epoch": 0.11052, + "grad_norm": 2.25, + "grad_norm_var": 0.012434895833333333, + "learning_rate": 0.0001, + "loss": 4.5553, + "loss/crossentropy": 2.054452419281006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25137215852737427, + "step": 5526 + }, + { + "epoch": 0.11056, + "grad_norm": 2.296875, + "grad_norm_var": 0.022639973958333334, + "learning_rate": 0.0001, + "loss": 4.4789, + "loss/crossentropy": 1.966478705406189, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23289234936237335, + "step": 5528 + }, + { + "epoch": 0.1106, + "grad_norm": 2.234375, + "grad_norm_var": 0.017626953125, + "learning_rate": 0.0001, + "loss": 4.687, + "loss/crossentropy": 2.171034336090088, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23373593389987946, + "step": 5530 + }, + { + "epoch": 0.11064, + "grad_norm": 1.953125, + "grad_norm_var": 0.022362263997395833, + "learning_rate": 0.0001, + "loss": 4.142, + "loss/crossentropy": 2.2416292428970337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24835532158613205, + "step": 5532 + }, + { + "epoch": 0.11068, + "grad_norm": 2.53125, + "grad_norm_var": 0.02720947265625, + "learning_rate": 0.0001, + "loss": 4.6252, + "loss/crossentropy": 2.2599780559539795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24948878586292267, + "step": 5534 + }, + { + "epoch": 0.11072, + "grad_norm": 2.34375, + "grad_norm_var": 0.028902180989583335, + "learning_rate": 0.0001, + "loss": 4.6984, + "loss/crossentropy": 2.2292014360427856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26616473495960236, + "step": 5536 + }, + { + "epoch": 0.11076, + "grad_norm": 2.375, + "grad_norm_var": 0.027424112955729166, + "learning_rate": 0.0001, + "loss": 4.336, + "loss/crossentropy": 1.9285388588905334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22964124381542206, + "step": 5538 + }, + { + "epoch": 0.1108, + "grad_norm": 2.171875, + "grad_norm_var": 0.0251953125, + "learning_rate": 0.0001, + "loss": 4.2861, + "loss/crossentropy": 1.864789366722107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2264304906129837, + "step": 5540 + }, + { + "epoch": 0.11084, + "grad_norm": 2.234375, + "grad_norm_var": 0.0250885009765625, + "learning_rate": 0.0001, + "loss": 4.5163, + "loss/crossentropy": 1.8676912188529968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21919699758291245, + "step": 5542 + }, + { + "epoch": 0.11088, + "grad_norm": 2.234375, + "grad_norm_var": 0.0190338134765625, + "learning_rate": 0.0001, + "loss": 4.3612, + "loss/crossentropy": 2.34523469209671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25638002157211304, + "step": 5544 + }, + { + "epoch": 0.11092, + "grad_norm": 2.09375, + "grad_norm_var": 0.020992024739583334, + "learning_rate": 0.0001, + "loss": 4.5165, + "loss/crossentropy": 2.2903120517730713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25111711025238037, + "step": 5546 + }, + { + "epoch": 0.11096, + "grad_norm": 2.15625, + "grad_norm_var": 0.017671712239583335, + "learning_rate": 0.0001, + "loss": 4.5336, + "loss/crossentropy": 2.2106658220291138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22340595722198486, + "step": 5548 + }, + { + "epoch": 0.111, + "grad_norm": 2.203125, + "grad_norm_var": 0.013700358072916667, + "learning_rate": 0.0001, + "loss": 4.6305, + "loss/crossentropy": 2.0777581334114075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2446284517645836, + "step": 5550 + }, + { + "epoch": 0.11104, + "grad_norm": 2.234375, + "grad_norm_var": 0.010758463541666667, + "learning_rate": 0.0001, + "loss": 4.5507, + "loss/crossentropy": 2.131237506866455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24113191664218903, + "step": 5552 + }, + { + "epoch": 0.11108, + "grad_norm": 2.578125, + "grad_norm_var": 0.018778483072916668, + "learning_rate": 0.0001, + "loss": 4.6337, + "loss/crossentropy": 2.190987467765808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2528213635087013, + "step": 5554 + }, + { + "epoch": 0.11112, + "grad_norm": 2.25, + "grad_norm_var": 0.019527180989583334, + "learning_rate": 0.0001, + "loss": 4.4889, + "loss/crossentropy": 2.26843523979187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27781064808368683, + "step": 5556 + }, + { + "epoch": 0.11116, + "grad_norm": 2.25, + "grad_norm_var": 0.019527180989583334, + "learning_rate": 0.0001, + "loss": 4.2242, + "loss/crossentropy": 1.9507999420166016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23172564804553986, + "step": 5558 + }, + { + "epoch": 0.1112, + "grad_norm": 2.28125, + "grad_norm_var": 0.016966756184895834, + "learning_rate": 0.0001, + "loss": 4.4821, + "loss/crossentropy": 2.0738234519958496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24497026205062866, + "step": 5560 + }, + { + "epoch": 0.11124, + "grad_norm": 2.515625, + "grad_norm_var": 0.019261678059895832, + "learning_rate": 0.0001, + "loss": 4.9501, + "loss/crossentropy": 2.273179054260254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2917838394641876, + "step": 5562 + }, + { + "epoch": 0.11128, + "grad_norm": 2.359375, + "grad_norm_var": 0.017899576822916666, + "learning_rate": 0.0001, + "loss": 4.774, + "loss/crossentropy": 2.085157036781311, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2488701120018959, + "step": 5564 + }, + { + "epoch": 0.11132, + "grad_norm": 3.109375, + "grad_norm_var": 0.05950113932291667, + "learning_rate": 0.0001, + "loss": 4.2869, + "loss/crossentropy": 2.0528116822242737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24717991054058075, + "step": 5566 + }, + { + "epoch": 0.11136, + "grad_norm": 7.0, + "grad_norm_var": 1.3981597900390625, + "learning_rate": 0.0001, + "loss": 4.4443, + "loss/crossentropy": 2.0651500821113586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26806148886680603, + "step": 5568 + }, + { + "epoch": 0.1114, + "grad_norm": 2.328125, + "grad_norm_var": 1.4721588134765624, + "learning_rate": 0.0001, + "loss": 4.6162, + "loss/crossentropy": 2.1860616207122803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23045460134744644, + "step": 5570 + }, + { + "epoch": 0.11144, + "grad_norm": 2.328125, + "grad_norm_var": 1.460399373372396, + "learning_rate": 0.0001, + "loss": 4.3629, + "loss/crossentropy": 1.6931262016296387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20816385000944138, + "step": 5572 + }, + { + "epoch": 0.11148, + "grad_norm": 2.28125, + "grad_norm_var": 1.4581858317057292, + "learning_rate": 0.0001, + "loss": 4.3376, + "loss/crossentropy": 2.199341118335724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2490597665309906, + "step": 5574 + }, + { + "epoch": 0.11152, + "grad_norm": 2.484375, + "grad_norm_var": 1.4561513264973958, + "learning_rate": 0.0001, + "loss": 4.6627, + "loss/crossentropy": 2.2010069489479065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24095547199249268, + "step": 5576 + }, + { + "epoch": 0.11156, + "grad_norm": 2.234375, + "grad_norm_var": 1.4810129801432292, + "learning_rate": 0.0001, + "loss": 4.5243, + "loss/crossentropy": 1.9907150864601135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2122703418135643, + "step": 5578 + }, + { + "epoch": 0.1116, + "grad_norm": 2.21875, + "grad_norm_var": 1.5023508707682292, + "learning_rate": 0.0001, + "loss": 4.2618, + "loss/crossentropy": 2.196335554122925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25111766904592514, + "step": 5580 + }, + { + "epoch": 0.11164, + "grad_norm": 2.171875, + "grad_norm_var": 1.5003000895182292, + "learning_rate": 0.0001, + "loss": 4.5104, + "loss/crossentropy": 2.0762988924980164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23773670941591263, + "step": 5582 + }, + { + "epoch": 0.11168, + "grad_norm": 2.140625, + "grad_norm_var": 0.15530192057291667, + "learning_rate": 0.0001, + "loss": 4.3166, + "loss/crossentropy": 2.0803143978118896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22162869572639465, + "step": 5584 + }, + { + "epoch": 0.11172, + "grad_norm": 2.3125, + "grad_norm_var": 0.0084136962890625, + "learning_rate": 0.0001, + "loss": 4.5596, + "loss/crossentropy": 2.1821994185447693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26280316710472107, + "step": 5586 + }, + { + "epoch": 0.11176, + "grad_norm": 2.171875, + "grad_norm_var": 0.011847941080729167, + "learning_rate": 0.0001, + "loss": 4.4328, + "loss/crossentropy": 2.1899439096450806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24677179753780365, + "step": 5588 + }, + { + "epoch": 0.1118, + "grad_norm": 2.21875, + "grad_norm_var": 0.0116851806640625, + "learning_rate": 0.0001, + "loss": 4.4361, + "loss/crossentropy": 2.334734559059143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2481135129928589, + "step": 5590 + }, + { + "epoch": 0.11184, + "grad_norm": 2.34375, + "grad_norm_var": 0.007291666666666667, + "learning_rate": 0.0001, + "loss": 4.7838, + "loss/crossentropy": 2.2976341247558594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2454354390501976, + "step": 5592 + }, + { + "epoch": 0.11188, + "grad_norm": 2.3125, + "grad_norm_var": 0.007477823893229167, + "learning_rate": 0.0001, + "loss": 4.7148, + "loss/crossentropy": 2.3243749141693115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2735731601715088, + "step": 5594 + }, + { + "epoch": 0.11192, + "grad_norm": 2.28125, + "grad_norm_var": 0.007372029622395833, + "learning_rate": 0.0001, + "loss": 4.7124, + "loss/crossentropy": 2.0328271985054016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21919244527816772, + "step": 5596 + }, + { + "epoch": 0.11196, + "grad_norm": 2.234375, + "grad_norm_var": 0.0077789306640625, + "learning_rate": 0.0001, + "loss": 4.4584, + "loss/crossentropy": 2.249367594718933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2577860951423645, + "step": 5598 + }, + { + "epoch": 0.112, + "grad_norm": 2.046875, + "grad_norm_var": 0.009505208333333333, + "learning_rate": 0.0001, + "loss": 4.2594, + "loss/crossentropy": 1.9844761490821838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2142435386776924, + "step": 5600 + }, + { + "epoch": 0.11204, + "grad_norm": 2.375, + "grad_norm_var": 0.0111328125, + "learning_rate": 0.0001, + "loss": 4.6043, + "loss/crossentropy": 2.1334372758865356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23296397179365158, + "step": 5602 + }, + { + "epoch": 0.11208, + "grad_norm": 2.3125, + "grad_norm_var": 0.0078084309895833336, + "learning_rate": 0.0001, + "loss": 4.5308, + "loss/crossentropy": 2.119946002960205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23398507386446, + "step": 5604 + }, + { + "epoch": 0.11212, + "grad_norm": 2.25, + "grad_norm_var": 0.008040364583333333, + "learning_rate": 0.0001, + "loss": 4.5011, + "loss/crossentropy": 2.0414544343948364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23420867323875427, + "step": 5606 + }, + { + "epoch": 0.11216, + "grad_norm": 2.125, + "grad_norm_var": 0.009130859375, + "learning_rate": 0.0001, + "loss": 4.2092, + "loss/crossentropy": 1.9592725038528442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23922864347696304, + "step": 5608 + }, + { + "epoch": 0.1122, + "grad_norm": 2.4375, + "grad_norm_var": 0.011188761393229166, + "learning_rate": 0.0001, + "loss": 4.4795, + "loss/crossentropy": 2.150269627571106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2528265416622162, + "step": 5610 + }, + { + "epoch": 0.11224, + "grad_norm": 2.578125, + "grad_norm_var": 0.3099772135416667, + "learning_rate": 0.0001, + "loss": 4.6234, + "loss/crossentropy": 2.0591378211975098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2430240362882614, + "step": 5612 + }, + { + "epoch": 0.11228, + "grad_norm": 2.171875, + "grad_norm_var": 0.30794169108072916, + "learning_rate": 0.0001, + "loss": 4.4251, + "loss/crossentropy": 2.2132861614227295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22676552087068558, + "step": 5614 + }, + { + "epoch": 0.11232, + "grad_norm": 2.375, + "grad_norm_var": 0.3002237955729167, + "learning_rate": 0.0001, + "loss": 4.4332, + "loss/crossentropy": 1.9607917070388794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23068836331367493, + "step": 5616 + }, + { + "epoch": 0.11236, + "grad_norm": 2.25, + "grad_norm_var": 0.3021321614583333, + "learning_rate": 0.0001, + "loss": 4.549, + "loss/crossentropy": 2.2245940566062927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25029121339321136, + "step": 5618 + }, + { + "epoch": 0.1124, + "grad_norm": 2.234375, + "grad_norm_var": 0.30278218587239586, + "learning_rate": 0.0001, + "loss": 4.4539, + "loss/crossentropy": 2.2511253356933594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25143079459667206, + "step": 5620 + }, + { + "epoch": 0.11244, + "grad_norm": 2.328125, + "grad_norm_var": 0.3026194254557292, + "learning_rate": 0.0001, + "loss": 4.4632, + "loss/crossentropy": 2.2945470809936523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2196483463048935, + "step": 5622 + }, + { + "epoch": 0.11248, + "grad_norm": 2.359375, + "grad_norm_var": 0.29273681640625, + "learning_rate": 0.0001, + "loss": 4.8769, + "loss/crossentropy": 2.2266393899917603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24020668864250183, + "step": 5624 + }, + { + "epoch": 0.11252, + "grad_norm": 2.359375, + "grad_norm_var": 0.29136454264322914, + "learning_rate": 0.0001, + "loss": 4.742, + "loss/crossentropy": 2.2835845947265625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27199871838092804, + "step": 5626 + }, + { + "epoch": 0.11256, + "grad_norm": 2.859375, + "grad_norm_var": 0.026383463541666666, + "learning_rate": 0.0001, + "loss": 4.4213, + "loss/crossentropy": 1.9576718211174011, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21716968715190887, + "step": 5628 + }, + { + "epoch": 0.1126, + "grad_norm": 2.375, + "grad_norm_var": 0.044611612955729164, + "learning_rate": 0.0001, + "loss": 4.7695, + "loss/crossentropy": 2.0955676436424255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24259109795093536, + "step": 5630 + }, + { + "epoch": 0.11264, + "grad_norm": 2.5625, + "grad_norm_var": 0.044840494791666664, + "learning_rate": 0.0001, + "loss": 4.4127, + "loss/crossentropy": 2.119523346424103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2478165179491043, + "step": 5632 + }, + { + "epoch": 0.11268, + "grad_norm": 2.265625, + "grad_norm_var": 0.0444488525390625, + "learning_rate": 0.0001, + "loss": 4.5591, + "loss/crossentropy": 2.189425826072693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24532486498355865, + "step": 5634 + }, + { + "epoch": 0.11272, + "grad_norm": 2.390625, + "grad_norm_var": 0.0467193603515625, + "learning_rate": 0.0001, + "loss": 4.4969, + "loss/crossentropy": 2.215874433517456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24533041566610336, + "step": 5636 + }, + { + "epoch": 0.11276, + "grad_norm": 2.515625, + "grad_norm_var": 0.04868876139322917, + "learning_rate": 0.0001, + "loss": 4.5657, + "loss/crossentropy": 2.226451873779297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2545652836561203, + "step": 5638 + }, + { + "epoch": 0.1128, + "grad_norm": 2.296875, + "grad_norm_var": 0.05181884765625, + "learning_rate": 0.0001, + "loss": 4.4779, + "loss/crossentropy": 2.0343592762947083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2503928989171982, + "step": 5640 + }, + { + "epoch": 0.11284, + "grad_norm": 2.25, + "grad_norm_var": 0.0535552978515625, + "learning_rate": 0.0001, + "loss": 4.6253, + "loss/crossentropy": 2.142001748085022, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24624411761760712, + "step": 5642 + }, + { + "epoch": 0.11288, + "grad_norm": 2.140625, + "grad_norm_var": 0.039383951822916666, + "learning_rate": 0.0001, + "loss": 4.4212, + "loss/crossentropy": 1.822394609451294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21315739303827286, + "step": 5644 + }, + { + "epoch": 0.11292, + "grad_norm": 2.140625, + "grad_norm_var": 0.018863932291666666, + "learning_rate": 0.0001, + "loss": 4.1611, + "loss/crossentropy": 2.3221731185913086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24407917261123657, + "step": 5646 + }, + { + "epoch": 0.11296, + "grad_norm": 2.234375, + "grad_norm_var": 0.012531534830729166, + "learning_rate": 0.0001, + "loss": 4.5591, + "loss/crossentropy": 1.9784467816352844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22578515857458115, + "step": 5648 + }, + { + "epoch": 0.113, + "grad_norm": 2.328125, + "grad_norm_var": 0.0142730712890625, + "learning_rate": 0.0001, + "loss": 4.636, + "loss/crossentropy": 2.2148635387420654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25031210482120514, + "step": 5650 + }, + { + "epoch": 0.11304, + "grad_norm": 2.21875, + "grad_norm_var": 0.012369791666666666, + "learning_rate": 0.0001, + "loss": 4.4644, + "loss/crossentropy": 1.9204095602035522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22073335200548172, + "step": 5652 + }, + { + "epoch": 0.11308, + "grad_norm": 2.421875, + "grad_norm_var": 0.009764607747395833, + "learning_rate": 0.0001, + "loss": 4.6601, + "loss/crossentropy": 2.092605173587799, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24161123484373093, + "step": 5654 + }, + { + "epoch": 0.11312, + "grad_norm": 2.546875, + "grad_norm_var": 0.0163726806640625, + "learning_rate": 0.0001, + "loss": 4.5977, + "loss/crossentropy": 1.9546263217926025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22741927206516266, + "step": 5656 + }, + { + "epoch": 0.11316, + "grad_norm": 2.28125, + "grad_norm_var": 0.016974894205729167, + "learning_rate": 0.0001, + "loss": 4.6585, + "loss/crossentropy": 2.256605863571167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2560018301010132, + "step": 5658 + }, + { + "epoch": 0.1132, + "grad_norm": 2.21875, + "grad_norm_var": 0.0157135009765625, + "learning_rate": 0.0001, + "loss": 4.3605, + "loss/crossentropy": 2.24527370929718, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.270521879196167, + "step": 5660 + }, + { + "epoch": 0.11324, + "grad_norm": 2.5, + "grad_norm_var": 0.014029947916666667, + "learning_rate": 0.0001, + "loss": 4.5429, + "loss/crossentropy": 1.8154722452163696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22786997258663177, + "step": 5662 + }, + { + "epoch": 0.11328, + "grad_norm": 2.359375, + "grad_norm_var": 0.0143951416015625, + "learning_rate": 0.0001, + "loss": 4.4079, + "loss/crossentropy": 2.135699689388275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.253468282520771, + "step": 5664 + }, + { + "epoch": 0.11332, + "grad_norm": 2.203125, + "grad_norm_var": 0.015087890625, + "learning_rate": 0.0001, + "loss": 4.5381, + "loss/crossentropy": 2.15896338224411, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25651170313358307, + "step": 5666 + }, + { + "epoch": 0.11336, + "grad_norm": 2.40625, + "grad_norm_var": 0.012035115559895834, + "learning_rate": 0.0001, + "loss": 4.7877, + "loss/crossentropy": 2.115864336490631, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28546491265296936, + "step": 5668 + }, + { + "epoch": 0.1134, + "grad_norm": 2.25, + "grad_norm_var": 0.012214152018229167, + "learning_rate": 0.0001, + "loss": 4.4283, + "loss/crossentropy": 2.2036256790161133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2397892326116562, + "step": 5670 + }, + { + "epoch": 0.11344, + "grad_norm": 2.234375, + "grad_norm_var": 0.008430989583333333, + "learning_rate": 0.0001, + "loss": 4.598, + "loss/crossentropy": 2.4966647624969482, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2984588146209717, + "step": 5672 + }, + { + "epoch": 0.11348, + "grad_norm": 2.234375, + "grad_norm_var": 0.008202107747395833, + "learning_rate": 0.0001, + "loss": 4.5289, + "loss/crossentropy": 2.051860749721527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2748461365699768, + "step": 5674 + }, + { + "epoch": 0.11352, + "grad_norm": 2.328125, + "grad_norm_var": 0.007938639322916666, + "learning_rate": 0.0001, + "loss": 4.597, + "loss/crossentropy": 2.046416461467743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2320190668106079, + "step": 5676 + }, + { + "epoch": 0.11356, + "grad_norm": 2.21875, + "grad_norm_var": 0.0054514567057291664, + "learning_rate": 0.0001, + "loss": 4.7389, + "loss/crossentropy": 2.2385342121124268, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21859309077262878, + "step": 5678 + }, + { + "epoch": 0.1136, + "grad_norm": 2.265625, + "grad_norm_var": 0.006843058268229166, + "learning_rate": 0.0001, + "loss": 4.2487, + "loss/crossentropy": 1.8511550426483154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22065181285142899, + "step": 5680 + }, + { + "epoch": 0.11364, + "grad_norm": 2.828125, + "grad_norm_var": 0.031477864583333334, + "learning_rate": 0.0001, + "loss": 4.642, + "loss/crossentropy": 2.304056167602539, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29507844150066376, + "step": 5682 + }, + { + "epoch": 0.11368, + "grad_norm": 2.4375, + "grad_norm_var": 0.03329671223958333, + "learning_rate": 0.0001, + "loss": 4.5519, + "loss/crossentropy": 1.993275225162506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33826952427625656, + "step": 5684 + }, + { + "epoch": 0.11372, + "grad_norm": 2.140625, + "grad_norm_var": 0.03498433430989583, + "learning_rate": 0.0001, + "loss": 4.4729, + "loss/crossentropy": 2.1836347579956055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24539872258901596, + "step": 5686 + }, + { + "epoch": 0.11376, + "grad_norm": 2.296875, + "grad_norm_var": 0.03504231770833333, + "learning_rate": 0.0001, + "loss": 4.5908, + "loss/crossentropy": 1.9467885494232178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2180013582110405, + "step": 5688 + }, + { + "epoch": 0.1138, + "grad_norm": 2.78125, + "grad_norm_var": 0.04907938639322917, + "learning_rate": 0.0001, + "loss": 4.2888, + "loss/crossentropy": 1.9907563924789429, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2551623433828354, + "step": 5690 + }, + { + "epoch": 0.11384, + "grad_norm": 2.21875, + "grad_norm_var": 0.049153645833333336, + "learning_rate": 0.0001, + "loss": 4.2186, + "loss/crossentropy": 1.9452654719352722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24263620376586914, + "step": 5692 + }, + { + "epoch": 0.11388, + "grad_norm": 2.171875, + "grad_norm_var": 0.050093587239583334, + "learning_rate": 0.0001, + "loss": 4.5816, + "loss/crossentropy": 2.2448811531066895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2508034110069275, + "step": 5694 + }, + { + "epoch": 0.11392, + "grad_norm": 2.65625, + "grad_norm_var": 0.055712890625, + "learning_rate": 0.0001, + "loss": 4.1868, + "loss/crossentropy": 1.991935908794403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22709138691425323, + "step": 5696 + }, + { + "epoch": 0.11396, + "grad_norm": 2.625, + "grad_norm_var": 0.47700907389322916, + "learning_rate": 0.0001, + "loss": 4.8208, + "loss/crossentropy": 2.0479623675346375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23980189859867096, + "step": 5698 + }, + { + "epoch": 0.114, + "grad_norm": 2.15625, + "grad_norm_var": 0.4779581705729167, + "learning_rate": 0.0001, + "loss": 4.4742, + "loss/crossentropy": 1.9319151639938354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2295292615890503, + "step": 5700 + }, + { + "epoch": 0.11404, + "grad_norm": 2.296875, + "grad_norm_var": 0.47330322265625, + "learning_rate": 0.0001, + "loss": 4.32, + "loss/crossentropy": 1.985447645187378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.246543288230896, + "step": 5702 + }, + { + "epoch": 0.11408, + "grad_norm": 2.84375, + "grad_norm_var": 0.478271484375, + "learning_rate": 0.0001, + "loss": 4.6602, + "loss/crossentropy": 2.0016521215438843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25966253876686096, + "step": 5704 + }, + { + "epoch": 0.11412, + "grad_norm": 2.421875, + "grad_norm_var": 0.47038472493489586, + "learning_rate": 0.0001, + "loss": 4.9207, + "loss/crossentropy": 2.112374246120453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3251144737005234, + "step": 5706 + }, + { + "epoch": 0.11416, + "grad_norm": 2.328125, + "grad_norm_var": 0.4698720296223958, + "learning_rate": 0.0001, + "loss": 4.5223, + "loss/crossentropy": 2.0931158661842346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2275928407907486, + "step": 5708 + }, + { + "epoch": 0.1142, + "grad_norm": 2.328125, + "grad_norm_var": 0.46104227701822914, + "learning_rate": 0.0001, + "loss": 4.4481, + "loss/crossentropy": 1.9867743849754333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2419440597295761, + "step": 5710 + }, + { + "epoch": 0.11424, + "grad_norm": 2.171875, + "grad_norm_var": 0.4639312744140625, + "learning_rate": 0.0001, + "loss": 4.3874, + "loss/crossentropy": 2.1822216510772705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2437271624803543, + "step": 5712 + }, + { + "epoch": 0.11428, + "grad_norm": 6.46875, + "grad_norm_var": 1.1088775634765624, + "learning_rate": 0.0001, + "loss": 4.5307, + "loss/crossentropy": 2.412580370903015, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3222763389348984, + "step": 5714 + }, + { + "epoch": 0.11432, + "grad_norm": 2.28125, + "grad_norm_var": 1.1019490559895833, + "learning_rate": 0.0001, + "loss": 4.2491, + "loss/crossentropy": 1.676392376422882, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24499019235372543, + "step": 5716 + }, + { + "epoch": 0.11436, + "grad_norm": 2.328125, + "grad_norm_var": 1.1039876302083333, + "learning_rate": 0.0001, + "loss": 4.4741, + "loss/crossentropy": 1.7818856835365295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22935030609369278, + "step": 5718 + }, + { + "epoch": 0.1144, + "grad_norm": 2.40625, + "grad_norm_var": 1.0944732666015624, + "learning_rate": 0.0001, + "loss": 4.5445, + "loss/crossentropy": 2.012014925479889, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.231942281126976, + "step": 5720 + }, + { + "epoch": 0.11444, + "grad_norm": 2.296875, + "grad_norm_var": 1.1023834228515625, + "learning_rate": 0.0001, + "loss": 4.5061, + "loss/crossentropy": 2.3663965463638306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.254768967628479, + "step": 5722 + }, + { + "epoch": 0.11448, + "grad_norm": 2.34375, + "grad_norm_var": 1.107982381184896, + "learning_rate": 0.0001, + "loss": 4.5878, + "loss/crossentropy": 2.343206286430359, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26609115302562714, + "step": 5724 + }, + { + "epoch": 0.11452, + "grad_norm": 2.21875, + "grad_norm_var": 1.1131337483723958, + "learning_rate": 0.0001, + "loss": 4.5047, + "loss/crossentropy": 1.8696978092193604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24282050132751465, + "step": 5726 + }, + { + "epoch": 0.11456, + "grad_norm": 2.0625, + "grad_norm_var": 1.1170644124348958, + "learning_rate": 0.0001, + "loss": 4.2198, + "loss/crossentropy": 2.1430450677871704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23211582750082016, + "step": 5728 + }, + { + "epoch": 0.1146, + "grad_norm": 2.421875, + "grad_norm_var": 0.013841756184895833, + "learning_rate": 0.0001, + "loss": 4.6868, + "loss/crossentropy": 2.2231308221817017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23736387491226196, + "step": 5730 + }, + { + "epoch": 0.11464, + "grad_norm": 2.421875, + "grad_norm_var": 0.014069620768229167, + "learning_rate": 0.0001, + "loss": 4.4911, + "loss/crossentropy": 1.8789280652999878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24081497639417648, + "step": 5732 + }, + { + "epoch": 0.11468, + "grad_norm": 2.1875, + "grad_norm_var": 0.014875284830729167, + "learning_rate": 0.0001, + "loss": 4.4478, + "loss/crossentropy": 2.0491732358932495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20530463755130768, + "step": 5734 + }, + { + "epoch": 0.11472, + "grad_norm": 2.296875, + "grad_norm_var": 0.012516276041666666, + "learning_rate": 0.0001, + "loss": 4.4352, + "loss/crossentropy": 2.067046642303467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24701344966888428, + "step": 5736 + }, + { + "epoch": 0.11476, + "grad_norm": 2.234375, + "grad_norm_var": 0.012791951497395834, + "learning_rate": 0.0001, + "loss": 4.4515, + "loss/crossentropy": 2.0207647681236267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2243770807981491, + "step": 5738 + }, + { + "epoch": 0.1148, + "grad_norm": 2.21875, + "grad_norm_var": 0.013084920247395833, + "learning_rate": 0.0001, + "loss": 4.9205, + "loss/crossentropy": 2.230514347553253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24377264082431793, + "step": 5740 + }, + { + "epoch": 0.11484, + "grad_norm": 2.078125, + "grad_norm_var": 0.013997395833333334, + "learning_rate": 0.0001, + "loss": 4.3831, + "loss/crossentropy": 2.38068687915802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24760407954454422, + "step": 5742 + }, + { + "epoch": 0.11488, + "grad_norm": 2.3125, + "grad_norm_var": 0.012483723958333333, + "learning_rate": 0.0001, + "loss": 4.5598, + "loss/crossentropy": 2.238909125328064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2503668889403343, + "step": 5744 + }, + { + "epoch": 0.11492, + "grad_norm": 2.1875, + "grad_norm_var": 0.015265909830729167, + "learning_rate": 0.0001, + "loss": 4.4848, + "loss/crossentropy": 1.7423101663589478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20424582809209824, + "step": 5746 + }, + { + "epoch": 0.11496, + "grad_norm": 2.25, + "grad_norm_var": 0.0181304931640625, + "learning_rate": 0.0001, + "loss": 4.6754, + "loss/crossentropy": 2.5906827449798584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24731651693582535, + "step": 5748 + }, + { + "epoch": 0.115, + "grad_norm": 2.6875, + "grad_norm_var": 0.025830078125, + "learning_rate": 0.0001, + "loss": 4.7427, + "loss/crossentropy": 2.418861746788025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2505089193582535, + "step": 5750 + }, + { + "epoch": 0.11504, + "grad_norm": 2.765625, + "grad_norm_var": 0.03658447265625, + "learning_rate": 0.0001, + "loss": 4.869, + "loss/crossentropy": 2.158658504486084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2381347045302391, + "step": 5752 + }, + { + "epoch": 0.11508, + "grad_norm": 2.40625, + "grad_norm_var": 0.03860270182291667, + "learning_rate": 0.0001, + "loss": 4.5073, + "loss/crossentropy": 2.0938435196876526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23639065027236938, + "step": 5754 + }, + { + "epoch": 0.11512, + "grad_norm": 2.046875, + "grad_norm_var": 0.0441314697265625, + "learning_rate": 0.0001, + "loss": 4.2227, + "loss/crossentropy": 2.023799479007721, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24510329961776733, + "step": 5756 + }, + { + "epoch": 0.11516, + "grad_norm": 2.265625, + "grad_norm_var": 0.03854166666666667, + "learning_rate": 0.0001, + "loss": 4.5155, + "loss/crossentropy": 2.3589184284210205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2778936177492142, + "step": 5758 + }, + { + "epoch": 0.1152, + "grad_norm": 2.1875, + "grad_norm_var": 0.041731770833333334, + "learning_rate": 0.0001, + "loss": 4.4209, + "loss/crossentropy": 2.4897998571395874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23546544462442398, + "step": 5760 + }, + { + "epoch": 0.11524, + "grad_norm": 2.1875, + "grad_norm_var": 0.041747029622395834, + "learning_rate": 0.0001, + "loss": 4.4619, + "loss/crossentropy": 2.0426196455955505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22708147019147873, + "step": 5762 + }, + { + "epoch": 0.11528, + "grad_norm": 2.28125, + "grad_norm_var": 0.0397369384765625, + "learning_rate": 0.0001, + "loss": 4.6661, + "loss/crossentropy": 2.2582051753997803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25646351277828217, + "step": 5764 + }, + { + "epoch": 0.11532, + "grad_norm": 2.21875, + "grad_norm_var": 0.03430582682291667, + "learning_rate": 0.0001, + "loss": 4.5929, + "loss/crossentropy": 2.181105613708496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2380223199725151, + "step": 5766 + }, + { + "epoch": 0.11536, + "grad_norm": 2.46875, + "grad_norm_var": 0.026200358072916666, + "learning_rate": 0.0001, + "loss": 4.5078, + "loss/crossentropy": 2.236580967903137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26678355038166046, + "step": 5768 + }, + { + "epoch": 0.1154, + "grad_norm": 2.203125, + "grad_norm_var": 0.020361328125, + "learning_rate": 0.0001, + "loss": 4.6232, + "loss/crossentropy": 1.9383749961853027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24920085072517395, + "step": 5770 + }, + { + "epoch": 0.11544, + "grad_norm": 2.15625, + "grad_norm_var": 0.018529256184895832, + "learning_rate": 0.0001, + "loss": 4.6044, + "loss/crossentropy": 2.2856688499450684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.258881650865078, + "step": 5772 + }, + { + "epoch": 0.11548, + "grad_norm": 2.28125, + "grad_norm_var": 0.0239410400390625, + "learning_rate": 0.0001, + "loss": 4.8391, + "loss/crossentropy": 2.2897390127182007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24453644454479218, + "step": 5774 + }, + { + "epoch": 0.11552, + "grad_norm": 2.25, + "grad_norm_var": 0.014549763997395833, + "learning_rate": 0.0001, + "loss": 4.5698, + "loss/crossentropy": 2.0502785444259644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24106161296367645, + "step": 5776 + }, + { + "epoch": 0.11556, + "grad_norm": 2.21875, + "grad_norm_var": 0.01441650390625, + "learning_rate": 0.0001, + "loss": 4.5665, + "loss/crossentropy": 2.344050645828247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24690424650907516, + "step": 5778 + }, + { + "epoch": 0.1156, + "grad_norm": 2.609375, + "grad_norm_var": 0.021239217122395834, + "learning_rate": 0.0001, + "loss": 4.5273, + "loss/crossentropy": 2.2274389266967773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25618284940719604, + "step": 5780 + }, + { + "epoch": 0.11564, + "grad_norm": 2.203125, + "grad_norm_var": 0.022184244791666665, + "learning_rate": 0.0001, + "loss": 4.4888, + "loss/crossentropy": 2.063184678554535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22606099396944046, + "step": 5782 + }, + { + "epoch": 0.11568, + "grad_norm": 2.203125, + "grad_norm_var": 0.0206451416015625, + "learning_rate": 0.0001, + "loss": 4.2333, + "loss/crossentropy": 2.093947410583496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23588553071022034, + "step": 5784 + }, + { + "epoch": 0.11572, + "grad_norm": 2.015625, + "grad_norm_var": 0.0246490478515625, + "learning_rate": 0.0001, + "loss": 4.426, + "loss/crossentropy": 2.1599318981170654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20322780311107635, + "step": 5786 + }, + { + "epoch": 0.11576, + "grad_norm": 2.40625, + "grad_norm_var": 0.02506103515625, + "learning_rate": 0.0001, + "loss": 4.5398, + "loss/crossentropy": 2.274693012237549, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2704206556081772, + "step": 5788 + }, + { + "epoch": 0.1158, + "grad_norm": 2.421875, + "grad_norm_var": 0.0222076416015625, + "learning_rate": 0.0001, + "loss": 4.6755, + "loss/crossentropy": 1.9712103009223938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28300437331199646, + "step": 5790 + }, + { + "epoch": 0.11584, + "grad_norm": 2.09375, + "grad_norm_var": 0.024344889322916667, + "learning_rate": 0.0001, + "loss": 4.2035, + "loss/crossentropy": 2.027747690677643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24419110268354416, + "step": 5792 + }, + { + "epoch": 0.11588, + "grad_norm": 2.34375, + "grad_norm_var": 0.024251302083333332, + "learning_rate": 0.0001, + "loss": 4.5614, + "loss/crossentropy": 2.162364959716797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2522001415491104, + "step": 5794 + }, + { + "epoch": 0.11592, + "grad_norm": 2.28125, + "grad_norm_var": 0.0166656494140625, + "learning_rate": 0.0001, + "loss": 4.4711, + "loss/crossentropy": 2.558881998062134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26168718934059143, + "step": 5796 + }, + { + "epoch": 0.11596, + "grad_norm": 2.125, + "grad_norm_var": 0.0162994384765625, + "learning_rate": 0.0001, + "loss": 4.4108, + "loss/crossentropy": 2.1027071475982666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24049362540245056, + "step": 5798 + }, + { + "epoch": 0.116, + "grad_norm": 2.234375, + "grad_norm_var": 0.015034993489583334, + "learning_rate": 0.0001, + "loss": 4.6059, + "loss/crossentropy": 2.488932490348816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2697457820177078, + "step": 5800 + }, + { + "epoch": 0.11604, + "grad_norm": 2.375, + "grad_norm_var": 0.0121734619140625, + "learning_rate": 0.0001, + "loss": 4.6706, + "loss/crossentropy": 2.0441418886184692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23980429768562317, + "step": 5802 + }, + { + "epoch": 0.11608, + "grad_norm": 2.375, + "grad_norm_var": 0.0107421875, + "learning_rate": 0.0001, + "loss": 4.8471, + "loss/crossentropy": 2.2873395681381226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3105090409517288, + "step": 5804 + }, + { + "epoch": 0.11612, + "grad_norm": 2.265625, + "grad_norm_var": 0.00826416015625, + "learning_rate": 0.0001, + "loss": 4.5388, + "loss/crossentropy": 1.8773444890975952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20640414953231812, + "step": 5806 + }, + { + "epoch": 0.11616, + "grad_norm": 2.375, + "grad_norm_var": 0.007502237955729167, + "learning_rate": 0.0001, + "loss": 4.7338, + "loss/crossentropy": 2.3736027479171753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2679155319929123, + "step": 5808 + }, + { + "epoch": 0.1162, + "grad_norm": 2.28125, + "grad_norm_var": 0.010676066080729166, + "learning_rate": 0.0001, + "loss": 4.0496, + "loss/crossentropy": 2.071690082550049, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21868111193180084, + "step": 5810 + }, + { + "epoch": 0.11624, + "grad_norm": 2.390625, + "grad_norm_var": 0.015425618489583333, + "learning_rate": 0.0001, + "loss": 4.4029, + "loss/crossentropy": 2.0531184673309326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24457989633083344, + "step": 5812 + }, + { + "epoch": 0.11628, + "grad_norm": 2.3125, + "grad_norm_var": 0.015718587239583335, + "learning_rate": 0.0001, + "loss": 4.6511, + "loss/crossentropy": 2.1161271929740906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22809523344039917, + "step": 5814 + }, + { + "epoch": 0.11632, + "grad_norm": 2.046875, + "grad_norm_var": 0.0196929931640625, + "learning_rate": 0.0001, + "loss": 4.2736, + "loss/crossentropy": 1.6557151675224304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20782940834760666, + "step": 5816 + }, + { + "epoch": 0.11636, + "grad_norm": 2.3125, + "grad_norm_var": 0.03704020182291667, + "learning_rate": 0.0001, + "loss": 4.5977, + "loss/crossentropy": 2.2499040365219116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2713513821363449, + "step": 5818 + }, + { + "epoch": 0.1164, + "grad_norm": 2.390625, + "grad_norm_var": 0.03723958333333333, + "learning_rate": 0.0001, + "loss": 4.6417, + "loss/crossentropy": 2.3616446256637573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2613145411014557, + "step": 5820 + }, + { + "epoch": 0.11644, + "grad_norm": 2.265625, + "grad_norm_var": 0.0353515625, + "learning_rate": 0.0001, + "loss": 4.3744, + "loss/crossentropy": 2.0932790637016296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23843754082918167, + "step": 5822 + }, + { + "epoch": 0.11648, + "grad_norm": 2.25, + "grad_norm_var": 0.03540751139322917, + "learning_rate": 0.0001, + "loss": 4.4089, + "loss/crossentropy": 2.128177046775818, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26579540967941284, + "step": 5824 + }, + { + "epoch": 0.11652, + "grad_norm": 2.296875, + "grad_norm_var": 0.031281534830729166, + "learning_rate": 0.0001, + "loss": 4.6942, + "loss/crossentropy": 2.332372784614563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27953268587589264, + "step": 5826 + }, + { + "epoch": 0.11656, + "grad_norm": 2.1875, + "grad_norm_var": 0.0263092041015625, + "learning_rate": 0.0001, + "loss": 4.4615, + "loss/crossentropy": 2.360959053039551, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26249848306179047, + "step": 5828 + }, + { + "epoch": 0.1166, + "grad_norm": 2.0, + "grad_norm_var": 0.030745442708333334, + "learning_rate": 0.0001, + "loss": 3.9757, + "loss/crossentropy": 1.9800339341163635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22766301035881042, + "step": 5830 + }, + { + "epoch": 0.11664, + "grad_norm": 2.390625, + "grad_norm_var": 0.029442342122395833, + "learning_rate": 0.0001, + "loss": 4.3287, + "loss/crossentropy": 2.082980155944824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2368428260087967, + "step": 5832 + }, + { + "epoch": 0.11668, + "grad_norm": 2.34375, + "grad_norm_var": 0.011986287434895833, + "learning_rate": 0.0001, + "loss": 4.4441, + "loss/crossentropy": 2.093027710914612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21121951937675476, + "step": 5834 + }, + { + "epoch": 0.11672, + "grad_norm": 2.296875, + "grad_norm_var": 0.011031087239583333, + "learning_rate": 0.0001, + "loss": 4.5008, + "loss/crossentropy": 2.1329175233840942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25303877890110016, + "step": 5836 + }, + { + "epoch": 0.11676, + "grad_norm": 2.21875, + "grad_norm_var": 0.022932942708333334, + "learning_rate": 0.0001, + "loss": 4.5174, + "loss/crossentropy": 1.79305762052536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20448636263608932, + "step": 5838 + }, + { + "epoch": 0.1168, + "grad_norm": 2.28125, + "grad_norm_var": 0.024933878580729166, + "learning_rate": 0.0001, + "loss": 4.2722, + "loss/crossentropy": 1.957836627960205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22745755314826965, + "step": 5840 + }, + { + "epoch": 0.11684, + "grad_norm": 2.25, + "grad_norm_var": 0.025007120768229165, + "learning_rate": 0.0001, + "loss": 4.3773, + "loss/crossentropy": 2.1167174577713013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2437898963689804, + "step": 5842 + }, + { + "epoch": 0.11688, + "grad_norm": 2.1875, + "grad_norm_var": 0.024800618489583332, + "learning_rate": 0.0001, + "loss": 4.4856, + "loss/crossentropy": 2.3288447856903076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2355937883257866, + "step": 5844 + }, + { + "epoch": 0.11692, + "grad_norm": 2.125, + "grad_norm_var": 0.022337849934895834, + "learning_rate": 0.0001, + "loss": 4.4774, + "loss/crossentropy": 2.2526416778564453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24917340278625488, + "step": 5846 + }, + { + "epoch": 0.11696, + "grad_norm": 2.25, + "grad_norm_var": 0.0198883056640625, + "learning_rate": 0.0001, + "loss": 4.391, + "loss/crossentropy": 2.194224774837494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2625589966773987, + "step": 5848 + }, + { + "epoch": 0.117, + "grad_norm": 2.34375, + "grad_norm_var": 0.020116170247395832, + "learning_rate": 0.0001, + "loss": 4.4955, + "loss/crossentropy": 2.0156877040863037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22422078251838684, + "step": 5850 + }, + { + "epoch": 0.11704, + "grad_norm": 2.21875, + "grad_norm_var": 0.01962890625, + "learning_rate": 0.0001, + "loss": 4.6231, + "loss/crossentropy": 2.2480785846710205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24705884605646133, + "step": 5852 + }, + { + "epoch": 0.11708, + "grad_norm": 2.25, + "grad_norm_var": 0.0078521728515625, + "learning_rate": 0.0001, + "loss": 4.4817, + "loss/crossentropy": 2.0915993452072144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24669666588306427, + "step": 5854 + }, + { + "epoch": 0.11712, + "grad_norm": 2.171875, + "grad_norm_var": 0.0062652587890625, + "learning_rate": 0.0001, + "loss": 4.4444, + "loss/crossentropy": 2.1283876299858093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24228712916374207, + "step": 5856 + }, + { + "epoch": 0.11716, + "grad_norm": 2.625, + "grad_norm_var": 0.015523274739583334, + "learning_rate": 0.0001, + "loss": 4.6582, + "loss/crossentropy": 2.028861939907074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2281404584646225, + "step": 5858 + }, + { + "epoch": 0.1172, + "grad_norm": 2.15625, + "grad_norm_var": 0.015852864583333334, + "learning_rate": 0.0001, + "loss": 4.2293, + "loss/crossentropy": 2.154610753059387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24233703315258026, + "step": 5860 + }, + { + "epoch": 0.11724, + "grad_norm": 2.421875, + "grad_norm_var": 0.015787760416666668, + "learning_rate": 0.0001, + "loss": 4.5059, + "loss/crossentropy": 1.9396602511405945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22303076088428497, + "step": 5862 + }, + { + "epoch": 0.11728, + "grad_norm": 2.578125, + "grad_norm_var": 0.021061197916666666, + "learning_rate": 0.0001, + "loss": 4.7447, + "loss/crossentropy": 2.053893029689789, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26897912472486496, + "step": 5864 + }, + { + "epoch": 0.11732, + "grad_norm": 2.0625, + "grad_norm_var": 0.0263336181640625, + "learning_rate": 0.0001, + "loss": 4.3529, + "loss/crossentropy": 1.990949273109436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22180908918380737, + "step": 5866 + }, + { + "epoch": 0.11736, + "grad_norm": 2.453125, + "grad_norm_var": 0.030085245768229168, + "learning_rate": 0.0001, + "loss": 4.6434, + "loss/crossentropy": 2.0929455161094666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24055248498916626, + "step": 5868 + }, + { + "epoch": 0.1174, + "grad_norm": 2.46875, + "grad_norm_var": 0.03168843587239583, + "learning_rate": 0.0001, + "loss": 4.5465, + "loss/crossentropy": 2.1476733684539795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2416895627975464, + "step": 5870 + }, + { + "epoch": 0.11744, + "grad_norm": 2.21875, + "grad_norm_var": 0.027962239583333333, + "learning_rate": 0.0001, + "loss": 4.5633, + "loss/crossentropy": 1.74330335855484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22001302987337112, + "step": 5872 + }, + { + "epoch": 0.11748, + "grad_norm": 2.265625, + "grad_norm_var": 0.021507771809895833, + "learning_rate": 0.0001, + "loss": 4.2725, + "loss/crossentropy": 1.8903921246528625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21950766444206238, + "step": 5874 + }, + { + "epoch": 0.11752, + "grad_norm": 2.140625, + "grad_norm_var": 0.022069295247395832, + "learning_rate": 0.0001, + "loss": 4.3673, + "loss/crossentropy": 1.798406720161438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20271167159080505, + "step": 5876 + }, + { + "epoch": 0.11756, + "grad_norm": 2.25, + "grad_norm_var": 0.021089680989583335, + "learning_rate": 0.0001, + "loss": 4.7128, + "loss/crossentropy": 1.9651959538459778, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21256517618894577, + "step": 5878 + }, + { + "epoch": 0.1176, + "grad_norm": 2.109375, + "grad_norm_var": 0.016844685872395834, + "learning_rate": 0.0001, + "loss": 4.4158, + "loss/crossentropy": 2.039245307445526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23225219547748566, + "step": 5880 + }, + { + "epoch": 0.11764, + "grad_norm": 2.40625, + "grad_norm_var": 0.013329060872395833, + "learning_rate": 0.0001, + "loss": 4.8741, + "loss/crossentropy": 2.500381350517273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31757834553718567, + "step": 5882 + }, + { + "epoch": 0.11768, + "grad_norm": 2.59375, + "grad_norm_var": 0.018195597330729167, + "learning_rate": 0.0001, + "loss": 4.6463, + "loss/crossentropy": 2.0540305972099304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2474198415875435, + "step": 5884 + }, + { + "epoch": 0.11772, + "grad_norm": 2.359375, + "grad_norm_var": 0.016462198893229165, + "learning_rate": 0.0001, + "loss": 4.5047, + "loss/crossentropy": 2.072624385356903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2482014298439026, + "step": 5886 + }, + { + "epoch": 0.11776, + "grad_norm": 2.328125, + "grad_norm_var": 0.017853800455729166, + "learning_rate": 0.0001, + "loss": 4.4025, + "loss/crossentropy": 2.153423309326172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.234448604285717, + "step": 5888 + }, + { + "epoch": 0.1178, + "grad_norm": 2.5625, + "grad_norm_var": 0.025325520833333334, + "learning_rate": 0.0001, + "loss": 4.2698, + "loss/crossentropy": 1.5941627621650696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20982372760772705, + "step": 5890 + }, + { + "epoch": 0.11784, + "grad_norm": 2.390625, + "grad_norm_var": 0.025886027018229167, + "learning_rate": 0.0001, + "loss": 4.5168, + "loss/crossentropy": 2.2858930826187134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24936140328645706, + "step": 5892 + }, + { + "epoch": 0.11788, + "grad_norm": 2.296875, + "grad_norm_var": 0.028076171875, + "learning_rate": 0.0001, + "loss": 4.536, + "loss/crossentropy": 2.333559274673462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26152122020721436, + "step": 5894 + }, + { + "epoch": 0.11792, + "grad_norm": 2.296875, + "grad_norm_var": 0.027106730143229167, + "learning_rate": 0.0001, + "loss": 4.412, + "loss/crossentropy": 2.134613037109375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24929091334342957, + "step": 5896 + }, + { + "epoch": 0.11796, + "grad_norm": 2.296875, + "grad_norm_var": 0.026423136393229168, + "learning_rate": 0.0001, + "loss": 4.4052, + "loss/crossentropy": 2.179764688014984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24523546546697617, + "step": 5898 + }, + { + "epoch": 0.118, + "grad_norm": 2.109375, + "grad_norm_var": 0.0204742431640625, + "learning_rate": 0.0001, + "loss": 4.3431, + "loss/crossentropy": 2.1184223294258118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21715932339429855, + "step": 5900 + }, + { + "epoch": 0.11804, + "grad_norm": 2.078125, + "grad_norm_var": 0.021882120768229166, + "learning_rate": 0.0001, + "loss": 4.5606, + "loss/crossentropy": 2.024593770503998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23016374558210373, + "step": 5902 + }, + { + "epoch": 0.11808, + "grad_norm": 2.265625, + "grad_norm_var": 0.0204986572265625, + "learning_rate": 0.0001, + "loss": 4.3646, + "loss/crossentropy": 2.077186107635498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2538782134652138, + "step": 5904 + }, + { + "epoch": 0.11812, + "grad_norm": 2.359375, + "grad_norm_var": 0.013190714518229167, + "learning_rate": 0.0001, + "loss": 4.9135, + "loss/crossentropy": 2.2535945177078247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25025132298469543, + "step": 5906 + }, + { + "epoch": 0.11816, + "grad_norm": 2.21875, + "grad_norm_var": 0.01064453125, + "learning_rate": 0.0001, + "loss": 4.5512, + "loss/crossentropy": 2.5321284532546997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2729681059718132, + "step": 5908 + }, + { + "epoch": 0.1182, + "grad_norm": 2.234375, + "grad_norm_var": 0.006403605143229167, + "learning_rate": 0.0001, + "loss": 4.332, + "loss/crossentropy": 2.043885111808777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23721691966056824, + "step": 5910 + }, + { + "epoch": 0.11824, + "grad_norm": 2.296875, + "grad_norm_var": 0.005204264322916667, + "learning_rate": 0.0001, + "loss": 4.3901, + "loss/crossentropy": 2.1343676447868347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23325734585523605, + "step": 5912 + }, + { + "epoch": 0.11828, + "grad_norm": 2.03125, + "grad_norm_var": 0.0074045817057291664, + "learning_rate": 0.0001, + "loss": 4.3551, + "loss/crossentropy": 2.1164477467536926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24526448547840118, + "step": 5914 + }, + { + "epoch": 0.11832, + "grad_norm": 2.1875, + "grad_norm_var": 0.0064737955729166664, + "learning_rate": 0.0001, + "loss": 4.5162, + "loss/crossentropy": 2.268216848373413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24837128818035126, + "step": 5916 + }, + { + "epoch": 0.11836, + "grad_norm": 2.25, + "grad_norm_var": 0.0057037353515625, + "learning_rate": 0.0001, + "loss": 4.4636, + "loss/crossentropy": 2.074695885181427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24956009536981583, + "step": 5918 + }, + { + "epoch": 0.1184, + "grad_norm": 2.203125, + "grad_norm_var": 0.007372029622395833, + "learning_rate": 0.0001, + "loss": 4.3165, + "loss/crossentropy": 1.9369722604751587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2316487580537796, + "step": 5920 + }, + { + "epoch": 0.11844, + "grad_norm": 2.15625, + "grad_norm_var": 0.00611572265625, + "learning_rate": 0.0001, + "loss": 4.2789, + "loss/crossentropy": 2.2189531326293945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23884039372205734, + "step": 5922 + }, + { + "epoch": 0.11848, + "grad_norm": 2.34375, + "grad_norm_var": 0.007323201497395833, + "learning_rate": 0.0001, + "loss": 4.5965, + "loss/crossentropy": 2.3833028078079224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2504591718316078, + "step": 5924 + }, + { + "epoch": 0.11852, + "grad_norm": 2.234375, + "grad_norm_var": 0.007005818684895833, + "learning_rate": 0.0001, + "loss": 4.6346, + "loss/crossentropy": 2.0443845987319946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23422807455062866, + "step": 5926 + }, + { + "epoch": 0.11856, + "grad_norm": 2.203125, + "grad_norm_var": 0.0073964436848958336, + "learning_rate": 0.0001, + "loss": 4.9061, + "loss/crossentropy": 2.223625063896179, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24985776841640472, + "step": 5928 + }, + { + "epoch": 0.1186, + "grad_norm": 2.171875, + "grad_norm_var": 0.005126953125, + "learning_rate": 0.0001, + "loss": 4.493, + "loss/crossentropy": 2.1353545784950256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22846727818250656, + "step": 5930 + }, + { + "epoch": 0.11864, + "grad_norm": 2.203125, + "grad_norm_var": 0.005159505208333333, + "learning_rate": 0.0001, + "loss": 4.5786, + "loss/crossentropy": 2.0497827529907227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21461189538240433, + "step": 5932 + }, + { + "epoch": 0.11868, + "grad_norm": 2.28125, + "grad_norm_var": 0.004715983072916667, + "learning_rate": 0.0001, + "loss": 4.5071, + "loss/crossentropy": 1.9257362484931946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2216043472290039, + "step": 5934 + }, + { + "epoch": 0.11872, + "grad_norm": 2.171875, + "grad_norm_var": 0.0034006754557291668, + "learning_rate": 0.0001, + "loss": 4.4212, + "loss/crossentropy": 2.0458216071128845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23162957280874252, + "step": 5936 + }, + { + "epoch": 0.11876, + "grad_norm": 2.1875, + "grad_norm_var": 0.0033274332682291666, + "learning_rate": 0.0001, + "loss": 4.6109, + "loss/crossentropy": 2.0786396861076355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22303655743598938, + "step": 5938 + }, + { + "epoch": 0.1188, + "grad_norm": 2.546875, + "grad_norm_var": 0.04327799479166667, + "learning_rate": 0.0001, + "loss": 4.4861, + "loss/crossentropy": 1.9800568222999573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22236331552267075, + "step": 5940 + }, + { + "epoch": 0.11884, + "grad_norm": 2.0625, + "grad_norm_var": 0.0502349853515625, + "learning_rate": 0.0001, + "loss": 4.056, + "loss/crossentropy": 1.882250189781189, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21664132922887802, + "step": 5942 + }, + { + "epoch": 0.11888, + "grad_norm": 2.203125, + "grad_norm_var": 0.05025634765625, + "learning_rate": 0.0001, + "loss": 4.5965, + "loss/crossentropy": 2.3682695627212524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2698971778154373, + "step": 5944 + }, + { + "epoch": 0.11892, + "grad_norm": 2.265625, + "grad_norm_var": 0.0505523681640625, + "learning_rate": 0.0001, + "loss": 4.6364, + "loss/crossentropy": 2.225574493408203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2642917186021805, + "step": 5946 + }, + { + "epoch": 0.11896, + "grad_norm": 2.1875, + "grad_norm_var": 0.05032145182291667, + "learning_rate": 0.0001, + "loss": 4.3157, + "loss/crossentropy": 1.8634169697761536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2170402556657791, + "step": 5948 + }, + { + "epoch": 0.119, + "grad_norm": 2.3125, + "grad_norm_var": 0.05123291015625, + "learning_rate": 0.0001, + "loss": 5.0392, + "loss/crossentropy": 2.4500025510787964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27845144271850586, + "step": 5950 + }, + { + "epoch": 0.11904, + "grad_norm": 2.046875, + "grad_norm_var": 0.054108683268229166, + "learning_rate": 0.0001, + "loss": 4.1179, + "loss/crossentropy": 2.1532052755355835, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24952176213264465, + "step": 5952 + }, + { + "epoch": 0.11908, + "grad_norm": 2.140625, + "grad_norm_var": 0.0561676025390625, + "learning_rate": 0.0001, + "loss": 4.2592, + "loss/crossentropy": 2.066560387611389, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2356283888220787, + "step": 5954 + }, + { + "epoch": 0.11912, + "grad_norm": 2.265625, + "grad_norm_var": 0.01021728515625, + "learning_rate": 0.0001, + "loss": 4.6187, + "loss/crossentropy": 1.9679089784622192, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23729706555604935, + "step": 5956 + }, + { + "epoch": 0.11916, + "grad_norm": 2.234375, + "grad_norm_var": 0.0071441650390625, + "learning_rate": 0.0001, + "loss": 4.3463, + "loss/crossentropy": 2.3490394353866577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26400282233953476, + "step": 5958 + }, + { + "epoch": 0.1192, + "grad_norm": 2.296875, + "grad_norm_var": 0.008576456705729167, + "learning_rate": 0.0001, + "loss": 4.3629, + "loss/crossentropy": 2.145757555961609, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23329483717679977, + "step": 5960 + }, + { + "epoch": 0.11924, + "grad_norm": 2.1875, + "grad_norm_var": 0.005399576822916667, + "learning_rate": 0.0001, + "loss": 4.2376, + "loss/crossentropy": 2.0764617919921875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22371648252010345, + "step": 5962 + }, + { + "epoch": 0.11928, + "grad_norm": 2.46875, + "grad_norm_var": 0.010380045572916666, + "learning_rate": 0.0001, + "loss": 4.8106, + "loss/crossentropy": 2.199389696121216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23210449516773224, + "step": 5964 + }, + { + "epoch": 0.11932, + "grad_norm": 2.09375, + "grad_norm_var": 0.01064453125, + "learning_rate": 0.0001, + "loss": 4.3515, + "loss/crossentropy": 1.9008439183235168, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21474837511777878, + "step": 5966 + }, + { + "epoch": 0.11936, + "grad_norm": 2.0, + "grad_norm_var": 0.0148590087890625, + "learning_rate": 0.0001, + "loss": 4.4603, + "loss/crossentropy": 2.1779539585113525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23485051095485687, + "step": 5968 + }, + { + "epoch": 0.1194, + "grad_norm": 2.15625, + "grad_norm_var": 0.014176432291666667, + "learning_rate": 0.0001, + "loss": 4.428, + "loss/crossentropy": 2.267147421836853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24924689531326294, + "step": 5970 + }, + { + "epoch": 0.11944, + "grad_norm": 2.1875, + "grad_norm_var": 0.016292317708333334, + "learning_rate": 0.0001, + "loss": 4.0856, + "loss/crossentropy": 2.0618110299110413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24352984875440598, + "step": 5972 + }, + { + "epoch": 0.11948, + "grad_norm": 2.34375, + "grad_norm_var": 0.023758951822916666, + "learning_rate": 0.0001, + "loss": 4.5802, + "loss/crossentropy": 2.1419676542282104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23680058121681213, + "step": 5974 + }, + { + "epoch": 0.11952, + "grad_norm": 2.265625, + "grad_norm_var": 0.023368326822916667, + "learning_rate": 0.0001, + "loss": 4.6554, + "loss/crossentropy": 2.0376622080802917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22515031695365906, + "step": 5976 + }, + { + "epoch": 0.11956, + "grad_norm": 2.46875, + "grad_norm_var": 0.025739542643229165, + "learning_rate": 0.0001, + "loss": 4.4213, + "loss/crossentropy": 1.7675965428352356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.219834603369236, + "step": 5978 + }, + { + "epoch": 0.1196, + "grad_norm": 2.1875, + "grad_norm_var": 0.024030558268229165, + "learning_rate": 0.0001, + "loss": 4.2886, + "loss/crossentropy": 1.8919905424118042, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2256327122449875, + "step": 5980 + }, + { + "epoch": 0.11964, + "grad_norm": 2.15625, + "grad_norm_var": 0.0228424072265625, + "learning_rate": 0.0001, + "loss": 4.6148, + "loss/crossentropy": 2.287980794906616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25571687519550323, + "step": 5982 + }, + { + "epoch": 0.11968, + "grad_norm": 2.15625, + "grad_norm_var": 0.0169830322265625, + "learning_rate": 0.0001, + "loss": 4.3527, + "loss/crossentropy": 2.1030094027519226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23401429504156113, + "step": 5984 + }, + { + "epoch": 0.11972, + "grad_norm": 2.421875, + "grad_norm_var": 0.0166412353515625, + "learning_rate": 0.0001, + "loss": 4.8857, + "loss/crossentropy": 2.469533920288086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2548582851886749, + "step": 5986 + }, + { + "epoch": 0.11976, + "grad_norm": 2.328125, + "grad_norm_var": 0.018047841389973958, + "learning_rate": 0.0001, + "loss": 4.167, + "loss/crossentropy": 2.140324354171753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22603372484445572, + "step": 5988 + }, + { + "epoch": 0.1198, + "grad_norm": 2.140625, + "grad_norm_var": 0.015773264567057292, + "learning_rate": 0.0001, + "loss": 4.3848, + "loss/crossentropy": 2.1848061084747314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22712922096252441, + "step": 5990 + }, + { + "epoch": 0.11984, + "grad_norm": 2.34375, + "grad_norm_var": 0.015380605061848959, + "learning_rate": 0.0001, + "loss": 4.6457, + "loss/crossentropy": 2.2872358560562134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2352529615163803, + "step": 5992 + }, + { + "epoch": 0.11988, + "grad_norm": 2.25, + "grad_norm_var": 0.012318674723307292, + "learning_rate": 0.0001, + "loss": 4.6136, + "loss/crossentropy": 2.082811713218689, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24107889831066132, + "step": 5994 + }, + { + "epoch": 0.11992, + "grad_norm": 2.109375, + "grad_norm_var": 0.013602447509765626, + "learning_rate": 0.0001, + "loss": 4.2089, + "loss/crossentropy": 2.23664391040802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23429522663354874, + "step": 5996 + }, + { + "epoch": 0.11996, + "grad_norm": 2.265625, + "grad_norm_var": 0.013242340087890625, + "learning_rate": 0.0001, + "loss": 4.5237, + "loss/crossentropy": 2.451270341873169, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2541816979646683, + "step": 5998 + }, + { + "epoch": 0.12, + "grad_norm": 2.109375, + "grad_norm_var": 0.013561757405598958, + "learning_rate": 0.0001, + "loss": 4.5559, + "loss/crossentropy": 2.1744157671928406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22349119931459427, + "step": 6000 + }, + { + "epoch": 0.12004, + "grad_norm": 2.09375, + "grad_norm_var": 0.013171132405598958, + "learning_rate": 0.0001, + "loss": 4.5532, + "loss/crossentropy": 2.0316836833953857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.259635865688324, + "step": 6002 + }, + { + "epoch": 0.12008, + "grad_norm": 2.609375, + "grad_norm_var": 0.022337849934895834, + "learning_rate": 0.0001, + "loss": 4.3674, + "loss/crossentropy": 2.0989437103271484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25056491047143936, + "step": 6004 + }, + { + "epoch": 0.12012, + "grad_norm": 2.5625, + "grad_norm_var": 0.02633056640625, + "learning_rate": 0.0001, + "loss": 4.0883, + "loss/crossentropy": 1.9609100818634033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22969383746385574, + "step": 6006 + }, + { + "epoch": 0.12016, + "grad_norm": 2.15625, + "grad_norm_var": 0.027665201822916666, + "learning_rate": 0.0001, + "loss": 4.2094, + "loss/crossentropy": 2.077883243560791, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22850078344345093, + "step": 6008 + }, + { + "epoch": 0.1202, + "grad_norm": 2.34375, + "grad_norm_var": 1.6465779622395833, + "learning_rate": 0.0001, + "loss": 4.5659, + "loss/crossentropy": 1.7967591285705566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2126556932926178, + "step": 6010 + }, + { + "epoch": 0.12024, + "grad_norm": 2.234375, + "grad_norm_var": 1.62222900390625, + "learning_rate": 0.0001, + "loss": 4.4018, + "loss/crossentropy": 1.6516226530075073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2188895046710968, + "step": 6012 + }, + { + "epoch": 0.12028, + "grad_norm": 2.21875, + "grad_norm_var": 1.62984619140625, + "learning_rate": 0.0001, + "loss": 4.3303, + "loss/crossentropy": 2.161388635635376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23447516560554504, + "step": 6014 + }, + { + "epoch": 0.12032, + "grad_norm": 2.328125, + "grad_norm_var": 1.6235636393229167, + "learning_rate": 0.0001, + "loss": 4.5112, + "loss/crossentropy": 1.9205461740493774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23797179013490677, + "step": 6016 + }, + { + "epoch": 0.12036, + "grad_norm": 2.3125, + "grad_norm_var": 1.616844685872396, + "learning_rate": 0.0001, + "loss": 4.8019, + "loss/crossentropy": 2.025223135948181, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25120896100997925, + "step": 6018 + }, + { + "epoch": 0.1204, + "grad_norm": 2.25, + "grad_norm_var": 1.6347005208333334, + "learning_rate": 0.0001, + "loss": 4.4819, + "loss/crossentropy": 1.957942008972168, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22228525578975677, + "step": 6020 + }, + { + "epoch": 0.12044, + "grad_norm": 2.171875, + "grad_norm_var": 1.674201456705729, + "learning_rate": 0.0001, + "loss": 4.6193, + "loss/crossentropy": 2.3325445652008057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26250994950532913, + "step": 6022 + }, + { + "epoch": 0.12048, + "grad_norm": 2.03125, + "grad_norm_var": 1.6957997639973958, + "learning_rate": 0.0001, + "loss": 4.264, + "loss/crossentropy": 2.0049667954444885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22990728169679642, + "step": 6024 + }, + { + "epoch": 0.12052, + "grad_norm": 2.265625, + "grad_norm_var": 0.08680013020833334, + "learning_rate": 0.0001, + "loss": 4.5996, + "loss/crossentropy": 2.0473387241363525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23628919571638107, + "step": 6026 + }, + { + "epoch": 0.12056, + "grad_norm": 1.9765625, + "grad_norm_var": 0.09107640584309896, + "learning_rate": 0.0001, + "loss": 4.109, + "loss/crossentropy": 2.013141930103302, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21728236973285675, + "step": 6028 + }, + { + "epoch": 0.1206, + "grad_norm": 2.109375, + "grad_norm_var": 0.09145278930664062, + "learning_rate": 0.0001, + "loss": 4.3869, + "loss/crossentropy": 2.1269132494926453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25139085948467255, + "step": 6030 + }, + { + "epoch": 0.12064, + "grad_norm": 2.234375, + "grad_norm_var": 0.09058405558268229, + "learning_rate": 0.0001, + "loss": 4.5568, + "loss/crossentropy": 2.5267512798309326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27998340129852295, + "step": 6032 + }, + { + "epoch": 0.12068, + "grad_norm": 2.78125, + "grad_norm_var": 0.10746027628580729, + "learning_rate": 0.0001, + "loss": 4.2615, + "loss/crossentropy": 1.8502249717712402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22863731533288956, + "step": 6034 + }, + { + "epoch": 0.12072, + "grad_norm": 2.34375, + "grad_norm_var": 0.10850601196289063, + "learning_rate": 0.0001, + "loss": 4.2197, + "loss/crossentropy": 1.7754456400871277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23809141665697098, + "step": 6036 + }, + { + "epoch": 0.12076, + "grad_norm": 2.203125, + "grad_norm_var": 0.033760325113932295, + "learning_rate": 0.0001, + "loss": 4.4714, + "loss/crossentropy": 1.9596920609474182, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2111937776207924, + "step": 6038 + }, + { + "epoch": 0.1208, + "grad_norm": 2.203125, + "grad_norm_var": 0.02995580037434896, + "learning_rate": 0.0001, + "loss": 4.3079, + "loss/crossentropy": 1.888563334941864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21698038280010223, + "step": 6040 + }, + { + "epoch": 0.12084, + "grad_norm": 2.296875, + "grad_norm_var": 0.031404368082682294, + "learning_rate": 0.0001, + "loss": 4.2412, + "loss/crossentropy": 2.1646993160247803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22735021263360977, + "step": 6042 + }, + { + "epoch": 0.12088, + "grad_norm": 2.1875, + "grad_norm_var": 0.0258941650390625, + "learning_rate": 0.0001, + "loss": 4.525, + "loss/crossentropy": 2.0790343284606934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2193024456501007, + "step": 6044 + }, + { + "epoch": 0.12092, + "grad_norm": 2.109375, + "grad_norm_var": 0.025211588541666666, + "learning_rate": 0.0001, + "loss": 4.3538, + "loss/crossentropy": 2.2733768224716187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.249516561627388, + "step": 6046 + }, + { + "epoch": 0.12096, + "grad_norm": 2.21875, + "grad_norm_var": 0.02431640625, + "learning_rate": 0.0001, + "loss": 4.7286, + "loss/crossentropy": 2.5003533363342285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22928690910339355, + "step": 6048 + }, + { + "epoch": 0.121, + "grad_norm": 2.453125, + "grad_norm_var": 0.03629150390625, + "learning_rate": 0.0001, + "loss": 4.6269, + "loss/crossentropy": 1.9606900215148926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2152085080742836, + "step": 6050 + }, + { + "epoch": 0.12104, + "grad_norm": 2.390625, + "grad_norm_var": 0.0384765625, + "learning_rate": 0.0001, + "loss": 4.5724, + "loss/crossentropy": 2.266395926475525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26055608689785004, + "step": 6052 + }, + { + "epoch": 0.12108, + "grad_norm": 2.296875, + "grad_norm_var": 0.037821451822916664, + "learning_rate": 0.0001, + "loss": 4.5841, + "loss/crossentropy": 2.1753041744232178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2327822595834732, + "step": 6054 + }, + { + "epoch": 0.12112, + "grad_norm": 2.15625, + "grad_norm_var": 0.041304524739583334, + "learning_rate": 0.0001, + "loss": 4.2198, + "loss/crossentropy": 1.775630235671997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2073052078485489, + "step": 6056 + }, + { + "epoch": 0.12116, + "grad_norm": 2.21875, + "grad_norm_var": 0.03975321451822917, + "learning_rate": 0.0001, + "loss": 4.486, + "loss/crossentropy": 2.415855050086975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.270721398293972, + "step": 6058 + }, + { + "epoch": 0.1212, + "grad_norm": 2.34375, + "grad_norm_var": 0.039453125, + "learning_rate": 0.0001, + "loss": 4.6647, + "loss/crossentropy": 2.2122162580490112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24723708629608154, + "step": 6060 + }, + { + "epoch": 0.12124, + "grad_norm": 2.375, + "grad_norm_var": 0.041792805989583334, + "learning_rate": 0.0001, + "loss": 4.3803, + "loss/crossentropy": 1.9540830850601196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21385761350393295, + "step": 6062 + }, + { + "epoch": 0.12128, + "grad_norm": 2.34375, + "grad_norm_var": 0.0413726806640625, + "learning_rate": 0.0001, + "loss": 4.979, + "loss/crossentropy": 1.923313319683075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22986505925655365, + "step": 6064 + }, + { + "epoch": 0.12132, + "grad_norm": 3.03125, + "grad_norm_var": 0.05182291666666667, + "learning_rate": 0.0001, + "loss": 4.6895, + "loss/crossentropy": 2.4042444229125977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30044034123420715, + "step": 6066 + }, + { + "epoch": 0.12136, + "grad_norm": 2.25, + "grad_norm_var": 0.049225870768229166, + "learning_rate": 0.0001, + "loss": 4.5857, + "loss/crossentropy": 2.2610549926757812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2628016769886017, + "step": 6068 + }, + { + "epoch": 0.1214, + "grad_norm": 2.140625, + "grad_norm_var": 0.0513671875, + "learning_rate": 0.0001, + "loss": 4.5024, + "loss/crossentropy": 1.9100797176361084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22663169354200363, + "step": 6070 + }, + { + "epoch": 0.12144, + "grad_norm": 2.34375, + "grad_norm_var": 0.04830729166666667, + "learning_rate": 0.0001, + "loss": 4.5916, + "loss/crossentropy": 2.3971651792526245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26831263303756714, + "step": 6072 + }, + { + "epoch": 0.12148, + "grad_norm": 2.453125, + "grad_norm_var": 0.0492095947265625, + "learning_rate": 0.0001, + "loss": 4.4553, + "loss/crossentropy": 2.106821596622467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23786011338233948, + "step": 6074 + }, + { + "epoch": 0.12152, + "grad_norm": 2.125, + "grad_norm_var": 0.052469889322916664, + "learning_rate": 0.0001, + "loss": 4.4225, + "loss/crossentropy": 2.1920535564422607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23812127113342285, + "step": 6076 + }, + { + "epoch": 0.12156, + "grad_norm": 2.203125, + "grad_norm_var": 0.049088541666666666, + "learning_rate": 0.0001, + "loss": 4.2314, + "loss/crossentropy": 2.3014419078826904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2380141019821167, + "step": 6078 + }, + { + "epoch": 0.1216, + "grad_norm": 2.265625, + "grad_norm_var": 0.04903055826822917, + "learning_rate": 0.0001, + "loss": 4.4558, + "loss/crossentropy": 2.1721781492233276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24158813059329987, + "step": 6080 + }, + { + "epoch": 0.12164, + "grad_norm": 2.21875, + "grad_norm_var": 0.013232421875, + "learning_rate": 0.0001, + "loss": 4.5192, + "loss/crossentropy": 2.1230934858322144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23764102160930634, + "step": 6082 + }, + { + "epoch": 0.12168, + "grad_norm": 2.296875, + "grad_norm_var": 0.0166412353515625, + "learning_rate": 0.0001, + "loss": 4.2147, + "loss/crossentropy": 1.9570311307907104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24687693268060684, + "step": 6084 + }, + { + "epoch": 0.12172, + "grad_norm": 2.15625, + "grad_norm_var": 0.014997355143229167, + "learning_rate": 0.0001, + "loss": 4.2063, + "loss/crossentropy": 2.272383213043213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2567671462893486, + "step": 6086 + }, + { + "epoch": 0.12176, + "grad_norm": 2.1875, + "grad_norm_var": 0.009598795572916667, + "learning_rate": 0.0001, + "loss": 4.645, + "loss/crossentropy": 2.216492176055908, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24377377331256866, + "step": 6088 + }, + { + "epoch": 0.1218, + "grad_norm": 2.140625, + "grad_norm_var": 0.007201131184895833, + "learning_rate": 0.0001, + "loss": 4.5956, + "loss/crossentropy": 2.25177001953125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23176074773073196, + "step": 6090 + }, + { + "epoch": 0.12184, + "grad_norm": 2.21875, + "grad_norm_var": 0.009098307291666666, + "learning_rate": 0.0001, + "loss": 4.1966, + "loss/crossentropy": 2.2452452182769775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24528006464242935, + "step": 6092 + }, + { + "epoch": 0.12188, + "grad_norm": 2.25, + "grad_norm_var": 0.0091949462890625, + "learning_rate": 0.0001, + "loss": 4.3548, + "loss/crossentropy": 2.078445553779602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2276337668299675, + "step": 6094 + }, + { + "epoch": 0.12192, + "grad_norm": 2.125, + "grad_norm_var": 0.00982666015625, + "learning_rate": 0.0001, + "loss": 4.4435, + "loss/crossentropy": 2.2938032150268555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2784377336502075, + "step": 6096 + }, + { + "epoch": 0.12196, + "grad_norm": 2.015625, + "grad_norm_var": 0.0136627197265625, + "learning_rate": 0.0001, + "loss": 4.1588, + "loss/crossentropy": 2.028180480003357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23716723918914795, + "step": 6098 + }, + { + "epoch": 0.122, + "grad_norm": 2.609375, + "grad_norm_var": 0.020580037434895834, + "learning_rate": 0.0001, + "loss": 4.6345, + "loss/crossentropy": 2.4959323406219482, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2510451450943947, + "step": 6100 + }, + { + "epoch": 0.12204, + "grad_norm": 2.21875, + "grad_norm_var": 0.023856608072916667, + "learning_rate": 0.0001, + "loss": 4.6639, + "loss/crossentropy": 2.186043620109558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23511512577533722, + "step": 6102 + }, + { + "epoch": 0.12208, + "grad_norm": 2.359375, + "grad_norm_var": 0.024442545572916665, + "learning_rate": 0.0001, + "loss": 4.6727, + "loss/crossentropy": 2.6631078720092773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25689610838890076, + "step": 6104 + }, + { + "epoch": 0.12212, + "grad_norm": 2.453125, + "grad_norm_var": 0.025609334309895832, + "learning_rate": 0.0001, + "loss": 4.7943, + "loss/crossentropy": 2.310486674308777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26937828958034515, + "step": 6106 + }, + { + "epoch": 0.12216, + "grad_norm": 2.359375, + "grad_norm_var": 0.0247711181640625, + "learning_rate": 0.0001, + "loss": 4.9892, + "loss/crossentropy": 2.2036240100860596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24968907237052917, + "step": 6108 + }, + { + "epoch": 0.1222, + "grad_norm": 2.984375, + "grad_norm_var": 0.058251953125, + "learning_rate": 0.0001, + "loss": 4.3527, + "loss/crossentropy": 1.9434874057769775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23247701674699783, + "step": 6110 + }, + { + "epoch": 0.12224, + "grad_norm": 2.296875, + "grad_norm_var": 0.05579020182291667, + "learning_rate": 0.0001, + "loss": 4.4149, + "loss/crossentropy": 2.0525330305099487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24810528755187988, + "step": 6112 + }, + { + "epoch": 0.12228, + "grad_norm": 2.328125, + "grad_norm_var": 0.04482320149739583, + "learning_rate": 0.0001, + "loss": 4.5205, + "loss/crossentropy": 2.0085532665252686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2514277398586273, + "step": 6114 + }, + { + "epoch": 0.12232, + "grad_norm": 2.328125, + "grad_norm_var": 0.04120686848958333, + "learning_rate": 0.0001, + "loss": 4.5745, + "loss/crossentropy": 1.7393967509269714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2239851951599121, + "step": 6116 + }, + { + "epoch": 0.12236, + "grad_norm": 2.546875, + "grad_norm_var": 0.04327799479166667, + "learning_rate": 0.0001, + "loss": 4.7332, + "loss/crossentropy": 1.8714343905448914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22823140025138855, + "step": 6118 + }, + { + "epoch": 0.1224, + "grad_norm": 2.3125, + "grad_norm_var": 0.04192708333333333, + "learning_rate": 0.0001, + "loss": 4.6177, + "loss/crossentropy": 1.9353562593460083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22261886298656464, + "step": 6120 + }, + { + "epoch": 0.12244, + "grad_norm": 2.171875, + "grad_norm_var": 0.0469635009765625, + "learning_rate": 0.0001, + "loss": 4.3154, + "loss/crossentropy": 2.0409420132637024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2319282591342926, + "step": 6122 + }, + { + "epoch": 0.12248, + "grad_norm": 2.125, + "grad_norm_var": 0.04882405598958333, + "learning_rate": 0.0001, + "loss": 4.4764, + "loss/crossentropy": 2.2461307048797607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25047267973423004, + "step": 6124 + }, + { + "epoch": 0.12252, + "grad_norm": 2.203125, + "grad_norm_var": 0.014012654622395834, + "learning_rate": 0.0001, + "loss": 4.4863, + "loss/crossentropy": 2.1060246229171753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24702349305152893, + "step": 6126 + }, + { + "epoch": 0.12256, + "grad_norm": 2.21875, + "grad_norm_var": 0.01718724568684896, + "learning_rate": 0.0001, + "loss": 3.9093, + "loss/crossentropy": 1.7481068968772888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19611438363790512, + "step": 6128 + }, + { + "epoch": 0.1226, + "grad_norm": 2.21875, + "grad_norm_var": 0.017329661051432292, + "learning_rate": 0.0001, + "loss": 4.5506, + "loss/crossentropy": 1.8341861963272095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20660528540611267, + "step": 6130 + }, + { + "epoch": 0.12264, + "grad_norm": 2.421875, + "grad_norm_var": 0.018507639567057293, + "learning_rate": 0.0001, + "loss": 4.3388, + "loss/crossentropy": 1.7332024574279785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21615543216466904, + "step": 6132 + }, + { + "epoch": 0.12268, + "grad_norm": 2.078125, + "grad_norm_var": 0.013203684488932292, + "learning_rate": 0.0001, + "loss": 4.3027, + "loss/crossentropy": 1.8959643244743347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.219951793551445, + "step": 6134 + }, + { + "epoch": 0.12272, + "grad_norm": 2.265625, + "grad_norm_var": 0.012601470947265625, + "learning_rate": 0.0001, + "loss": 4.6945, + "loss/crossentropy": 2.234723210334778, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2342153787612915, + "step": 6136 + }, + { + "epoch": 0.12276, + "grad_norm": 2.3125, + "grad_norm_var": 0.011637115478515625, + "learning_rate": 0.0001, + "loss": 4.7394, + "loss/crossentropy": 2.2297592759132385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24097590148448944, + "step": 6138 + }, + { + "epoch": 0.1228, + "grad_norm": 2.40625, + "grad_norm_var": 0.013586171468098958, + "learning_rate": 0.0001, + "loss": 4.592, + "loss/crossentropy": 2.013442814350128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2532622739672661, + "step": 6140 + }, + { + "epoch": 0.12284, + "grad_norm": 2.15625, + "grad_norm_var": 0.014833323160807292, + "learning_rate": 0.0001, + "loss": 4.417, + "loss/crossentropy": 2.3066656589508057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24480555206537247, + "step": 6142 + }, + { + "epoch": 0.12288, + "grad_norm": 2.4375, + "grad_norm_var": 0.014241536458333334, + "learning_rate": 0.0001, + "loss": 4.4156, + "loss/crossentropy": 1.9281310439109802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22363708168268204, + "step": 6144 + }, + { + "epoch": 0.12292, + "grad_norm": 2.078125, + "grad_norm_var": 0.016792805989583333, + "learning_rate": 0.0001, + "loss": 4.4314, + "loss/crossentropy": 1.965875267982483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22153983265161514, + "step": 6146 + }, + { + "epoch": 0.12296, + "grad_norm": 2.8125, + "grad_norm_var": 0.03536783854166667, + "learning_rate": 0.0001, + "loss": 4.6139, + "loss/crossentropy": 1.811126947402954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21779045462608337, + "step": 6148 + }, + { + "epoch": 0.123, + "grad_norm": 2.34375, + "grad_norm_var": 0.0311676025390625, + "learning_rate": 0.0001, + "loss": 4.4855, + "loss/crossentropy": 2.5114762783050537, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2542252689599991, + "step": 6150 + }, + { + "epoch": 0.12304, + "grad_norm": 2.296875, + "grad_norm_var": 0.028271484375, + "learning_rate": 0.0001, + "loss": 4.1269, + "loss/crossentropy": 1.8784565925598145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24637068808078766, + "step": 6152 + }, + { + "epoch": 0.12308, + "grad_norm": 2.234375, + "grad_norm_var": 0.028685506184895834, + "learning_rate": 0.0001, + "loss": 4.3377, + "loss/crossentropy": 1.6900760531425476, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2170872688293457, + "step": 6154 + }, + { + "epoch": 0.12312, + "grad_norm": 2.21875, + "grad_norm_var": 0.02867431640625, + "learning_rate": 0.0001, + "loss": 4.5705, + "loss/crossentropy": 2.413783550262451, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2624947279691696, + "step": 6156 + }, + { + "epoch": 0.12316, + "grad_norm": 2.03125, + "grad_norm_var": 0.030126953125, + "learning_rate": 0.0001, + "loss": 4.3463, + "loss/crossentropy": 2.0598058104515076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2071695551276207, + "step": 6158 + }, + { + "epoch": 0.1232, + "grad_norm": 2.375, + "grad_norm_var": 0.029781087239583334, + "learning_rate": 0.0001, + "loss": 4.6702, + "loss/crossentropy": 2.0407246947288513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24353720247745514, + "step": 6160 + }, + { + "epoch": 0.12324, + "grad_norm": 2.171875, + "grad_norm_var": 0.028450520833333333, + "learning_rate": 0.0001, + "loss": 4.3116, + "loss/crossentropy": 1.9608840346336365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20965111255645752, + "step": 6162 + }, + { + "epoch": 0.12328, + "grad_norm": 2.046875, + "grad_norm_var": 0.01640625, + "learning_rate": 0.0001, + "loss": 4.2146, + "loss/crossentropy": 2.0231454372406006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.219897098839283, + "step": 6164 + }, + { + "epoch": 0.12332, + "grad_norm": 2.25, + "grad_norm_var": 0.015999348958333333, + "learning_rate": 0.0001, + "loss": 4.614, + "loss/crossentropy": 2.217663288116455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2466331645846367, + "step": 6166 + }, + { + "epoch": 0.12336, + "grad_norm": 2.234375, + "grad_norm_var": 0.01597900390625, + "learning_rate": 0.0001, + "loss": 4.4135, + "loss/crossentropy": 1.6825732588768005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1999632865190506, + "step": 6168 + }, + { + "epoch": 0.1234, + "grad_norm": 2.5625, + "grad_norm_var": 0.020897420247395833, + "learning_rate": 0.0001, + "loss": 4.4552, + "loss/crossentropy": 2.288950800895691, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26286834478378296, + "step": 6170 + }, + { + "epoch": 0.12344, + "grad_norm": 2.453125, + "grad_norm_var": 0.023225911458333335, + "learning_rate": 0.0001, + "loss": 4.3376, + "loss/crossentropy": 1.8202561140060425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22252517193555832, + "step": 6172 + }, + { + "epoch": 0.12348, + "grad_norm": 2.3125, + "grad_norm_var": 0.019710286458333334, + "learning_rate": 0.0001, + "loss": 4.5023, + "loss/crossentropy": 2.1007159948349, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24983422458171844, + "step": 6174 + }, + { + "epoch": 0.12352, + "grad_norm": 2.171875, + "grad_norm_var": 0.0199371337890625, + "learning_rate": 0.0001, + "loss": 4.4513, + "loss/crossentropy": 2.0953084230422974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25545743107795715, + "step": 6176 + }, + { + "epoch": 0.12356, + "grad_norm": 2.171875, + "grad_norm_var": 0.019001261393229166, + "learning_rate": 0.0001, + "loss": 4.4315, + "loss/crossentropy": 2.107246518135071, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22741339355707169, + "step": 6178 + }, + { + "epoch": 0.1236, + "grad_norm": 2.265625, + "grad_norm_var": 0.013118489583333334, + "learning_rate": 0.0001, + "loss": 4.4982, + "loss/crossentropy": 2.1219626665115356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22499094158411026, + "step": 6180 + }, + { + "epoch": 0.12364, + "grad_norm": 2.25, + "grad_norm_var": 0.013093058268229167, + "learning_rate": 0.0001, + "loss": 4.4263, + "loss/crossentropy": 1.8892266154289246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2255062311887741, + "step": 6182 + }, + { + "epoch": 0.12368, + "grad_norm": 2.21875, + "grad_norm_var": 0.015315755208333334, + "learning_rate": 0.0001, + "loss": 4.5884, + "loss/crossentropy": 2.0765860080718994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24712087213993073, + "step": 6184 + }, + { + "epoch": 0.12372, + "grad_norm": 2.296875, + "grad_norm_var": 0.010074869791666666, + "learning_rate": 0.0001, + "loss": 4.5431, + "loss/crossentropy": 2.301337718963623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24419523775577545, + "step": 6186 + }, + { + "epoch": 0.12376, + "grad_norm": 2.359375, + "grad_norm_var": 0.008373006184895834, + "learning_rate": 0.0001, + "loss": 4.5043, + "loss/crossentropy": 2.0489712953567505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25768278539180756, + "step": 6188 + }, + { + "epoch": 0.1238, + "grad_norm": 2.140625, + "grad_norm_var": 0.0128082275390625, + "learning_rate": 0.0001, + "loss": 4.2654, + "loss/crossentropy": 1.9328826069831848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22916094958782196, + "step": 6190 + }, + { + "epoch": 0.12384, + "grad_norm": 2.25, + "grad_norm_var": 0.01171875, + "learning_rate": 0.0001, + "loss": 4.1574, + "loss/crossentropy": 1.6330446004867554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20600398629903793, + "step": 6192 + }, + { + "epoch": 0.12388, + "grad_norm": 2.234375, + "grad_norm_var": 0.0111724853515625, + "learning_rate": 0.0001, + "loss": 4.4769, + "loss/crossentropy": 1.988203227519989, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22711507230997086, + "step": 6194 + }, + { + "epoch": 0.12392, + "grad_norm": 1.9609375, + "grad_norm_var": 0.01590143839518229, + "learning_rate": 0.0001, + "loss": 4.125, + "loss/crossentropy": 2.036049246788025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2110549360513687, + "step": 6196 + }, + { + "epoch": 0.12396, + "grad_norm": 2.265625, + "grad_norm_var": 0.01624120076497396, + "learning_rate": 0.0001, + "loss": 4.5546, + "loss/crossentropy": 2.217681884765625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23705793917179108, + "step": 6198 + }, + { + "epoch": 0.124, + "grad_norm": 2.234375, + "grad_norm_var": 0.015276845296223958, + "learning_rate": 0.0001, + "loss": 4.4803, + "loss/crossentropy": 2.3878824710845947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24000737071037292, + "step": 6200 + }, + { + "epoch": 0.12404, + "grad_norm": 2.265625, + "grad_norm_var": 0.015852610270182293, + "learning_rate": 0.0001, + "loss": 4.4754, + "loss/crossentropy": 2.2790249586105347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26150786131620407, + "step": 6202 + }, + { + "epoch": 0.12408, + "grad_norm": 2.1875, + "grad_norm_var": 0.015036773681640626, + "learning_rate": 0.0001, + "loss": 4.4703, + "loss/crossentropy": 2.251123785972595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25231797993183136, + "step": 6204 + }, + { + "epoch": 0.12412, + "grad_norm": 2.171875, + "grad_norm_var": 0.010802968343098959, + "learning_rate": 0.0001, + "loss": 4.5294, + "loss/crossentropy": 1.8977670073509216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21783004701137543, + "step": 6206 + }, + { + "epoch": 0.12416, + "grad_norm": 2.265625, + "grad_norm_var": 0.012894439697265624, + "learning_rate": 0.0001, + "loss": 4.6458, + "loss/crossentropy": 2.385319232940674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29223111271858215, + "step": 6208 + }, + { + "epoch": 0.1242, + "grad_norm": 2.40625, + "grad_norm_var": 0.015964508056640625, + "learning_rate": 0.0001, + "loss": 4.6555, + "loss/crossentropy": 1.9274529218673706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22827968001365662, + "step": 6210 + }, + { + "epoch": 0.12424, + "grad_norm": 2.40625, + "grad_norm_var": 0.01226806640625, + "learning_rate": 0.0001, + "loss": 4.8232, + "loss/crossentropy": 2.1861478090286255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2424854040145874, + "step": 6212 + }, + { + "epoch": 0.12428, + "grad_norm": 2.171875, + "grad_norm_var": 0.01207275390625, + "learning_rate": 0.0001, + "loss": 4.3002, + "loss/crossentropy": 2.2234357595443726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2471691370010376, + "step": 6214 + }, + { + "epoch": 0.12432, + "grad_norm": 2.625, + "grad_norm_var": 0.022541300455729166, + "learning_rate": 0.0001, + "loss": 4.5216, + "loss/crossentropy": 2.365513563156128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2640291824936867, + "step": 6216 + }, + { + "epoch": 0.12436, + "grad_norm": 2.21875, + "grad_norm_var": 0.021955362955729165, + "learning_rate": 0.0001, + "loss": 4.282, + "loss/crossentropy": 1.964316964149475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22341662645339966, + "step": 6218 + }, + { + "epoch": 0.1244, + "grad_norm": 2.3125, + "grad_norm_var": 0.021805826822916666, + "learning_rate": 0.0001, + "loss": 4.7078, + "loss/crossentropy": 2.3704408407211304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24599966406822205, + "step": 6220 + }, + { + "epoch": 0.12444, + "grad_norm": 2.1875, + "grad_norm_var": 0.0193756103515625, + "learning_rate": 0.0001, + "loss": 4.7558, + "loss/crossentropy": 2.1461408138275146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25205330550670624, + "step": 6222 + }, + { + "epoch": 0.12448, + "grad_norm": 2.21875, + "grad_norm_var": 0.018745930989583333, + "learning_rate": 0.0001, + "loss": 4.7032, + "loss/crossentropy": 2.3997104167938232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23371660709381104, + "step": 6224 + }, + { + "epoch": 0.12452, + "grad_norm": 2.09375, + "grad_norm_var": 0.0212890625, + "learning_rate": 0.0001, + "loss": 4.1842, + "loss/crossentropy": 1.7976875305175781, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21662414073944092, + "step": 6226 + }, + { + "epoch": 0.12456, + "grad_norm": 2.140625, + "grad_norm_var": 0.02008056640625, + "learning_rate": 0.0001, + "loss": 4.2214, + "loss/crossentropy": 1.9929583668708801, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21378497034311295, + "step": 6228 + }, + { + "epoch": 0.1246, + "grad_norm": 2.21875, + "grad_norm_var": 0.02017822265625, + "learning_rate": 0.0001, + "loss": 4.2982, + "loss/crossentropy": 1.8853623867034912, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21267645806074142, + "step": 6230 + }, + { + "epoch": 0.12464, + "grad_norm": 2.234375, + "grad_norm_var": 0.008454386393229167, + "learning_rate": 0.0001, + "loss": 4.2638, + "loss/crossentropy": 2.2534812688827515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23003943264484406, + "step": 6232 + }, + { + "epoch": 0.12468, + "grad_norm": 2.0, + "grad_norm_var": 0.010863240559895833, + "learning_rate": 0.0001, + "loss": 4.2504, + "loss/crossentropy": 2.026564121246338, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2172931283712387, + "step": 6234 + }, + { + "epoch": 0.12472, + "grad_norm": 2.25, + "grad_norm_var": 0.01021728515625, + "learning_rate": 0.0001, + "loss": 4.3555, + "loss/crossentropy": 1.9323118925094604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22170638293027878, + "step": 6236 + }, + { + "epoch": 0.12476, + "grad_norm": 2.296875, + "grad_norm_var": 0.007938639322916666, + "learning_rate": 0.0001, + "loss": 4.479, + "loss/crossentropy": 2.056011915206909, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2656880244612694, + "step": 6238 + }, + { + "epoch": 0.1248, + "grad_norm": 2.296875, + "grad_norm_var": 0.00830078125, + "learning_rate": 0.0001, + "loss": 4.6288, + "loss/crossentropy": 2.095108926296234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23341026157140732, + "step": 6240 + }, + { + "epoch": 0.12484, + "grad_norm": 2.4375, + "grad_norm_var": 0.011555989583333334, + "learning_rate": 0.0001, + "loss": 4.737, + "loss/crossentropy": 2.0198334455490112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24095112830400467, + "step": 6242 + }, + { + "epoch": 0.12488, + "grad_norm": 2.09375, + "grad_norm_var": 0.012105305989583334, + "learning_rate": 0.0001, + "loss": 4.3745, + "loss/crossentropy": 2.0397544503211975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22567399591207504, + "step": 6244 + }, + { + "epoch": 0.12492, + "grad_norm": 2.1875, + "grad_norm_var": 0.0120513916015625, + "learning_rate": 0.0001, + "loss": 4.3633, + "loss/crossentropy": 1.9094319343566895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22690637409687042, + "step": 6246 + }, + { + "epoch": 0.12496, + "grad_norm": 2.265625, + "grad_norm_var": 0.011295572916666666, + "learning_rate": 0.0001, + "loss": 4.6474, + "loss/crossentropy": 2.27765429019928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25075703859329224, + "step": 6248 + }, + { + "epoch": 0.125, + "grad_norm": 2.609375, + "grad_norm_var": 0.017023722330729168, + "learning_rate": 0.0001, + "loss": 4.5769, + "loss/crossentropy": 2.0027456283569336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24267998337745667, + "step": 6250 + }, + { + "epoch": 0.12504, + "grad_norm": 2.359375, + "grad_norm_var": 0.017723592122395833, + "learning_rate": 0.0001, + "loss": 4.5398, + "loss/crossentropy": 2.276857614517212, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24048637598752975, + "step": 6252 + }, + { + "epoch": 0.12508, + "grad_norm": 2.390625, + "grad_norm_var": 0.018065388997395834, + "learning_rate": 0.0001, + "loss": 4.7603, + "loss/crossentropy": 2.1234214305877686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23481453210115433, + "step": 6254 + }, + { + "epoch": 0.12512, + "grad_norm": 2.28125, + "grad_norm_var": 0.019624837239583335, + "learning_rate": 0.0001, + "loss": 4.7537, + "loss/crossentropy": 2.201158881187439, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23473594337701797, + "step": 6256 + }, + { + "epoch": 0.12516, + "grad_norm": 2.296875, + "grad_norm_var": 0.017220052083333333, + "learning_rate": 0.0001, + "loss": 4.7202, + "loss/crossentropy": 2.3437804579734802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24881581962108612, + "step": 6258 + }, + { + "epoch": 0.1252, + "grad_norm": 2.265625, + "grad_norm_var": 0.015461222330729166, + "learning_rate": 0.0001, + "loss": 4.5938, + "loss/crossentropy": 2.0984586477279663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24332843720912933, + "step": 6260 + }, + { + "epoch": 0.12524, + "grad_norm": 2.390625, + "grad_norm_var": 0.016942342122395832, + "learning_rate": 0.0001, + "loss": 4.2317, + "loss/crossentropy": 1.7946080565452576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21510492265224457, + "step": 6262 + }, + { + "epoch": 0.12528, + "grad_norm": 2.171875, + "grad_norm_var": 0.016161092122395835, + "learning_rate": 0.0001, + "loss": 4.3889, + "loss/crossentropy": 2.022711932659149, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22890903800725937, + "step": 6264 + }, + { + "epoch": 0.12532, + "grad_norm": 2.046875, + "grad_norm_var": 0.009577433268229166, + "learning_rate": 0.0001, + "loss": 4.288, + "loss/crossentropy": 1.9752087593078613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21972095221281052, + "step": 6266 + }, + { + "epoch": 0.12536, + "grad_norm": 2.234375, + "grad_norm_var": 0.008625284830729166, + "learning_rate": 0.0001, + "loss": 4.4952, + "loss/crossentropy": 1.7267251014709473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21305006742477417, + "step": 6268 + }, + { + "epoch": 0.1254, + "grad_norm": 2.109375, + "grad_norm_var": 0.008869425455729166, + "learning_rate": 0.0001, + "loss": 4.2164, + "loss/crossentropy": 2.249786615371704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24875187873840332, + "step": 6270 + }, + { + "epoch": 0.12544, + "grad_norm": 2.46875, + "grad_norm_var": 0.012190755208333333, + "learning_rate": 0.0001, + "loss": 4.693, + "loss/crossentropy": 2.402994990348816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25536587834358215, + "step": 6272 + }, + { + "epoch": 0.12548, + "grad_norm": 2.46875, + "grad_norm_var": 0.016243489583333333, + "learning_rate": 0.0001, + "loss": 4.606, + "loss/crossentropy": 2.240913987159729, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3013365715742111, + "step": 6274 + }, + { + "epoch": 0.12552, + "grad_norm": 2.390625, + "grad_norm_var": 0.019071451822916665, + "learning_rate": 0.0001, + "loss": 4.4691, + "loss/crossentropy": 2.0767332911491394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22113215178251266, + "step": 6276 + }, + { + "epoch": 0.12556, + "grad_norm": 2.140625, + "grad_norm_var": 0.02041015625, + "learning_rate": 0.0001, + "loss": 4.608, + "loss/crossentropy": 1.8625048995018005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20321352779865265, + "step": 6278 + }, + { + "epoch": 0.1256, + "grad_norm": 2.3125, + "grad_norm_var": 0.024072265625, + "learning_rate": 0.0001, + "loss": 4.0386, + "loss/crossentropy": 1.9143638610839844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22085773944854736, + "step": 6280 + }, + { + "epoch": 0.12564, + "grad_norm": 2.34375, + "grad_norm_var": 0.023192342122395834, + "learning_rate": 0.0001, + "loss": 4.372, + "loss/crossentropy": 2.3756210803985596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26027245819568634, + "step": 6282 + }, + { + "epoch": 0.12568, + "grad_norm": 2.21875, + "grad_norm_var": 0.02427978515625, + "learning_rate": 0.0001, + "loss": 4.5526, + "loss/crossentropy": 1.9310896396636963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2086925357580185, + "step": 6284 + }, + { + "epoch": 0.12572, + "grad_norm": 2.34375, + "grad_norm_var": 0.021675618489583333, + "learning_rate": 0.0001, + "loss": 4.4751, + "loss/crossentropy": 2.1757423877716064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.241075336933136, + "step": 6286 + }, + { + "epoch": 0.12576, + "grad_norm": 2.265625, + "grad_norm_var": 0.021284993489583334, + "learning_rate": 0.0001, + "loss": 4.0946, + "loss/crossentropy": 2.3057546615600586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24882248044013977, + "step": 6288 + }, + { + "epoch": 0.1258, + "grad_norm": 3.140625, + "grad_norm_var": 0.06883036295572917, + "learning_rate": 0.0001, + "loss": 4.5211, + "loss/crossentropy": 2.1551633477211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2796258181333542, + "step": 6290 + }, + { + "epoch": 0.12584, + "grad_norm": 2.296875, + "grad_norm_var": 0.06608072916666667, + "learning_rate": 0.0001, + "loss": 4.4915, + "loss/crossentropy": 2.0627459287643433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24781616777181625, + "step": 6292 + }, + { + "epoch": 0.12588, + "grad_norm": 2.453125, + "grad_norm_var": 0.06463216145833334, + "learning_rate": 0.0001, + "loss": 4.5896, + "loss/crossentropy": 1.795321524143219, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2113223671913147, + "step": 6294 + }, + { + "epoch": 0.12592, + "grad_norm": 2.171875, + "grad_norm_var": 0.06004130045572917, + "learning_rate": 0.0001, + "loss": 4.4988, + "loss/crossentropy": 2.323319673538208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24748852849006653, + "step": 6296 + }, + { + "epoch": 0.12596, + "grad_norm": 2.296875, + "grad_norm_var": 0.05998942057291667, + "learning_rate": 0.0001, + "loss": 4.4801, + "loss/crossentropy": 2.053200662136078, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2278411090373993, + "step": 6298 + }, + { + "epoch": 0.126, + "grad_norm": 2.4375, + "grad_norm_var": 0.060155232747395836, + "learning_rate": 0.0001, + "loss": 4.4605, + "loss/crossentropy": 2.102539896965027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26071713864803314, + "step": 6300 + }, + { + "epoch": 0.12604, + "grad_norm": 2.265625, + "grad_norm_var": 0.059554036458333334, + "learning_rate": 0.0001, + "loss": 4.4613, + "loss/crossentropy": 1.9316620826721191, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23007915914058685, + "step": 6302 + }, + { + "epoch": 0.12608, + "grad_norm": 2.6875, + "grad_norm_var": 0.06288655598958333, + "learning_rate": 0.0001, + "loss": 4.4194, + "loss/crossentropy": 1.9966526627540588, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2424854189157486, + "step": 6304 + }, + { + "epoch": 0.12612, + "grad_norm": 2.046875, + "grad_norm_var": 0.022541300455729166, + "learning_rate": 0.0001, + "loss": 4.2382, + "loss/crossentropy": 1.895868957042694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2255951091647148, + "step": 6306 + }, + { + "epoch": 0.12616, + "grad_norm": 2.171875, + "grad_norm_var": 0.023688761393229167, + "learning_rate": 0.0001, + "loss": 4.7758, + "loss/crossentropy": 2.3897345066070557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24461720883846283, + "step": 6308 + }, + { + "epoch": 0.1262, + "grad_norm": 2.40625, + "grad_norm_var": 0.022858683268229166, + "learning_rate": 0.0001, + "loss": 4.8275, + "loss/crossentropy": 2.059292197227478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29264035820961, + "step": 6310 + }, + { + "epoch": 0.12624, + "grad_norm": 2.390625, + "grad_norm_var": 0.0221343994140625, + "learning_rate": 0.0001, + "loss": 4.5098, + "loss/crossentropy": 1.8142406344413757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22127485275268555, + "step": 6312 + }, + { + "epoch": 0.12628, + "grad_norm": 2.109375, + "grad_norm_var": 0.0245025634765625, + "learning_rate": 0.0001, + "loss": 4.2349, + "loss/crossentropy": 2.0186127424240112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.217330664396286, + "step": 6314 + }, + { + "epoch": 0.12632, + "grad_norm": 2.265625, + "grad_norm_var": 0.024820963541666668, + "learning_rate": 0.0001, + "loss": 4.0102, + "loss/crossentropy": 1.9439889192581177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2108919695019722, + "step": 6316 + }, + { + "epoch": 0.12636, + "grad_norm": 2.28125, + "grad_norm_var": 0.024690755208333335, + "learning_rate": 0.0001, + "loss": 4.8672, + "loss/crossentropy": 2.617791175842285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2756097912788391, + "step": 6318 + }, + { + "epoch": 0.1264, + "grad_norm": 2.515625, + "grad_norm_var": 0.017609659830729166, + "learning_rate": 0.0001, + "loss": 4.3918, + "loss/crossentropy": 1.9074286818504333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22491587698459625, + "step": 6320 + }, + { + "epoch": 0.12644, + "grad_norm": 2.09375, + "grad_norm_var": 0.016331990559895832, + "learning_rate": 0.0001, + "loss": 4.4304, + "loss/crossentropy": 2.1414809226989746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22703612595796585, + "step": 6322 + }, + { + "epoch": 0.12648, + "grad_norm": 2.28125, + "grad_norm_var": 0.0154205322265625, + "learning_rate": 0.0001, + "loss": 4.4982, + "loss/crossentropy": 2.291784942150116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24490328133106232, + "step": 6324 + }, + { + "epoch": 0.12652, + "grad_norm": 2.1875, + "grad_norm_var": 0.0143218994140625, + "learning_rate": 0.0001, + "loss": 4.228, + "loss/crossentropy": 1.938249409198761, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22611317038536072, + "step": 6326 + }, + { + "epoch": 0.12656, + "grad_norm": 2.03125, + "grad_norm_var": 0.0187408447265625, + "learning_rate": 0.0001, + "loss": 4.4932, + "loss/crossentropy": 1.7510024905204773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30439992994070053, + "step": 6328 + }, + { + "epoch": 0.1266, + "grad_norm": 2.46875, + "grad_norm_var": 0.022907511393229166, + "learning_rate": 0.0001, + "loss": 4.4351, + "loss/crossentropy": 2.3168221712112427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25660980492830276, + "step": 6330 + }, + { + "epoch": 0.12664, + "grad_norm": 2.046875, + "grad_norm_var": 0.024409993489583334, + "learning_rate": 0.0001, + "loss": 4.5002, + "loss/crossentropy": 1.9957427978515625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21420849114656448, + "step": 6332 + }, + { + "epoch": 0.12668, + "grad_norm": 2.0625, + "grad_norm_var": 0.025104777018229166, + "learning_rate": 0.0001, + "loss": 4.3583, + "loss/crossentropy": 2.4371464252471924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2576342821121216, + "step": 6334 + }, + { + "epoch": 0.12672, + "grad_norm": 2.171875, + "grad_norm_var": 0.021458943684895832, + "learning_rate": 0.0001, + "loss": 4.2497, + "loss/crossentropy": 1.910677433013916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21360614150762558, + "step": 6336 + }, + { + "epoch": 0.12676, + "grad_norm": 2.203125, + "grad_norm_var": 0.020750935872395834, + "learning_rate": 0.0001, + "loss": 4.2679, + "loss/crossentropy": 2.057362914085388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23079442232847214, + "step": 6338 + }, + { + "epoch": 0.1268, + "grad_norm": 2.21875, + "grad_norm_var": 0.0191070556640625, + "learning_rate": 0.0001, + "loss": 4.1818, + "loss/crossentropy": 2.0249438881874084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24038050323724747, + "step": 6340 + }, + { + "epoch": 0.12684, + "grad_norm": 2.25, + "grad_norm_var": 0.019074503580729166, + "learning_rate": 0.0001, + "loss": 4.5374, + "loss/crossentropy": 2.0371533632278442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2355760782957077, + "step": 6342 + }, + { + "epoch": 0.12688, + "grad_norm": 2.203125, + "grad_norm_var": 0.0130523681640625, + "learning_rate": 0.0001, + "loss": 4.4849, + "loss/crossentropy": 2.3101218938827515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24161705374717712, + "step": 6344 + }, + { + "epoch": 0.12692, + "grad_norm": 2.25, + "grad_norm_var": 0.010261027018229167, + "learning_rate": 0.0001, + "loss": 4.6399, + "loss/crossentropy": 2.1990097761154175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2537754699587822, + "step": 6346 + }, + { + "epoch": 0.12696, + "grad_norm": 2.125, + "grad_norm_var": 0.0136383056640625, + "learning_rate": 0.0001, + "loss": 4.2697, + "loss/crossentropy": 2.1783597469329834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2106732353568077, + "step": 6348 + }, + { + "epoch": 0.127, + "grad_norm": 2.359375, + "grad_norm_var": 0.012987263997395833, + "learning_rate": 0.0001, + "loss": 4.5127, + "loss/crossentropy": 2.2316598892211914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23841773718595505, + "step": 6350 + }, + { + "epoch": 0.12704, + "grad_norm": 2.265625, + "grad_norm_var": 0.00943603515625, + "learning_rate": 0.0001, + "loss": 4.5651, + "loss/crossentropy": 2.0329924821853638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22755083441734314, + "step": 6352 + }, + { + "epoch": 0.12708, + "grad_norm": 2.078125, + "grad_norm_var": 0.014525349934895833, + "learning_rate": 0.0001, + "loss": 4.1888, + "loss/crossentropy": 1.9174052476882935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20000722259283066, + "step": 6354 + }, + { + "epoch": 0.12712, + "grad_norm": 2.15625, + "grad_norm_var": 0.0154693603515625, + "learning_rate": 0.0001, + "loss": 4.3321, + "loss/crossentropy": 1.968774676322937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2194407731294632, + "step": 6356 + }, + { + "epoch": 0.12716, + "grad_norm": 2.296875, + "grad_norm_var": 0.0183013916015625, + "learning_rate": 0.0001, + "loss": 4.5768, + "loss/crossentropy": 2.1767213344573975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24483858048915863, + "step": 6358 + }, + { + "epoch": 0.1272, + "grad_norm": 2.171875, + "grad_norm_var": 0.018391927083333332, + "learning_rate": 0.0001, + "loss": 4.5063, + "loss/crossentropy": 1.8390987515449524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21677500754594803, + "step": 6360 + }, + { + "epoch": 0.12724, + "grad_norm": 2.203125, + "grad_norm_var": 0.016039021809895835, + "learning_rate": 0.0001, + "loss": 4.2635, + "loss/crossentropy": 2.1923086643218994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21362057328224182, + "step": 6362 + }, + { + "epoch": 0.12728, + "grad_norm": 2.25, + "grad_norm_var": 0.010347493489583333, + "learning_rate": 0.0001, + "loss": 4.3104, + "loss/crossentropy": 2.030683994293213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22884567826986313, + "step": 6364 + }, + { + "epoch": 0.12732, + "grad_norm": 2.203125, + "grad_norm_var": 0.00816650390625, + "learning_rate": 0.0001, + "loss": 4.452, + "loss/crossentropy": 2.330837845802307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24233026057481766, + "step": 6366 + }, + { + "epoch": 0.12736, + "grad_norm": 2.453125, + "grad_norm_var": 0.012040201822916667, + "learning_rate": 0.0001, + "loss": 4.6404, + "loss/crossentropy": 2.001612663269043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2332654967904091, + "step": 6368 + }, + { + "epoch": 0.1274, + "grad_norm": 2.25, + "grad_norm_var": 0.011750284830729167, + "learning_rate": 0.0001, + "loss": 4.7101, + "loss/crossentropy": 1.9234941601753235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23741928488016129, + "step": 6370 + }, + { + "epoch": 0.12744, + "grad_norm": 2.078125, + "grad_norm_var": 0.012262980143229166, + "learning_rate": 0.0001, + "loss": 4.2164, + "loss/crossentropy": 2.059934139251709, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24736596643924713, + "step": 6372 + }, + { + "epoch": 0.12748, + "grad_norm": 2.125, + "grad_norm_var": 0.010798136393229166, + "learning_rate": 0.0001, + "loss": 4.3916, + "loss/crossentropy": 2.25182843208313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24349220097064972, + "step": 6374 + }, + { + "epoch": 0.12752, + "grad_norm": 2.15625, + "grad_norm_var": 0.010888671875, + "learning_rate": 0.0001, + "loss": 4.3742, + "loss/crossentropy": 1.845405638217926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2138313353061676, + "step": 6376 + }, + { + "epoch": 0.12756, + "grad_norm": 2.546875, + "grad_norm_var": 0.0164215087890625, + "learning_rate": 0.0001, + "loss": 4.568, + "loss/crossentropy": 1.8833998441696167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25304871797561646, + "step": 6378 + }, + { + "epoch": 0.1276, + "grad_norm": 2.28125, + "grad_norm_var": 0.017072550455729165, + "learning_rate": 0.0001, + "loss": 4.2282, + "loss/crossentropy": 2.1066314578056335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2231372445821762, + "step": 6380 + }, + { + "epoch": 0.12764, + "grad_norm": 2.328125, + "grad_norm_var": 0.018561808268229167, + "learning_rate": 0.0001, + "loss": 4.3381, + "loss/crossentropy": 2.024670898914337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22260528802871704, + "step": 6382 + }, + { + "epoch": 0.12768, + "grad_norm": 2.109375, + "grad_norm_var": 0.01763916015625, + "learning_rate": 0.0001, + "loss": 4.8132, + "loss/crossentropy": 2.425115466117859, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2429298236966133, + "step": 6384 + }, + { + "epoch": 0.12772, + "grad_norm": 2.109375, + "grad_norm_var": 0.016597493489583334, + "learning_rate": 0.0001, + "loss": 4.2387, + "loss/crossentropy": 2.1847925186157227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2495381161570549, + "step": 6386 + }, + { + "epoch": 0.12776, + "grad_norm": 2.265625, + "grad_norm_var": 0.015746053059895834, + "learning_rate": 0.0001, + "loss": 4.6919, + "loss/crossentropy": 2.1463611125946045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2303348332643509, + "step": 6388 + }, + { + "epoch": 0.1278, + "grad_norm": 2.171875, + "grad_norm_var": 0.014839680989583333, + "learning_rate": 0.0001, + "loss": 4.2714, + "loss/crossentropy": 1.9755331873893738, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2232731431722641, + "step": 6390 + }, + { + "epoch": 0.12784, + "grad_norm": 2.078125, + "grad_norm_var": 0.015973917643229165, + "learning_rate": 0.0001, + "loss": 4.5255, + "loss/crossentropy": 2.0741465091705322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22912710905075073, + "step": 6392 + }, + { + "epoch": 0.12788, + "grad_norm": 2.203125, + "grad_norm_var": 0.009471638997395834, + "learning_rate": 0.0001, + "loss": 4.3834, + "loss/crossentropy": 2.1314439177513123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22910036891698837, + "step": 6394 + }, + { + "epoch": 0.12792, + "grad_norm": 2.109375, + "grad_norm_var": 0.0107086181640625, + "learning_rate": 0.0001, + "loss": 4.1615, + "loss/crossentropy": 1.976862907409668, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.233707495033741, + "step": 6396 + }, + { + "epoch": 0.12796, + "grad_norm": 2.171875, + "grad_norm_var": 0.010807291666666666, + "learning_rate": 0.0001, + "loss": 4.3939, + "loss/crossentropy": 2.0534666180610657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21424879133701324, + "step": 6398 + }, + { + "epoch": 0.128, + "grad_norm": 2.109375, + "grad_norm_var": 0.008088175455729167, + "learning_rate": 0.0001, + "loss": 4.6647, + "loss/crossentropy": 2.2200992107391357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2836414724588394, + "step": 6400 + }, + { + "epoch": 0.12804, + "grad_norm": 2.234375, + "grad_norm_var": 0.007542928059895833, + "learning_rate": 0.0001, + "loss": 4.3802, + "loss/crossentropy": 1.8642286658287048, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21438255906105042, + "step": 6402 + }, + { + "epoch": 0.12808, + "grad_norm": 2.15625, + "grad_norm_var": 0.005464680989583333, + "learning_rate": 0.0001, + "loss": 4.3868, + "loss/crossentropy": 1.9571613073349, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22038634121418, + "step": 6404 + }, + { + "epoch": 0.12812, + "grad_norm": 2.125, + "grad_norm_var": 0.008980305989583333, + "learning_rate": 0.0001, + "loss": 4.6888, + "loss/crossentropy": 2.0420787930488586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2532814294099808, + "step": 6406 + }, + { + "epoch": 0.12816, + "grad_norm": 2.109375, + "grad_norm_var": 0.008349609375, + "learning_rate": 0.0001, + "loss": 4.3357, + "loss/crossentropy": 1.7511736750602722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2067284658551216, + "step": 6408 + }, + { + "epoch": 0.1282, + "grad_norm": 2.21875, + "grad_norm_var": 0.0095123291015625, + "learning_rate": 0.0001, + "loss": 4.5568, + "loss/crossentropy": 2.0310762524604797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2763645648956299, + "step": 6410 + }, + { + "epoch": 0.12824, + "grad_norm": 2.28125, + "grad_norm_var": 0.008617146809895834, + "learning_rate": 0.0001, + "loss": 4.575, + "loss/crossentropy": 2.112699866294861, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2409205138683319, + "step": 6412 + }, + { + "epoch": 0.12828, + "grad_norm": 2.296875, + "grad_norm_var": 0.14166259765625, + "learning_rate": 0.0001, + "loss": 4.3023, + "loss/crossentropy": 1.8243904113769531, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2079915553331375, + "step": 6414 + }, + { + "epoch": 0.12832, + "grad_norm": 2.125, + "grad_norm_var": 0.14143473307291668, + "learning_rate": 0.0001, + "loss": 4.5395, + "loss/crossentropy": 1.8310211896896362, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21304991096258163, + "step": 6416 + }, + { + "epoch": 0.12836, + "grad_norm": 2.140625, + "grad_norm_var": 0.14011128743489584, + "learning_rate": 0.0001, + "loss": 4.3806, + "loss/crossentropy": 1.9653990268707275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22207710891962051, + "step": 6418 + }, + { + "epoch": 0.1284, + "grad_norm": 2.15625, + "grad_norm_var": 0.13982645670572916, + "learning_rate": 0.0001, + "loss": 4.5869, + "loss/crossentropy": 2.0583431124687195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23768695443868637, + "step": 6420 + }, + { + "epoch": 0.12844, + "grad_norm": 2.296875, + "grad_norm_var": 0.1391998291015625, + "learning_rate": 0.0001, + "loss": 4.6512, + "loss/crossentropy": 2.1162279844284058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23963302373886108, + "step": 6422 + }, + { + "epoch": 0.12848, + "grad_norm": 2.28125, + "grad_norm_var": 0.13347066243489583, + "learning_rate": 0.0001, + "loss": 4.6268, + "loss/crossentropy": 2.068794012069702, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21967273205518723, + "step": 6424 + }, + { + "epoch": 0.12852, + "grad_norm": 2.109375, + "grad_norm_var": 0.13585611979166667, + "learning_rate": 0.0001, + "loss": 4.4004, + "loss/crossentropy": 2.165997266769409, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22991405427455902, + "step": 6426 + }, + { + "epoch": 0.12856, + "grad_norm": 2.25, + "grad_norm_var": 0.14031473795572916, + "learning_rate": 0.0001, + "loss": 4.1816, + "loss/crossentropy": 1.9116491675376892, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22784583270549774, + "step": 6428 + }, + { + "epoch": 0.1286, + "grad_norm": 2.296875, + "grad_norm_var": 0.0087890625, + "learning_rate": 0.0001, + "loss": 4.5268, + "loss/crossentropy": 2.296347498893738, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2618508040904999, + "step": 6430 + }, + { + "epoch": 0.12864, + "grad_norm": 2.125, + "grad_norm_var": 0.0100250244140625, + "learning_rate": 0.0001, + "loss": 4.0363, + "loss/crossentropy": 2.3961373567581177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27667778730392456, + "step": 6432 + }, + { + "epoch": 0.12868, + "grad_norm": 2.171875, + "grad_norm_var": 0.009598795572916667, + "learning_rate": 0.0001, + "loss": 4.3993, + "loss/crossentropy": 1.9141033291816711, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22818633913993835, + "step": 6434 + }, + { + "epoch": 0.12872, + "grad_norm": 2.15625, + "grad_norm_var": 0.0070149739583333336, + "learning_rate": 0.0001, + "loss": 4.481, + "loss/crossentropy": 2.4431718587875366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24852856248617172, + "step": 6436 + }, + { + "epoch": 0.12876, + "grad_norm": 2.171875, + "grad_norm_var": 0.006494140625, + "learning_rate": 0.0001, + "loss": 4.5153, + "loss/crossentropy": 1.9343949556350708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22181283682584763, + "step": 6438 + }, + { + "epoch": 0.1288, + "grad_norm": 2.171875, + "grad_norm_var": 0.005549112955729167, + "learning_rate": 0.0001, + "loss": 4.4547, + "loss/crossentropy": 2.203734040260315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23536919057369232, + "step": 6440 + }, + { + "epoch": 0.12884, + "grad_norm": 2.046875, + "grad_norm_var": 0.0055572509765625, + "learning_rate": 0.0001, + "loss": 4.354, + "loss/crossentropy": 2.0419046878814697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2121758982539177, + "step": 6442 + }, + { + "epoch": 0.12888, + "grad_norm": 2.328125, + "grad_norm_var": 0.0054595947265625, + "learning_rate": 0.0001, + "loss": 4.5738, + "loss/crossentropy": 2.3551554679870605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23853591084480286, + "step": 6444 + }, + { + "epoch": 0.12892, + "grad_norm": 2.21875, + "grad_norm_var": 0.004813639322916666, + "learning_rate": 0.0001, + "loss": 4.4334, + "loss/crossentropy": 1.9079387784004211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22546496242284775, + "step": 6446 + }, + { + "epoch": 0.12896, + "grad_norm": 2.140625, + "grad_norm_var": 0.00390625, + "learning_rate": 0.0001, + "loss": 4.4188, + "loss/crossentropy": 1.9344156980514526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21923892199993134, + "step": 6448 + }, + { + "epoch": 0.129, + "grad_norm": 2.296875, + "grad_norm_var": 0.0047271728515625, + "learning_rate": 0.0001, + "loss": 4.3219, + "loss/crossentropy": 1.7627189755439758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20726975798606873, + "step": 6450 + }, + { + "epoch": 0.12904, + "grad_norm": 2.28125, + "grad_norm_var": 0.0048736572265625, + "learning_rate": 0.0001, + "loss": 4.4045, + "loss/crossentropy": 1.9988782405853271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23156572133302689, + "step": 6452 + }, + { + "epoch": 0.12908, + "grad_norm": 2.1875, + "grad_norm_var": 0.004621378580729167, + "learning_rate": 0.0001, + "loss": 4.6551, + "loss/crossentropy": 2.3970296382904053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24723558872938156, + "step": 6454 + }, + { + "epoch": 0.12912, + "grad_norm": 2.234375, + "grad_norm_var": 0.00455322265625, + "learning_rate": 0.0001, + "loss": 4.3785, + "loss/crossentropy": 2.009088099002838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2188432812690735, + "step": 6456 + }, + { + "epoch": 0.12916, + "grad_norm": 2.203125, + "grad_norm_var": 0.005882771809895834, + "learning_rate": 0.0001, + "loss": 4.9189, + "loss/crossentropy": 2.166573464870453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22506655752658844, + "step": 6458 + }, + { + "epoch": 0.1292, + "grad_norm": 2.109375, + "grad_norm_var": 0.00670166015625, + "learning_rate": 0.0001, + "loss": 4.3281, + "loss/crossentropy": 2.1567386388778687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23143760859966278, + "step": 6460 + }, + { + "epoch": 0.12924, + "grad_norm": 2.203125, + "grad_norm_var": 0.007307942708333333, + "learning_rate": 0.0001, + "loss": 3.9726, + "loss/crossentropy": 1.8458901643753052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22627578675746918, + "step": 6462 + }, + { + "epoch": 0.12928, + "grad_norm": 2.21875, + "grad_norm_var": 0.00738525390625, + "learning_rate": 0.0001, + "loss": 4.1416, + "loss/crossentropy": 1.7933887243270874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1944974958896637, + "step": 6464 + }, + { + "epoch": 0.12932, + "grad_norm": 2.328125, + "grad_norm_var": 0.0077707926432291664, + "learning_rate": 0.0001, + "loss": 4.3712, + "loss/crossentropy": 2.14457631111145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2461908757686615, + "step": 6466 + }, + { + "epoch": 0.12936, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011563873291015625, + "learning_rate": 0.0001, + "loss": 4.2253, + "loss/crossentropy": 2.2191081047058105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24845656007528305, + "step": 6468 + }, + { + "epoch": 0.1294, + "grad_norm": 2.3125, + "grad_norm_var": 0.012284088134765624, + "learning_rate": 0.0001, + "loss": 4.3992, + "loss/crossentropy": 2.2655181884765625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24961909651756287, + "step": 6470 + }, + { + "epoch": 0.12944, + "grad_norm": 2.171875, + "grad_norm_var": 0.012617746988932291, + "learning_rate": 0.0001, + "loss": 4.2431, + "loss/crossentropy": 1.9992872476577759, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21180924773216248, + "step": 6472 + }, + { + "epoch": 0.12948, + "grad_norm": 2.078125, + "grad_norm_var": 0.009348297119140625, + "learning_rate": 0.0001, + "loss": 4.2865, + "loss/crossentropy": 2.1950103044509888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23401658236980438, + "step": 6474 + }, + { + "epoch": 0.12952, + "grad_norm": 2.234375, + "grad_norm_var": 0.010027821858723958, + "learning_rate": 0.0001, + "loss": 4.2614, + "loss/crossentropy": 2.148743689060211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2146785706281662, + "step": 6476 + }, + { + "epoch": 0.12956, + "grad_norm": 2.140625, + "grad_norm_var": 0.010253651936848959, + "learning_rate": 0.0001, + "loss": 4.7196, + "loss/crossentropy": 2.4312193393707275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2596089243888855, + "step": 6478 + }, + { + "epoch": 0.1296, + "grad_norm": 2.25, + "grad_norm_var": 0.011844635009765625, + "learning_rate": 0.0001, + "loss": 4.7232, + "loss/crossentropy": 2.061104893684387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24564718455076218, + "step": 6480 + }, + { + "epoch": 0.12964, + "grad_norm": 2.21875, + "grad_norm_var": 0.011224110921223959, + "learning_rate": 0.0001, + "loss": 4.6833, + "loss/crossentropy": 2.1994687914848328, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2352369725704193, + "step": 6482 + }, + { + "epoch": 0.12968, + "grad_norm": 2.203125, + "grad_norm_var": 0.010692342122395834, + "learning_rate": 0.0001, + "loss": 4.2378, + "loss/crossentropy": 2.085163116455078, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2309635877609253, + "step": 6484 + }, + { + "epoch": 0.12972, + "grad_norm": 2.1875, + "grad_norm_var": 0.0098297119140625, + "learning_rate": 0.0001, + "loss": 4.5964, + "loss/crossentropy": 2.3360198736190796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24846713244915009, + "step": 6486 + }, + { + "epoch": 0.12976, + "grad_norm": 2.265625, + "grad_norm_var": 0.010602823893229167, + "learning_rate": 0.0001, + "loss": 4.4454, + "loss/crossentropy": 2.089003086090088, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24980145692825317, + "step": 6488 + }, + { + "epoch": 0.1298, + "grad_norm": 2.125, + "grad_norm_var": 0.0108795166015625, + "learning_rate": 0.0001, + "loss": 4.4446, + "loss/crossentropy": 2.0884299874305725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23875930905342102, + "step": 6490 + }, + { + "epoch": 0.12984, + "grad_norm": 2.09375, + "grad_norm_var": 0.01002197265625, + "learning_rate": 0.0001, + "loss": 4.1661, + "loss/crossentropy": 1.9070702195167542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21942836046218872, + "step": 6492 + }, + { + "epoch": 0.12988, + "grad_norm": 2.609375, + "grad_norm_var": 0.02047119140625, + "learning_rate": 0.0001, + "loss": 4.6558, + "loss/crossentropy": 2.1617711782455444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22820374369621277, + "step": 6494 + }, + { + "epoch": 0.12992, + "grad_norm": 2.15625, + "grad_norm_var": 0.0203765869140625, + "learning_rate": 0.0001, + "loss": 4.0938, + "loss/crossentropy": 1.7782898545265198, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22162485867738724, + "step": 6496 + }, + { + "epoch": 0.12996, + "grad_norm": 2.1875, + "grad_norm_var": 0.021028645833333335, + "learning_rate": 0.0001, + "loss": 4.4198, + "loss/crossentropy": 2.1364612579345703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22490044683218002, + "step": 6498 + }, + { + "epoch": 0.13, + "grad_norm": 2.34375, + "grad_norm_var": 0.020213826497395834, + "learning_rate": 0.0001, + "loss": 4.569, + "loss/crossentropy": 2.282773971557617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24553906172513962, + "step": 6500 + }, + { + "epoch": 0.13004, + "grad_norm": 2.171875, + "grad_norm_var": 0.020905558268229166, + "learning_rate": 0.0001, + "loss": 4.4965, + "loss/crossentropy": 2.008001983165741, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22152648121118546, + "step": 6502 + }, + { + "epoch": 0.13008, + "grad_norm": 2.328125, + "grad_norm_var": 0.021833292643229165, + "learning_rate": 0.0001, + "loss": 4.4535, + "loss/crossentropy": 2.1681981086730957, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26258186995983124, + "step": 6504 + }, + { + "epoch": 0.13012, + "grad_norm": 2.328125, + "grad_norm_var": 0.021761067708333335, + "learning_rate": 0.0001, + "loss": 4.6254, + "loss/crossentropy": 2.5754435062408447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26315446197986603, + "step": 6506 + }, + { + "epoch": 0.13016, + "grad_norm": 2.234375, + "grad_norm_var": 0.021187337239583333, + "learning_rate": 0.0001, + "loss": 4.1505, + "loss/crossentropy": 1.7897658348083496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22884277999401093, + "step": 6508 + }, + { + "epoch": 0.1302, + "grad_norm": 2.46875, + "grad_norm_var": 0.016844685872395834, + "learning_rate": 0.0001, + "loss": 4.6315, + "loss/crossentropy": 1.8081435561180115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21860769391059875, + "step": 6510 + }, + { + "epoch": 0.13024, + "grad_norm": 2.359375, + "grad_norm_var": 0.0140289306640625, + "learning_rate": 0.0001, + "loss": 4.7371, + "loss/crossentropy": 2.243759036064148, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2545628473162651, + "step": 6512 + }, + { + "epoch": 0.13028, + "grad_norm": 2.1875, + "grad_norm_var": 0.03447265625, + "learning_rate": 0.0001, + "loss": 4.6147, + "loss/crossentropy": 2.069986939430237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20862630754709244, + "step": 6514 + }, + { + "epoch": 0.13032, + "grad_norm": 2.203125, + "grad_norm_var": 0.0349761962890625, + "learning_rate": 0.0001, + "loss": 4.4758, + "loss/crossentropy": 2.052341878414154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2460884153842926, + "step": 6516 + }, + { + "epoch": 0.13036, + "grad_norm": 2.390625, + "grad_norm_var": 0.0317047119140625, + "learning_rate": 0.0001, + "loss": 4.6369, + "loss/crossentropy": 2.3376708030700684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25382688641548157, + "step": 6518 + }, + { + "epoch": 0.1304, + "grad_norm": 2.125, + "grad_norm_var": 0.0329498291015625, + "learning_rate": 0.0001, + "loss": 4.3048, + "loss/crossentropy": 1.8329171538352966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2135135680437088, + "step": 6520 + }, + { + "epoch": 0.13044, + "grad_norm": 2.421875, + "grad_norm_var": 0.035791015625, + "learning_rate": 0.0001, + "loss": 4.6582, + "loss/crossentropy": 2.077217698097229, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2265045866370201, + "step": 6522 + }, + { + "epoch": 0.13048, + "grad_norm": 2.171875, + "grad_norm_var": 0.036783854166666664, + "learning_rate": 0.0001, + "loss": 4.2189, + "loss/crossentropy": 2.0677568912506104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2468869537115097, + "step": 6524 + }, + { + "epoch": 0.13052, + "grad_norm": 2.296875, + "grad_norm_var": 0.03591206868489583, + "learning_rate": 0.0001, + "loss": 4.5246, + "loss/crossentropy": 2.0692074298858643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2602302059531212, + "step": 6526 + }, + { + "epoch": 0.13056, + "grad_norm": 2.421875, + "grad_norm_var": 0.03583984375, + "learning_rate": 0.0001, + "loss": 4.6709, + "loss/crossentropy": 1.9767250418663025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2232941836118698, + "step": 6528 + }, + { + "epoch": 0.1306, + "grad_norm": 2.234375, + "grad_norm_var": 0.01480712890625, + "learning_rate": 0.0001, + "loss": 4.4851, + "loss/crossentropy": 1.7538996934890747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20786649733781815, + "step": 6530 + }, + { + "epoch": 0.13064, + "grad_norm": 2.203125, + "grad_norm_var": 0.01578369140625, + "learning_rate": 0.0001, + "loss": 4.5598, + "loss/crossentropy": 2.061249256134033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23238955438137054, + "step": 6532 + }, + { + "epoch": 0.13068, + "grad_norm": 2.171875, + "grad_norm_var": 0.0167877197265625, + "learning_rate": 0.0001, + "loss": 4.237, + "loss/crossentropy": 1.9083253145217896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22536182403564453, + "step": 6534 + }, + { + "epoch": 0.13072, + "grad_norm": 2.046875, + "grad_norm_var": 0.017952473958333333, + "learning_rate": 0.0001, + "loss": 4.3004, + "loss/crossentropy": 2.052153766155243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2343958392739296, + "step": 6536 + }, + { + "epoch": 0.13076, + "grad_norm": 2.125, + "grad_norm_var": 0.010091145833333334, + "learning_rate": 0.0001, + "loss": 4.3638, + "loss/crossentropy": 1.9979270100593567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20102836191654205, + "step": 6538 + }, + { + "epoch": 0.1308, + "grad_norm": 1.9921875, + "grad_norm_var": 0.012031809488932291, + "learning_rate": 0.0001, + "loss": 4.019, + "loss/crossentropy": 1.618333637714386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21146921813488007, + "step": 6540 + }, + { + "epoch": 0.13084, + "grad_norm": 2.078125, + "grad_norm_var": 0.011940256754557291, + "learning_rate": 0.0001, + "loss": 4.2514, + "loss/crossentropy": 1.771790623664856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21661869436502457, + "step": 6542 + }, + { + "epoch": 0.13088, + "grad_norm": 2.25, + "grad_norm_var": 0.007165273030598958, + "learning_rate": 0.0001, + "loss": 4.3528, + "loss/crossentropy": 2.2347733974456787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2507361173629761, + "step": 6544 + }, + { + "epoch": 0.13092, + "grad_norm": 2.328125, + "grad_norm_var": 0.012389882405598959, + "learning_rate": 0.0001, + "loss": 4.4333, + "loss/crossentropy": 2.226397395133972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.257301464676857, + "step": 6546 + }, + { + "epoch": 0.13096, + "grad_norm": 2.125, + "grad_norm_var": 0.012776438395182292, + "learning_rate": 0.0001, + "loss": 4.1629, + "loss/crossentropy": 1.9884281158447266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2213538959622383, + "step": 6548 + }, + { + "epoch": 0.131, + "grad_norm": 2.0625, + "grad_norm_var": 0.01755549112955729, + "learning_rate": 0.0001, + "loss": 4.549, + "loss/crossentropy": 2.0790398120880127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2259911745786667, + "step": 6550 + }, + { + "epoch": 0.13104, + "grad_norm": 2.28125, + "grad_norm_var": 0.016294097900390624, + "learning_rate": 0.0001, + "loss": 4.5118, + "loss/crossentropy": 2.1309107542037964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23984672129154205, + "step": 6552 + }, + { + "epoch": 0.13108, + "grad_norm": 2.296875, + "grad_norm_var": 0.01744562784830729, + "learning_rate": 0.0001, + "loss": 4.4528, + "loss/crossentropy": 1.9651963114738464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21699358522891998, + "step": 6554 + }, + { + "epoch": 0.13112, + "grad_norm": 2.21875, + "grad_norm_var": 0.014598592122395834, + "learning_rate": 0.0001, + "loss": 4.436, + "loss/crossentropy": 2.405247449874878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24624405801296234, + "step": 6556 + }, + { + "epoch": 0.13116, + "grad_norm": 2.125, + "grad_norm_var": 0.0154296875, + "learning_rate": 0.0001, + "loss": 4.3984, + "loss/crossentropy": 2.215611457824707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24835016578435898, + "step": 6558 + }, + { + "epoch": 0.1312, + "grad_norm": 4.125, + "grad_norm_var": 0.2436920166015625, + "learning_rate": 0.0001, + "loss": 4.3876, + "loss/crossentropy": 2.1731653809547424, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24633550643920898, + "step": 6560 + }, + { + "epoch": 0.13124, + "grad_norm": 2.234375, + "grad_norm_var": 0.24501953125, + "learning_rate": 0.0001, + "loss": 4.5101, + "loss/crossentropy": 2.0507587790489197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25702129304409027, + "step": 6562 + }, + { + "epoch": 0.13128, + "grad_norm": 2.375, + "grad_norm_var": 0.2433502197265625, + "learning_rate": 0.0001, + "loss": 4.3464, + "loss/crossentropy": 2.2088446617126465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22610870003700256, + "step": 6564 + }, + { + "epoch": 0.13132, + "grad_norm": 2.34375, + "grad_norm_var": 0.23931884765625, + "learning_rate": 0.0001, + "loss": 4.3612, + "loss/crossentropy": 1.6911352276802063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2188590243458748, + "step": 6566 + }, + { + "epoch": 0.13136, + "grad_norm": 2.09375, + "grad_norm_var": 0.24976806640625, + "learning_rate": 0.0001, + "loss": 3.9795, + "loss/crossentropy": 1.807699978351593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21844930946826935, + "step": 6568 + }, + { + "epoch": 0.1314, + "grad_norm": 2.28125, + "grad_norm_var": 0.24514872233072918, + "learning_rate": 0.0001, + "loss": 4.4293, + "loss/crossentropy": 2.292602300643921, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.242776520550251, + "step": 6570 + }, + { + "epoch": 0.13144, + "grad_norm": 2.03125, + "grad_norm_var": 0.24806315104166668, + "learning_rate": 0.0001, + "loss": 4.07, + "loss/crossentropy": 1.5262329578399658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1674257293343544, + "step": 6572 + }, + { + "epoch": 0.13148, + "grad_norm": 2.15625, + "grad_norm_var": 0.24504801432291667, + "learning_rate": 0.0001, + "loss": 4.4084, + "loss/crossentropy": 2.180319309234619, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2260192409157753, + "step": 6574 + }, + { + "epoch": 0.13152, + "grad_norm": 2.265625, + "grad_norm_var": 0.0216705322265625, + "learning_rate": 0.0001, + "loss": 4.544, + "loss/crossentropy": 2.188440203666687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23389383405447006, + "step": 6576 + }, + { + "epoch": 0.13156, + "grad_norm": 2.125, + "grad_norm_var": 0.03427327473958333, + "learning_rate": 0.0001, + "loss": 4.7595, + "loss/crossentropy": 2.20253586769104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3217846751213074, + "step": 6578 + }, + { + "epoch": 0.1316, + "grad_norm": 2.359375, + "grad_norm_var": 0.034764607747395836, + "learning_rate": 0.0001, + "loss": 4.2771, + "loss/crossentropy": 2.1901479959487915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24521923065185547, + "step": 6580 + }, + { + "epoch": 0.13164, + "grad_norm": 2.6875, + "grad_norm_var": 0.04170633951822917, + "learning_rate": 0.0001, + "loss": 4.7105, + "loss/crossentropy": 2.0719348192214966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2640562355518341, + "step": 6582 + }, + { + "epoch": 0.13168, + "grad_norm": 2.1875, + "grad_norm_var": 0.040339152018229164, + "learning_rate": 0.0001, + "loss": 4.1487, + "loss/crossentropy": 2.331762194633484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24890532344579697, + "step": 6584 + }, + { + "epoch": 0.13172, + "grad_norm": 2.34375, + "grad_norm_var": 0.04429931640625, + "learning_rate": 0.0001, + "loss": 4.6013, + "loss/crossentropy": 1.5143779516220093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1957392692565918, + "step": 6586 + }, + { + "epoch": 0.13176, + "grad_norm": 2.171875, + "grad_norm_var": 0.04088134765625, + "learning_rate": 0.0001, + "loss": 4.4311, + "loss/crossentropy": 2.301910698413849, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27353671938180923, + "step": 6588 + }, + { + "epoch": 0.1318, + "grad_norm": 2.84375, + "grad_norm_var": 0.11796773274739583, + "learning_rate": 0.0001, + "loss": 4.5035, + "loss/crossentropy": 2.018579602241516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24261966347694397, + "step": 6590 + }, + { + "epoch": 0.13184, + "grad_norm": 2.203125, + "grad_norm_var": 0.11685282389322917, + "learning_rate": 0.0001, + "loss": 4.6931, + "loss/crossentropy": 1.9749983549118042, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20727626234292984, + "step": 6592 + }, + { + "epoch": 0.13188, + "grad_norm": 2.0, + "grad_norm_var": 0.11741536458333333, + "learning_rate": 0.0001, + "loss": 4.1929, + "loss/crossentropy": 2.040414035320282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22879169881343842, + "step": 6594 + }, + { + "epoch": 0.13192, + "grad_norm": 2.453125, + "grad_norm_var": 0.11551106770833333, + "learning_rate": 0.0001, + "loss": 4.3331, + "loss/crossentropy": 1.941766619682312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22591909021139145, + "step": 6596 + }, + { + "epoch": 0.13196, + "grad_norm": 2.34375, + "grad_norm_var": 0.10946858723958333, + "learning_rate": 0.0001, + "loss": 4.3659, + "loss/crossentropy": 1.8487200140953064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23654203116893768, + "step": 6598 + }, + { + "epoch": 0.132, + "grad_norm": 2.109375, + "grad_norm_var": 0.10340067545572916, + "learning_rate": 0.0001, + "loss": 4.225, + "loss/crossentropy": 1.9297338724136353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2016594037413597, + "step": 6600 + }, + { + "epoch": 0.13204, + "grad_norm": 2.046875, + "grad_norm_var": 0.11005757649739584, + "learning_rate": 0.0001, + "loss": 4.3643, + "loss/crossentropy": 2.048487663269043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20044995844364166, + "step": 6602 + }, + { + "epoch": 0.13208, + "grad_norm": 2.0625, + "grad_norm_var": 0.11295572916666667, + "learning_rate": 0.0001, + "loss": 4.425, + "loss/crossentropy": 2.3620439767837524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24750277400016785, + "step": 6604 + }, + { + "epoch": 0.13212, + "grad_norm": 2.328125, + "grad_norm_var": 0.0158203125, + "learning_rate": 0.0001, + "loss": 4.2578, + "loss/crossentropy": 2.1324113607406616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23109012842178345, + "step": 6606 + }, + { + "epoch": 0.13216, + "grad_norm": 2.1875, + "grad_norm_var": 0.015620930989583334, + "learning_rate": 0.0001, + "loss": 4.4775, + "loss/crossentropy": 2.021477997303009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2152409330010414, + "step": 6608 + }, + { + "epoch": 0.1322, + "grad_norm": 2.1875, + "grad_norm_var": 0.012669881184895834, + "learning_rate": 0.0001, + "loss": 4.6221, + "loss/crossentropy": 2.1693036556243896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.234656922519207, + "step": 6610 + }, + { + "epoch": 0.13224, + "grad_norm": 2.203125, + "grad_norm_var": 0.008854166666666666, + "learning_rate": 0.0001, + "loss": 4.1469, + "loss/crossentropy": 2.1138017177581787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2239084094762802, + "step": 6612 + }, + { + "epoch": 0.13228, + "grad_norm": 2.28125, + "grad_norm_var": 0.008772786458333333, + "learning_rate": 0.0001, + "loss": 4.4862, + "loss/crossentropy": 2.2540252208709717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2614182382822037, + "step": 6614 + }, + { + "epoch": 0.13232, + "grad_norm": 2.078125, + "grad_norm_var": 0.008837890625, + "learning_rate": 0.0001, + "loss": 4.2131, + "loss/crossentropy": 2.033502757549286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2177339717745781, + "step": 6616 + }, + { + "epoch": 0.13236, + "grad_norm": 2.078125, + "grad_norm_var": 0.010054524739583333, + "learning_rate": 0.0001, + "loss": 4.2648, + "loss/crossentropy": 2.2490646839141846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22369390726089478, + "step": 6618 + }, + { + "epoch": 0.1324, + "grad_norm": 2.21875, + "grad_norm_var": 0.0150543212890625, + "learning_rate": 0.0001, + "loss": 4.3571, + "loss/crossentropy": 2.14319908618927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24394190311431885, + "step": 6620 + }, + { + "epoch": 0.13244, + "grad_norm": 2.390625, + "grad_norm_var": 0.01627197265625, + "learning_rate": 0.0001, + "loss": 4.5136, + "loss/crossentropy": 2.045474410057068, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22738997638225555, + "step": 6622 + }, + { + "epoch": 0.13248, + "grad_norm": 2.203125, + "grad_norm_var": 0.015816243489583333, + "learning_rate": 0.0001, + "loss": 4.5773, + "loss/crossentropy": 2.1853290796279907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2565506473183632, + "step": 6624 + }, + { + "epoch": 0.13252, + "grad_norm": 2.078125, + "grad_norm_var": 0.017015584309895835, + "learning_rate": 0.0001, + "loss": 4.275, + "loss/crossentropy": 2.1161083579063416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22949891537427902, + "step": 6626 + }, + { + "epoch": 0.13256, + "grad_norm": 2.1875, + "grad_norm_var": 0.0167633056640625, + "learning_rate": 0.0001, + "loss": 4.3154, + "loss/crossentropy": 2.167048454284668, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23087909072637558, + "step": 6628 + }, + { + "epoch": 0.1326, + "grad_norm": 2.21875, + "grad_norm_var": 0.0156158447265625, + "learning_rate": 0.0001, + "loss": 4.2777, + "loss/crossentropy": 2.1544610261917114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25153525173664093, + "step": 6630 + }, + { + "epoch": 0.13264, + "grad_norm": 2.484375, + "grad_norm_var": 0.018244425455729168, + "learning_rate": 0.0001, + "loss": 4.4778, + "loss/crossentropy": 2.0319228768348694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22067170590162277, + "step": 6632 + }, + { + "epoch": 0.13268, + "grad_norm": 2.140625, + "grad_norm_var": 0.0162261962890625, + "learning_rate": 0.0001, + "loss": 4.4398, + "loss/crossentropy": 1.9723476767539978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21380966901779175, + "step": 6634 + }, + { + "epoch": 0.13272, + "grad_norm": 2.796875, + "grad_norm_var": 0.031590779622395836, + "learning_rate": 0.0001, + "loss": 4.3474, + "loss/crossentropy": 2.2833333015441895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23245477676391602, + "step": 6636 + }, + { + "epoch": 0.13276, + "grad_norm": 2.28125, + "grad_norm_var": 0.03132222493489583, + "learning_rate": 0.0001, + "loss": 4.4522, + "loss/crossentropy": 2.13793683052063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22665268182754517, + "step": 6638 + }, + { + "epoch": 0.1328, + "grad_norm": 2.734375, + "grad_norm_var": 0.04592692057291667, + "learning_rate": 0.0001, + "loss": 4.6386, + "loss/crossentropy": 2.188693881034851, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2471916377544403, + "step": 6640 + }, + { + "epoch": 0.13284, + "grad_norm": 2.234375, + "grad_norm_var": 0.0438629150390625, + "learning_rate": 0.0001, + "loss": 4.6394, + "loss/crossentropy": 2.169856071472168, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23427975177764893, + "step": 6642 + }, + { + "epoch": 0.13288, + "grad_norm": 2.15625, + "grad_norm_var": 0.04365132649739583, + "learning_rate": 0.0001, + "loss": 4.6498, + "loss/crossentropy": 2.1600061655044556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23366540670394897, + "step": 6644 + }, + { + "epoch": 0.13292, + "grad_norm": 2.359375, + "grad_norm_var": 0.044188435872395834, + "learning_rate": 0.0001, + "loss": 4.4521, + "loss/crossentropy": 1.822945475578308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2203340008854866, + "step": 6646 + }, + { + "epoch": 0.13296, + "grad_norm": 2.15625, + "grad_norm_var": 0.045735677083333336, + "learning_rate": 0.0001, + "loss": 4.3263, + "loss/crossentropy": 1.8908653259277344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2320382297039032, + "step": 6648 + }, + { + "epoch": 0.133, + "grad_norm": 2.265625, + "grad_norm_var": 0.04363606770833333, + "learning_rate": 0.0001, + "loss": 4.5065, + "loss/crossentropy": 2.126000165939331, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24388836324214935, + "step": 6650 + }, + { + "epoch": 0.13304, + "grad_norm": 2.328125, + "grad_norm_var": 0.0242828369140625, + "learning_rate": 0.0001, + "loss": 4.5787, + "loss/crossentropy": 2.434928297996521, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24083568900823593, + "step": 6652 + }, + { + "epoch": 0.13308, + "grad_norm": 2.203125, + "grad_norm_var": 0.023824055989583332, + "learning_rate": 0.0001, + "loss": 4.5538, + "loss/crossentropy": 2.2186567783355713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2220132276415825, + "step": 6654 + }, + { + "epoch": 0.13312, + "grad_norm": 2.25, + "grad_norm_var": 0.006257120768229167, + "learning_rate": 0.0001, + "loss": 4.4934, + "loss/crossentropy": 1.849799931049347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21551364660263062, + "step": 6656 + }, + { + "epoch": 0.13316, + "grad_norm": 2.109375, + "grad_norm_var": 0.0072662353515625, + "learning_rate": 0.0001, + "loss": 4.2237, + "loss/crossentropy": 2.082044243812561, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21274320781230927, + "step": 6658 + }, + { + "epoch": 0.1332, + "grad_norm": 2.125, + "grad_norm_var": 0.010838826497395834, + "learning_rate": 0.0001, + "loss": 4.5884, + "loss/crossentropy": 2.1957098245620728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23078418523073196, + "step": 6660 + }, + { + "epoch": 0.13324, + "grad_norm": 2.28125, + "grad_norm_var": 0.16033528645833334, + "learning_rate": 0.0001, + "loss": 4.519, + "loss/crossentropy": 2.228309690952301, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27178408950567245, + "step": 6662 + }, + { + "epoch": 0.13328, + "grad_norm": 2.40625, + "grad_norm_var": 0.156298828125, + "learning_rate": 0.0001, + "loss": 4.5987, + "loss/crossentropy": 1.8185940384864807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2145048901438713, + "step": 6664 + }, + { + "epoch": 0.13332, + "grad_norm": 2.171875, + "grad_norm_var": 0.1566558837890625, + "learning_rate": 0.0001, + "loss": 4.4722, + "loss/crossentropy": 2.198649048805237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2262621968984604, + "step": 6666 + }, + { + "epoch": 0.13336, + "grad_norm": 2.15625, + "grad_norm_var": 0.159130859375, + "learning_rate": 0.0001, + "loss": 4.5729, + "loss/crossentropy": 2.2075835466384888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23531068861484528, + "step": 6668 + }, + { + "epoch": 0.1334, + "grad_norm": 2.140625, + "grad_norm_var": 0.16243387858072916, + "learning_rate": 0.0001, + "loss": 4.2913, + "loss/crossentropy": 1.9719768166542053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2206977903842926, + "step": 6670 + }, + { + "epoch": 0.13344, + "grad_norm": 2.03125, + "grad_norm_var": 0.17021484375, + "learning_rate": 0.0001, + "loss": 4.2144, + "loss/crossentropy": 2.304553985595703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23418369889259338, + "step": 6672 + }, + { + "epoch": 0.13348, + "grad_norm": 2.25, + "grad_norm_var": 0.1682281494140625, + "learning_rate": 0.0001, + "loss": 4.4485, + "loss/crossentropy": 2.212409734725952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23888318240642548, + "step": 6674 + }, + { + "epoch": 0.13352, + "grad_norm": 2.0625, + "grad_norm_var": 0.19371337890625, + "learning_rate": 0.0001, + "loss": 4.176, + "loss/crossentropy": 2.001897156238556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2147163525223732, + "step": 6676 + }, + { + "epoch": 0.13356, + "grad_norm": 2.203125, + "grad_norm_var": 0.043680826822916664, + "learning_rate": 0.0001, + "loss": 4.4245, + "loss/crossentropy": 2.216760039329529, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24913202226161957, + "step": 6678 + }, + { + "epoch": 0.1336, + "grad_norm": 2.09375, + "grad_norm_var": 0.04597142537434896, + "learning_rate": 0.0001, + "loss": 4.1862, + "loss/crossentropy": 1.8190750479698181, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.200186125934124, + "step": 6680 + }, + { + "epoch": 0.13364, + "grad_norm": 2.140625, + "grad_norm_var": 0.047548166910807294, + "learning_rate": 0.0001, + "loss": 4.7271, + "loss/crossentropy": 2.311274528503418, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22400956600904465, + "step": 6682 + }, + { + "epoch": 0.13368, + "grad_norm": 2.34375, + "grad_norm_var": 0.5293841044108073, + "learning_rate": 0.0001, + "loss": 4.3852, + "loss/crossentropy": 2.0381893515586853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23101551085710526, + "step": 6684 + }, + { + "epoch": 0.13372, + "grad_norm": 2.21875, + "grad_norm_var": 0.5331776936848959, + "learning_rate": 0.0001, + "loss": 4.1185, + "loss/crossentropy": 1.7441503405570984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20208899676799774, + "step": 6686 + }, + { + "epoch": 0.13376, + "grad_norm": 2.171875, + "grad_norm_var": 0.5252593994140625, + "learning_rate": 0.0001, + "loss": 4.3835, + "loss/crossentropy": 1.8874938488006592, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2255900353193283, + "step": 6688 + }, + { + "epoch": 0.1338, + "grad_norm": 2.3125, + "grad_norm_var": 0.5284006754557292, + "learning_rate": 0.0001, + "loss": 4.2563, + "loss/crossentropy": 2.2768125534057617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21636968851089478, + "step": 6690 + }, + { + "epoch": 0.13384, + "grad_norm": 2.09375, + "grad_norm_var": 0.5063954671223958, + "learning_rate": 0.0001, + "loss": 4.278, + "loss/crossentropy": 2.0253931283950806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28937554359436035, + "step": 6692 + }, + { + "epoch": 0.13388, + "grad_norm": 2.28125, + "grad_norm_var": 0.5079661051432292, + "learning_rate": 0.0001, + "loss": 4.4205, + "loss/crossentropy": 2.1940718293190002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23384064435958862, + "step": 6694 + }, + { + "epoch": 0.13392, + "grad_norm": 2.078125, + "grad_norm_var": 0.4987993876139323, + "learning_rate": 0.0001, + "loss": 4.5066, + "loss/crossentropy": 2.071534514427185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2146512120962143, + "step": 6696 + }, + { + "epoch": 0.13396, + "grad_norm": 2.28125, + "grad_norm_var": 0.5017534891764323, + "learning_rate": 0.0001, + "loss": 4.348, + "loss/crossentropy": 1.8530714511871338, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21837462484836578, + "step": 6698 + }, + { + "epoch": 0.134, + "grad_norm": 2.28125, + "grad_norm_var": 0.013396962483723959, + "learning_rate": 0.0001, + "loss": 4.521, + "loss/crossentropy": 2.210664451122284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2199932038784027, + "step": 6700 + }, + { + "epoch": 0.13404, + "grad_norm": 2.171875, + "grad_norm_var": 0.011506144205729167, + "learning_rate": 0.0001, + "loss": 4.4226, + "loss/crossentropy": 1.8530223965644836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.209881991147995, + "step": 6702 + }, + { + "epoch": 0.13408, + "grad_norm": 2.234375, + "grad_norm_var": 0.011839803059895833, + "learning_rate": 0.0001, + "loss": 4.4302, + "loss/crossentropy": 1.8609183430671692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22096765041351318, + "step": 6704 + }, + { + "epoch": 0.13412, + "grad_norm": 2.59375, + "grad_norm_var": 0.021214803059895832, + "learning_rate": 0.0001, + "loss": 4.8429, + "loss/crossentropy": 2.33315110206604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26414938271045685, + "step": 6706 + }, + { + "epoch": 0.13416, + "grad_norm": 2.359375, + "grad_norm_var": 0.04798075358072917, + "learning_rate": 0.0001, + "loss": 4.6054, + "loss/crossentropy": 2.2656116485595703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2769838646054268, + "step": 6708 + }, + { + "epoch": 0.1342, + "grad_norm": 2.1875, + "grad_norm_var": 0.0466705322265625, + "learning_rate": 0.0001, + "loss": 4.4875, + "loss/crossentropy": 2.2131590843200684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23640615493059158, + "step": 6710 + }, + { + "epoch": 0.13424, + "grad_norm": 2.25, + "grad_norm_var": 0.044831339518229166, + "learning_rate": 0.0001, + "loss": 4.1554, + "loss/crossentropy": 1.8667671084403992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.212738998234272, + "step": 6712 + }, + { + "epoch": 0.13428, + "grad_norm": 2.203125, + "grad_norm_var": 0.04480692545572917, + "learning_rate": 0.0001, + "loss": 4.0958, + "loss/crossentropy": 1.9699830412864685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23507894575595856, + "step": 6714 + }, + { + "epoch": 0.13432, + "grad_norm": 2.078125, + "grad_norm_var": 0.04632161458333333, + "learning_rate": 0.0001, + "loss": 4.1571, + "loss/crossentropy": 1.8108918070793152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2192004919052124, + "step": 6716 + }, + { + "epoch": 0.13436, + "grad_norm": 2.203125, + "grad_norm_var": 0.042740885416666666, + "learning_rate": 0.0001, + "loss": 4.4983, + "loss/crossentropy": 2.0528674125671387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22221814841032028, + "step": 6718 + }, + { + "epoch": 0.1344, + "grad_norm": 2.046875, + "grad_norm_var": 0.046305338541666664, + "learning_rate": 0.0001, + "loss": 4.2544, + "loss/crossentropy": 1.7881956696510315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21330178529024124, + "step": 6720 + }, + { + "epoch": 0.13444, + "grad_norm": 2.3125, + "grad_norm_var": 0.039159138997395836, + "learning_rate": 0.0001, + "loss": 4.5424, + "loss/crossentropy": 2.2016018629074097, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22042546421289444, + "step": 6722 + }, + { + "epoch": 0.13448, + "grad_norm": 2.171875, + "grad_norm_var": 0.006322224934895833, + "learning_rate": 0.0001, + "loss": 4.3159, + "loss/crossentropy": 1.9661846160888672, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21542846411466599, + "step": 6724 + }, + { + "epoch": 0.13452, + "grad_norm": 2.328125, + "grad_norm_var": 0.031037394205729166, + "learning_rate": 0.0001, + "loss": 4.4473, + "loss/crossentropy": 2.1073482036590576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23370730876922607, + "step": 6726 + }, + { + "epoch": 0.13456, + "grad_norm": 2.25, + "grad_norm_var": 0.030826822916666666, + "learning_rate": 0.0001, + "loss": 4.6069, + "loss/crossentropy": 2.1937917470932007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22414422780275345, + "step": 6728 + }, + { + "epoch": 0.1346, + "grad_norm": 2.109375, + "grad_norm_var": 0.030436197916666668, + "learning_rate": 0.0001, + "loss": 4.2937, + "loss/crossentropy": 2.078732967376709, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21227504312992096, + "step": 6730 + }, + { + "epoch": 0.13464, + "grad_norm": 2.34375, + "grad_norm_var": 0.030301920572916665, + "learning_rate": 0.0001, + "loss": 4.3043, + "loss/crossentropy": 1.9002525806427002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21004119515419006, + "step": 6732 + }, + { + "epoch": 0.13468, + "grad_norm": 2.15625, + "grad_norm_var": 0.030794270833333335, + "learning_rate": 0.0001, + "loss": 4.5156, + "loss/crossentropy": 2.055518925189972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21610142290592194, + "step": 6734 + }, + { + "epoch": 0.13472, + "grad_norm": 2.1875, + "grad_norm_var": 0.028473917643229166, + "learning_rate": 0.0001, + "loss": 4.4889, + "loss/crossentropy": 2.0521084666252136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23528365790843964, + "step": 6736 + }, + { + "epoch": 0.13476, + "grad_norm": 2.203125, + "grad_norm_var": 0.02730712890625, + "learning_rate": 0.0001, + "loss": 4.5051, + "loss/crossentropy": 2.2536301612854004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21544227004051208, + "step": 6738 + }, + { + "epoch": 0.1348, + "grad_norm": 2.1875, + "grad_norm_var": 0.027179972330729166, + "learning_rate": 0.0001, + "loss": 4.5164, + "loss/crossentropy": 2.2610143423080444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2532905787229538, + "step": 6740 + }, + { + "epoch": 0.13484, + "grad_norm": 2.109375, + "grad_norm_var": 0.00836181640625, + "learning_rate": 0.0001, + "loss": 4.1301, + "loss/crossentropy": 1.8840081095695496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21514790505170822, + "step": 6742 + }, + { + "epoch": 0.13488, + "grad_norm": 2.109375, + "grad_norm_var": 0.0084136962890625, + "learning_rate": 0.0001, + "loss": 4.2532, + "loss/crossentropy": 2.0841002464294434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2328692153096199, + "step": 6744 + }, + { + "epoch": 0.13492, + "grad_norm": 2.390625, + "grad_norm_var": 0.010798136393229166, + "learning_rate": 0.0001, + "loss": 4.6335, + "loss/crossentropy": 2.507196068763733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2736222892999649, + "step": 6746 + }, + { + "epoch": 0.13496, + "grad_norm": 2.421875, + "grad_norm_var": 0.016527303059895835, + "learning_rate": 0.0001, + "loss": 4.189, + "loss/crossentropy": 2.0180357098579407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22328195720911026, + "step": 6748 + }, + { + "epoch": 0.135, + "grad_norm": 2.15625, + "grad_norm_var": 0.016097005208333334, + "learning_rate": 0.0001, + "loss": 4.2854, + "loss/crossentropy": 2.2457560300827026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26572495698928833, + "step": 6750 + }, + { + "epoch": 0.13504, + "grad_norm": 2.6875, + "grad_norm_var": 0.031493123372395834, + "learning_rate": 0.0001, + "loss": 4.885, + "loss/crossentropy": 2.1280853748321533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22716474533081055, + "step": 6752 + }, + { + "epoch": 0.13508, + "grad_norm": 2.25, + "grad_norm_var": 0.03216145833333333, + "learning_rate": 0.0001, + "loss": 4.3901, + "loss/crossentropy": 2.1843650341033936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24630828201770782, + "step": 6754 + }, + { + "epoch": 0.13512, + "grad_norm": 2.21875, + "grad_norm_var": 0.03390299479166667, + "learning_rate": 0.0001, + "loss": 4.3584, + "loss/crossentropy": 1.7955012917518616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20614011585712433, + "step": 6756 + }, + { + "epoch": 0.13516, + "grad_norm": 2.1875, + "grad_norm_var": 0.028563435872395834, + "learning_rate": 0.0001, + "loss": 4.3546, + "loss/crossentropy": 1.9315852522850037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22918011993169785, + "step": 6758 + }, + { + "epoch": 0.1352, + "grad_norm": 2.140625, + "grad_norm_var": 0.028055826822916668, + "learning_rate": 0.0001, + "loss": 4.2659, + "loss/crossentropy": 1.982887327671051, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21635843813419342, + "step": 6760 + }, + { + "epoch": 0.13524, + "grad_norm": 2.0625, + "grad_norm_var": 0.029215494791666668, + "learning_rate": 0.0001, + "loss": 4.2828, + "loss/crossentropy": 2.25021892786026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22931701689958572, + "step": 6762 + }, + { + "epoch": 0.13528, + "grad_norm": 2.34375, + "grad_norm_var": 0.02252197265625, + "learning_rate": 0.0001, + "loss": 4.5991, + "loss/crossentropy": 2.5220746994018555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2406606376171112, + "step": 6764 + }, + { + "epoch": 0.13532, + "grad_norm": 2.375, + "grad_norm_var": 0.022386678059895835, + "learning_rate": 0.0001, + "loss": 4.5143, + "loss/crossentropy": 1.8115127086639404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.220379076898098, + "step": 6766 + }, + { + "epoch": 0.13536, + "grad_norm": 2.0625, + "grad_norm_var": 0.010856119791666667, + "learning_rate": 0.0001, + "loss": 4.5113, + "loss/crossentropy": 1.8998088240623474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22993376106023788, + "step": 6768 + }, + { + "epoch": 0.1354, + "grad_norm": 2.09375, + "grad_norm_var": 0.01051025390625, + "learning_rate": 0.0001, + "loss": 4.28, + "loss/crossentropy": 2.0183660984039307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21991200000047684, + "step": 6770 + }, + { + "epoch": 0.13544, + "grad_norm": 2.296875, + "grad_norm_var": 0.009847005208333334, + "learning_rate": 0.0001, + "loss": 4.6224, + "loss/crossentropy": 2.1927448511123657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2536699026823044, + "step": 6772 + }, + { + "epoch": 0.13548, + "grad_norm": 2.234375, + "grad_norm_var": 0.014404296875, + "learning_rate": 0.0001, + "loss": 4.196, + "loss/crossentropy": 1.92184317111969, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20716708898544312, + "step": 6774 + }, + { + "epoch": 0.13552, + "grad_norm": 2.296875, + "grad_norm_var": 0.016813151041666665, + "learning_rate": 0.0001, + "loss": 4.7779, + "loss/crossentropy": 2.2437468767166138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26381388306617737, + "step": 6776 + }, + { + "epoch": 0.13556, + "grad_norm": 2.0625, + "grad_norm_var": 0.016722615559895834, + "learning_rate": 0.0001, + "loss": 4.2907, + "loss/crossentropy": 2.087414026260376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21566492319107056, + "step": 6778 + }, + { + "epoch": 0.1356, + "grad_norm": 2.140625, + "grad_norm_var": 0.015653483072916665, + "learning_rate": 0.0001, + "loss": 4.4273, + "loss/crossentropy": 2.1936367750167847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22492723166942596, + "step": 6780 + }, + { + "epoch": 0.13564, + "grad_norm": 2.109375, + "grad_norm_var": 0.014046223958333333, + "learning_rate": 0.0001, + "loss": 4.2992, + "loss/crossentropy": 1.7642306685447693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20992937684059143, + "step": 6782 + }, + { + "epoch": 0.13568, + "grad_norm": 2.140625, + "grad_norm_var": 0.014371744791666667, + "learning_rate": 0.0001, + "loss": 4.3593, + "loss/crossentropy": 2.01781964302063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22552715986967087, + "step": 6784 + }, + { + "epoch": 0.13572, + "grad_norm": 2.140625, + "grad_norm_var": 0.016402180989583334, + "learning_rate": 0.0001, + "loss": 4.4708, + "loss/crossentropy": 2.0788158774375916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24181769788265228, + "step": 6786 + }, + { + "epoch": 0.13576, + "grad_norm": 2.171875, + "grad_norm_var": 0.01539306640625, + "learning_rate": 0.0001, + "loss": 4.2163, + "loss/crossentropy": 2.0424017310142517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21705424785614014, + "step": 6788 + }, + { + "epoch": 0.1358, + "grad_norm": 2.46875, + "grad_norm_var": 0.019025675455729165, + "learning_rate": 0.0001, + "loss": 4.1182, + "loss/crossentropy": 1.6175345182418823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1899409219622612, + "step": 6790 + }, + { + "epoch": 0.13584, + "grad_norm": 2.3125, + "grad_norm_var": 0.016852823893229167, + "learning_rate": 0.0001, + "loss": 4.2914, + "loss/crossentropy": 2.004386007785797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22395263612270355, + "step": 6792 + }, + { + "epoch": 0.13588, + "grad_norm": 2.109375, + "grad_norm_var": 0.019245402018229166, + "learning_rate": 0.0001, + "loss": 4.2182, + "loss/crossentropy": 1.9224132895469666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2326791137456894, + "step": 6794 + }, + { + "epoch": 0.13592, + "grad_norm": 2.1875, + "grad_norm_var": 0.019755045572916668, + "learning_rate": 0.0001, + "loss": 4.4768, + "loss/crossentropy": 1.8331453204154968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21586360037326813, + "step": 6796 + }, + { + "epoch": 0.13596, + "grad_norm": 2.1875, + "grad_norm_var": 0.0197906494140625, + "learning_rate": 0.0001, + "loss": 4.3059, + "loss/crossentropy": 2.535244107246399, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24744001775979996, + "step": 6798 + }, + { + "epoch": 0.136, + "grad_norm": 2.25, + "grad_norm_var": 0.017704264322916666, + "learning_rate": 0.0001, + "loss": 4.4444, + "loss/crossentropy": 2.0433249473571777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2502296343445778, + "step": 6800 + }, + { + "epoch": 0.13604, + "grad_norm": 2.09375, + "grad_norm_var": 0.0221588134765625, + "learning_rate": 0.0001, + "loss": 4.4619, + "loss/crossentropy": 2.35608172416687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2536633685231209, + "step": 6802 + }, + { + "epoch": 0.13608, + "grad_norm": 2.1875, + "grad_norm_var": 0.022362263997395833, + "learning_rate": 0.0001, + "loss": 4.4493, + "loss/crossentropy": 2.1230265498161316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2276441976428032, + "step": 6804 + }, + { + "epoch": 0.13612, + "grad_norm": 2.15625, + "grad_norm_var": 0.015925089518229168, + "learning_rate": 0.0001, + "loss": 4.2125, + "loss/crossentropy": 2.0186346769332886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22158686816692352, + "step": 6806 + }, + { + "epoch": 0.13616, + "grad_norm": 2.28125, + "grad_norm_var": 0.0151763916015625, + "learning_rate": 0.0001, + "loss": 4.6065, + "loss/crossentropy": 2.4136343002319336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22668009996414185, + "step": 6808 + }, + { + "epoch": 0.1362, + "grad_norm": 2.109375, + "grad_norm_var": 0.012495930989583333, + "learning_rate": 0.0001, + "loss": 4.3372, + "loss/crossentropy": 2.1241788268089294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23394256830215454, + "step": 6810 + }, + { + "epoch": 0.13624, + "grad_norm": 2.4375, + "grad_norm_var": 0.017145792643229168, + "learning_rate": 0.0001, + "loss": 4.6063, + "loss/crossentropy": 1.9051874279975891, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2560805454850197, + "step": 6812 + }, + { + "epoch": 0.13628, + "grad_norm": 2.234375, + "grad_norm_var": 0.015458170572916667, + "learning_rate": 0.0001, + "loss": 4.2419, + "loss/crossentropy": 1.9248363375663757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22457116842269897, + "step": 6814 + }, + { + "epoch": 0.13632, + "grad_norm": 1.921875, + "grad_norm_var": 0.022264607747395835, + "learning_rate": 0.0001, + "loss": 4.5953, + "loss/crossentropy": 2.3065048456192017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.245137557387352, + "step": 6816 + }, + { + "epoch": 0.13636, + "grad_norm": 2.234375, + "grad_norm_var": 0.0152496337890625, + "learning_rate": 0.0001, + "loss": 4.6096, + "loss/crossentropy": 2.152611255645752, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22794383764266968, + "step": 6818 + }, + { + "epoch": 0.1364, + "grad_norm": 2.09375, + "grad_norm_var": 0.016242472330729167, + "learning_rate": 0.0001, + "loss": 4.129, + "loss/crossentropy": 1.9548735618591309, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20764236897230148, + "step": 6820 + }, + { + "epoch": 0.13644, + "grad_norm": 2.1875, + "grad_norm_var": 0.016136678059895833, + "learning_rate": 0.0001, + "loss": 4.3501, + "loss/crossentropy": 2.11979341506958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22793393582105637, + "step": 6822 + }, + { + "epoch": 0.13648, + "grad_norm": 2.15625, + "grad_norm_var": 0.016748046875, + "learning_rate": 0.0001, + "loss": 4.3338, + "loss/crossentropy": 2.351949691772461, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2366446554660797, + "step": 6824 + }, + { + "epoch": 0.13652, + "grad_norm": 2.0625, + "grad_norm_var": 0.018843587239583334, + "learning_rate": 0.0001, + "loss": 4.3764, + "loss/crossentropy": 2.1197460889816284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24759536981582642, + "step": 6826 + }, + { + "epoch": 0.13656, + "grad_norm": 2.0625, + "grad_norm_var": 0.013264973958333334, + "learning_rate": 0.0001, + "loss": 4.1204, + "loss/crossentropy": 1.8491687178611755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20777105540037155, + "step": 6828 + }, + { + "epoch": 0.1366, + "grad_norm": 2.25, + "grad_norm_var": 0.013459269205729167, + "learning_rate": 0.0001, + "loss": 4.4867, + "loss/crossentropy": 1.964136004447937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2368694394826889, + "step": 6830 + }, + { + "epoch": 0.13664, + "grad_norm": 2.140625, + "grad_norm_var": 0.006859334309895834, + "learning_rate": 0.0001, + "loss": 4.469, + "loss/crossentropy": 1.8988104462623596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19548720866441727, + "step": 6832 + }, + { + "epoch": 0.13668, + "grad_norm": 2.125, + "grad_norm_var": 0.0059855143229166664, + "learning_rate": 0.0001, + "loss": 4.3104, + "loss/crossentropy": 1.757002353668213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21120281517505646, + "step": 6834 + }, + { + "epoch": 0.13672, + "grad_norm": 2.109375, + "grad_norm_var": 0.005952962239583333, + "learning_rate": 0.0001, + "loss": 4.4239, + "loss/crossentropy": 1.9414420127868652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22160177677869797, + "step": 6836 + }, + { + "epoch": 0.13676, + "grad_norm": 2.0625, + "grad_norm_var": 0.0072174072265625, + "learning_rate": 0.0001, + "loss": 4.4294, + "loss/crossentropy": 2.280028223991394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24696747958660126, + "step": 6838 + }, + { + "epoch": 0.1368, + "grad_norm": 2.203125, + "grad_norm_var": 0.006787109375, + "learning_rate": 0.0001, + "loss": 4.523, + "loss/crossentropy": 2.106986403465271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24022039771080017, + "step": 6840 + }, + { + "epoch": 0.13684, + "grad_norm": 2.15625, + "grad_norm_var": 0.0349517822265625, + "learning_rate": 0.0001, + "loss": 4.2609, + "loss/crossentropy": 1.9540700912475586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1968660056591034, + "step": 6842 + }, + { + "epoch": 0.13688, + "grad_norm": 2.0625, + "grad_norm_var": 0.03394775390625, + "learning_rate": 0.0001, + "loss": 4.2181, + "loss/crossentropy": 1.6771780252456665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20230162143707275, + "step": 6844 + }, + { + "epoch": 0.13692, + "grad_norm": 2.5625, + "grad_norm_var": 0.04419657389322917, + "learning_rate": 0.0001, + "loss": 4.7567, + "loss/crossentropy": 2.059873402118683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.276339128613472, + "step": 6846 + }, + { + "epoch": 0.13696, + "grad_norm": 2.0625, + "grad_norm_var": 0.04684244791666667, + "learning_rate": 0.0001, + "loss": 4.2512, + "loss/crossentropy": 1.7943353056907654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1893974468111992, + "step": 6848 + }, + { + "epoch": 0.137, + "grad_norm": 2.265625, + "grad_norm_var": 0.046223958333333336, + "learning_rate": 0.0001, + "loss": 4.687, + "loss/crossentropy": 2.314136028289795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24898302555084229, + "step": 6850 + }, + { + "epoch": 0.13704, + "grad_norm": 2.328125, + "grad_norm_var": 0.04383036295572917, + "learning_rate": 0.0001, + "loss": 4.4257, + "loss/crossentropy": 2.0062466263771057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23169535398483276, + "step": 6852 + }, + { + "epoch": 0.13708, + "grad_norm": 2.09375, + "grad_norm_var": 0.044873046875, + "learning_rate": 0.0001, + "loss": 4.5787, + "loss/crossentropy": 2.3600821495056152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25484780967235565, + "step": 6854 + }, + { + "epoch": 0.13712, + "grad_norm": 2.28125, + "grad_norm_var": 0.043675740559895836, + "learning_rate": 0.0001, + "loss": 4.2113, + "loss/crossentropy": 1.885023295879364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.255520723760128, + "step": 6856 + }, + { + "epoch": 0.13716, + "grad_norm": 2.234375, + "grad_norm_var": 0.019806925455729166, + "learning_rate": 0.0001, + "loss": 4.6493, + "loss/crossentropy": 2.2864162921905518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22469403594732285, + "step": 6858 + }, + { + "epoch": 0.1372, + "grad_norm": 2.09375, + "grad_norm_var": 0.018961588541666668, + "learning_rate": 0.0001, + "loss": 4.4017, + "loss/crossentropy": 1.908643126487732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22016740590333939, + "step": 6860 + }, + { + "epoch": 0.13724, + "grad_norm": 2.3125, + "grad_norm_var": 0.010282389322916667, + "learning_rate": 0.0001, + "loss": 4.7255, + "loss/crossentropy": 2.1028786301612854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23632919788360596, + "step": 6862 + }, + { + "epoch": 0.13728, + "grad_norm": 2.546875, + "grad_norm_var": 0.01539306640625, + "learning_rate": 0.0001, + "loss": 4.4043, + "loss/crossentropy": 2.0363592505455017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22131157666444778, + "step": 6864 + }, + { + "epoch": 0.13732, + "grad_norm": 2.171875, + "grad_norm_var": 0.0158843994140625, + "learning_rate": 0.0001, + "loss": 4.4357, + "loss/crossentropy": 2.030495524406433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.233994759619236, + "step": 6866 + }, + { + "epoch": 0.13736, + "grad_norm": 2.046875, + "grad_norm_var": 0.018033854166666665, + "learning_rate": 0.0001, + "loss": 4.3036, + "loss/crossentropy": 1.6365603804588318, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1909056007862091, + "step": 6868 + }, + { + "epoch": 0.1374, + "grad_norm": 2.0625, + "grad_norm_var": 0.015217081705729166, + "learning_rate": 0.0001, + "loss": 4.655, + "loss/crossentropy": 2.205111026763916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2267644703388214, + "step": 6870 + }, + { + "epoch": 0.13744, + "grad_norm": 2.0625, + "grad_norm_var": 0.016437784830729166, + "learning_rate": 0.0001, + "loss": 4.4207, + "loss/crossentropy": 2.0179646015167236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2307143434882164, + "step": 6872 + }, + { + "epoch": 0.13748, + "grad_norm": 2.09375, + "grad_norm_var": 0.018290201822916668, + "learning_rate": 0.0001, + "loss": 4.2154, + "loss/crossentropy": 1.9697463512420654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2207925096154213, + "step": 6874 + }, + { + "epoch": 0.13752, + "grad_norm": 2.265625, + "grad_norm_var": 0.019189453125, + "learning_rate": 0.0001, + "loss": 4.5691, + "loss/crossentropy": 2.186875820159912, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25762687623500824, + "step": 6876 + }, + { + "epoch": 0.13756, + "grad_norm": 2.109375, + "grad_norm_var": 0.017867024739583334, + "learning_rate": 0.0001, + "loss": 4.2713, + "loss/crossentropy": 2.3203837871551514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23470903187990189, + "step": 6878 + }, + { + "epoch": 0.1376, + "grad_norm": 2.1875, + "grad_norm_var": 0.0066640218098958336, + "learning_rate": 0.0001, + "loss": 4.4154, + "loss/crossentropy": 2.2642472982406616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2454553246498108, + "step": 6880 + }, + { + "epoch": 0.13764, + "grad_norm": 2.1875, + "grad_norm_var": 0.008687337239583334, + "learning_rate": 0.0001, + "loss": 4.4076, + "loss/crossentropy": 1.9313859343528748, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21797804534435272, + "step": 6882 + }, + { + "epoch": 0.13768, + "grad_norm": 3.4375, + "grad_norm_var": 0.11204020182291667, + "learning_rate": 0.0001, + "loss": 4.7846, + "loss/crossentropy": 2.5469977855682373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2651800066232681, + "step": 6884 + }, + { + "epoch": 0.13772, + "grad_norm": 2.296875, + "grad_norm_var": 0.11030171712239584, + "learning_rate": 0.0001, + "loss": 4.4893, + "loss/crossentropy": 2.549328088760376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2566085457801819, + "step": 6886 + }, + { + "epoch": 0.13776, + "grad_norm": 2.03125, + "grad_norm_var": 0.1126617431640625, + "learning_rate": 0.0001, + "loss": 4.4927, + "loss/crossentropy": 2.3094369769096375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24709751456975937, + "step": 6888 + }, + { + "epoch": 0.1378, + "grad_norm": 2.109375, + "grad_norm_var": 0.1097808837890625, + "learning_rate": 0.0001, + "loss": 4.2412, + "loss/crossentropy": 1.5071046948432922, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19776180386543274, + "step": 6890 + }, + { + "epoch": 0.13784, + "grad_norm": 2.078125, + "grad_norm_var": 0.1101226806640625, + "learning_rate": 0.0001, + "loss": 4.3705, + "loss/crossentropy": 2.0064221620559692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20836234837770462, + "step": 6892 + }, + { + "epoch": 0.13788, + "grad_norm": 2.078125, + "grad_norm_var": 0.10816650390625, + "learning_rate": 0.0001, + "loss": 4.3608, + "loss/crossentropy": 2.1216301321983337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22151901572942734, + "step": 6894 + }, + { + "epoch": 0.13792, + "grad_norm": 2.25, + "grad_norm_var": 0.1073150634765625, + "learning_rate": 0.0001, + "loss": 4.4168, + "loss/crossentropy": 1.8417679071426392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21999332308769226, + "step": 6896 + }, + { + "epoch": 0.13796, + "grad_norm": 2.328125, + "grad_norm_var": 0.10695699055989584, + "learning_rate": 0.0001, + "loss": 4.7005, + "loss/crossentropy": 2.4651769399642944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27029043436050415, + "step": 6898 + }, + { + "epoch": 0.138, + "grad_norm": 2.21875, + "grad_norm_var": 0.014188639322916667, + "learning_rate": 0.0001, + "loss": 4.2985, + "loss/crossentropy": 1.7225988507270813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20930497348308563, + "step": 6900 + }, + { + "epoch": 0.13804, + "grad_norm": 2.34375, + "grad_norm_var": 0.015360514322916666, + "learning_rate": 0.0001, + "loss": 4.1156, + "loss/crossentropy": 2.1218297481536865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21276423335075378, + "step": 6902 + }, + { + "epoch": 0.13808, + "grad_norm": 2.25, + "grad_norm_var": 0.010856119791666667, + "learning_rate": 0.0001, + "loss": 4.2706, + "loss/crossentropy": 2.040019452571869, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2438269630074501, + "step": 6904 + }, + { + "epoch": 0.13812, + "grad_norm": 15.8125, + "grad_norm_var": 11.600536092122395, + "learning_rate": 0.0001, + "loss": 4.5041, + "loss/crossentropy": 1.8229625225067139, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22197778522968292, + "step": 6906 + }, + { + "epoch": 0.13816, + "grad_norm": 2.25, + "grad_norm_var": 11.543973795572917, + "learning_rate": 0.0001, + "loss": 4.7087, + "loss/crossentropy": 2.453408360481262, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24046239256858826, + "step": 6908 + }, + { + "epoch": 0.1382, + "grad_norm": 2.046875, + "grad_norm_var": 11.55152587890625, + "learning_rate": 0.0001, + "loss": 4.4806, + "loss/crossentropy": 2.2724320888519287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23739346861839294, + "step": 6910 + }, + { + "epoch": 0.13824, + "grad_norm": 2.125, + "grad_norm_var": 11.562272135416666, + "learning_rate": 0.0001, + "loss": 4.2747, + "loss/crossentropy": 2.2382686138153076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22692064195871353, + "step": 6912 + }, + { + "epoch": 0.13828, + "grad_norm": 2.3125, + "grad_norm_var": 11.54869384765625, + "learning_rate": 0.0001, + "loss": 4.6867, + "loss/crossentropy": 2.021562337875366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24746088683605194, + "step": 6914 + }, + { + "epoch": 0.13832, + "grad_norm": 2.21875, + "grad_norm_var": 11.546556599934895, + "learning_rate": 0.0001, + "loss": 4.3247, + "loss/crossentropy": 2.1071943044662476, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2144002616405487, + "step": 6916 + }, + { + "epoch": 0.13836, + "grad_norm": 2.28125, + "grad_norm_var": 11.550846354166667, + "learning_rate": 0.0001, + "loss": 4.2686, + "loss/crossentropy": 1.9641517400741577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22584721446037292, + "step": 6918 + }, + { + "epoch": 0.1384, + "grad_norm": 2.078125, + "grad_norm_var": 11.559130859375, + "learning_rate": 0.0001, + "loss": 4.3194, + "loss/crossentropy": 2.4430564641952515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24351171404123306, + "step": 6920 + }, + { + "epoch": 0.13844, + "grad_norm": 2.125, + "grad_norm_var": 0.022484334309895833, + "learning_rate": 0.0001, + "loss": 4.4202, + "loss/crossentropy": 2.2237725257873535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2248261496424675, + "step": 6922 + }, + { + "epoch": 0.13848, + "grad_norm": 2.09375, + "grad_norm_var": 0.015412394205729167, + "learning_rate": 0.0001, + "loss": 4.2028, + "loss/crossentropy": 1.7291913628578186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19805100560188293, + "step": 6924 + }, + { + "epoch": 0.13852, + "grad_norm": 2.15625, + "grad_norm_var": 0.013923136393229167, + "learning_rate": 0.0001, + "loss": 4.3972, + "loss/crossentropy": 1.807108223438263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20386626571416855, + "step": 6926 + }, + { + "epoch": 0.13856, + "grad_norm": 2.28125, + "grad_norm_var": 0.01422119140625, + "learning_rate": 0.0001, + "loss": 4.5188, + "loss/crossentropy": 2.510676622390747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24666880816221237, + "step": 6928 + }, + { + "epoch": 0.1386, + "grad_norm": 2.015625, + "grad_norm_var": 0.007515462239583334, + "learning_rate": 0.0001, + "loss": 4.2006, + "loss/crossentropy": 1.9420115947723389, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2214776575565338, + "step": 6930 + }, + { + "epoch": 0.13864, + "grad_norm": 2.3125, + "grad_norm_var": 0.010837554931640625, + "learning_rate": 0.0001, + "loss": 4.4445, + "loss/crossentropy": 2.2288190722465515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23719585686922073, + "step": 6932 + }, + { + "epoch": 0.13868, + "grad_norm": 2.09375, + "grad_norm_var": 0.011043294270833334, + "learning_rate": 0.0001, + "loss": 4.071, + "loss/crossentropy": 2.04274183511734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21653369069099426, + "step": 6934 + }, + { + "epoch": 0.13872, + "grad_norm": 2.296875, + "grad_norm_var": 0.0121002197265625, + "learning_rate": 0.0001, + "loss": 4.4041, + "loss/crossentropy": 1.9149779081344604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19941364973783493, + "step": 6936 + }, + { + "epoch": 0.13876, + "grad_norm": 2.15625, + "grad_norm_var": 0.011554972330729166, + "learning_rate": 0.0001, + "loss": 4.2577, + "loss/crossentropy": 1.7983179092407227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18979590386152267, + "step": 6938 + }, + { + "epoch": 0.1388, + "grad_norm": 2.296875, + "grad_norm_var": 0.023164876302083335, + "learning_rate": 0.0001, + "loss": 4.3314, + "loss/crossentropy": 2.1919915080070496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22652295976877213, + "step": 6940 + }, + { + "epoch": 0.13884, + "grad_norm": 2.21875, + "grad_norm_var": 0.023152669270833332, + "learning_rate": 0.0001, + "loss": 4.494, + "loss/crossentropy": 2.0362821221351624, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24296525120735168, + "step": 6942 + }, + { + "epoch": 0.13888, + "grad_norm": 2.21875, + "grad_norm_var": 0.023653157552083335, + "learning_rate": 0.0001, + "loss": 4.4135, + "loss/crossentropy": 2.0371538400650024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2217850610613823, + "step": 6944 + }, + { + "epoch": 0.13892, + "grad_norm": 2.5, + "grad_norm_var": 0.0268310546875, + "learning_rate": 0.0001, + "loss": 4.3371, + "loss/crossentropy": 1.9137988686561584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2218162938952446, + "step": 6946 + }, + { + "epoch": 0.13896, + "grad_norm": 2.46875, + "grad_norm_var": 0.03438898722330729, + "learning_rate": 0.0001, + "loss": 4.6521, + "loss/crossentropy": 2.3215843439102173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2576482892036438, + "step": 6948 + }, + { + "epoch": 0.139, + "grad_norm": 2.171875, + "grad_norm_var": 0.026590983072916668, + "learning_rate": 0.0001, + "loss": 4.6004, + "loss/crossentropy": 2.169154405593872, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24629274010658264, + "step": 6950 + }, + { + "epoch": 0.13904, + "grad_norm": 2.203125, + "grad_norm_var": 0.03173421223958333, + "learning_rate": 0.0001, + "loss": 4.3549, + "loss/crossentropy": 2.1355135440826416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2316850870847702, + "step": 6952 + }, + { + "epoch": 0.13908, + "grad_norm": 2.140625, + "grad_norm_var": 0.03183186848958333, + "learning_rate": 0.0001, + "loss": 4.3462, + "loss/crossentropy": 2.264985144138336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23467915505170822, + "step": 6954 + }, + { + "epoch": 0.13912, + "grad_norm": 2.203125, + "grad_norm_var": 0.025926717122395835, + "learning_rate": 0.0001, + "loss": 4.6559, + "loss/crossentropy": 2.1007654666900635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2376948595046997, + "step": 6956 + }, + { + "epoch": 0.13916, + "grad_norm": 2.28125, + "grad_norm_var": 0.02603759765625, + "learning_rate": 0.0001, + "loss": 4.4871, + "loss/crossentropy": 2.284608840942383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24811238050460815, + "step": 6958 + }, + { + "epoch": 0.1392, + "grad_norm": 2.125, + "grad_norm_var": 0.025911458333333335, + "learning_rate": 0.0001, + "loss": 4.2338, + "loss/crossentropy": 1.657732367515564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19541934877634048, + "step": 6960 + }, + { + "epoch": 0.13924, + "grad_norm": 2.171875, + "grad_norm_var": 0.022391764322916667, + "learning_rate": 0.0001, + "loss": 4.3832, + "loss/crossentropy": 1.9607325792312622, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2269211858510971, + "step": 6962 + }, + { + "epoch": 0.13928, + "grad_norm": 2.09375, + "grad_norm_var": 0.006571451822916667, + "learning_rate": 0.0001, + "loss": 4.3759, + "loss/crossentropy": 1.7454752326011658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21780573576688766, + "step": 6964 + }, + { + "epoch": 0.13932, + "grad_norm": 2.421875, + "grad_norm_var": 0.010350545247395834, + "learning_rate": 0.0001, + "loss": 4.7664, + "loss/crossentropy": 2.001866638660431, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24929454922676086, + "step": 6966 + }, + { + "epoch": 0.13936, + "grad_norm": 2.21875, + "grad_norm_var": 0.008006795247395834, + "learning_rate": 0.0001, + "loss": 4.4181, + "loss/crossentropy": 1.9167855978012085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22114858031272888, + "step": 6968 + }, + { + "epoch": 0.1394, + "grad_norm": 2.15625, + "grad_norm_var": 0.008003743489583333, + "learning_rate": 0.0001, + "loss": 4.2284, + "loss/crossentropy": 2.0324739813804626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23271320760250092, + "step": 6970 + }, + { + "epoch": 0.13944, + "grad_norm": 2.25, + "grad_norm_var": 0.016307576497395834, + "learning_rate": 0.0001, + "loss": 4.5375, + "loss/crossentropy": 2.162013590335846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22832238674163818, + "step": 6972 + }, + { + "epoch": 0.13948, + "grad_norm": 2.203125, + "grad_norm_var": 0.018944295247395833, + "learning_rate": 0.0001, + "loss": 4.1406, + "loss/crossentropy": 2.074672818183899, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2402098923921585, + "step": 6974 + }, + { + "epoch": 0.13952, + "grad_norm": 2.0625, + "grad_norm_var": 0.019758097330729165, + "learning_rate": 0.0001, + "loss": 4.4221, + "loss/crossentropy": 1.9982299208641052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21126113086938858, + "step": 6976 + }, + { + "epoch": 0.13956, + "grad_norm": 2.171875, + "grad_norm_var": 0.021751912434895833, + "learning_rate": 0.0001, + "loss": 4.3391, + "loss/crossentropy": 1.944049894809723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2296794354915619, + "step": 6978 + }, + { + "epoch": 0.1396, + "grad_norm": 2.21875, + "grad_norm_var": 0.02047119140625, + "learning_rate": 0.0001, + "loss": 4.4344, + "loss/crossentropy": 2.308506488800049, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2589666247367859, + "step": 6980 + }, + { + "epoch": 0.13964, + "grad_norm": 2.296875, + "grad_norm_var": 0.017724609375, + "learning_rate": 0.0001, + "loss": 4.2867, + "loss/crossentropy": 2.129163682460785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21905581653118134, + "step": 6982 + }, + { + "epoch": 0.13968, + "grad_norm": 2.234375, + "grad_norm_var": 0.017902628580729166, + "learning_rate": 0.0001, + "loss": 4.3023, + "loss/crossentropy": 1.8560669422149658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2119361013174057, + "step": 6984 + }, + { + "epoch": 0.13972, + "grad_norm": 2.140625, + "grad_norm_var": 0.018602498372395835, + "learning_rate": 0.0001, + "loss": 4.1212, + "loss/crossentropy": 1.8194095492362976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20872193574905396, + "step": 6986 + }, + { + "epoch": 0.13976, + "grad_norm": 2.15625, + "grad_norm_var": 0.005501302083333334, + "learning_rate": 0.0001, + "loss": 4.2612, + "loss/crossentropy": 2.0200153589248657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20160746574401855, + "step": 6988 + }, + { + "epoch": 0.1398, + "grad_norm": 2.359375, + "grad_norm_var": 0.007298787434895833, + "learning_rate": 0.0001, + "loss": 4.2757, + "loss/crossentropy": 1.982479751110077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23396103084087372, + "step": 6990 + }, + { + "epoch": 0.13984, + "grad_norm": 2.171875, + "grad_norm_var": 0.006494140625, + "learning_rate": 0.0001, + "loss": 4.5075, + "loss/crossentropy": 2.17054283618927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23827539384365082, + "step": 6992 + }, + { + "epoch": 0.13988, + "grad_norm": 2.171875, + "grad_norm_var": 0.0056640625, + "learning_rate": 0.0001, + "loss": 4.1794, + "loss/crossentropy": 1.619499921798706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1917721927165985, + "step": 6994 + }, + { + "epoch": 0.13992, + "grad_norm": 2.15625, + "grad_norm_var": 0.005353800455729167, + "learning_rate": 0.0001, + "loss": 4.3833, + "loss/crossentropy": 2.1082500219345093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23768241703510284, + "step": 6996 + }, + { + "epoch": 0.13996, + "grad_norm": 2.1875, + "grad_norm_var": 0.004076131184895833, + "learning_rate": 0.0001, + "loss": 4.6731, + "loss/crossentropy": 1.8480825424194336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21046122163534164, + "step": 6998 + }, + { + "epoch": 0.14, + "grad_norm": 2.4375, + "grad_norm_var": 0.008463541666666666, + "learning_rate": 0.0001, + "loss": 4.5285, + "loss/crossentropy": 2.0547631978988647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22980494797229767, + "step": 7000 + }, + { + "epoch": 0.14004, + "grad_norm": 2.15625, + "grad_norm_var": 0.007835896809895833, + "learning_rate": 0.0001, + "loss": 4.4625, + "loss/crossentropy": 2.0695141553878784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2100546732544899, + "step": 7002 + }, + { + "epoch": 0.14008, + "grad_norm": 1.9921875, + "grad_norm_var": 0.012277984619140625, + "learning_rate": 0.0001, + "loss": 4.3716, + "loss/crossentropy": 2.105263113975525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24552470445632935, + "step": 7004 + }, + { + "epoch": 0.14012, + "grad_norm": 2.375, + "grad_norm_var": 0.014427693684895833, + "learning_rate": 0.0001, + "loss": 4.3566, + "loss/crossentropy": 2.03000670671463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24780434370040894, + "step": 7006 + }, + { + "epoch": 0.14016, + "grad_norm": 2.40625, + "grad_norm_var": 0.0173980712890625, + "learning_rate": 0.0001, + "loss": 4.4758, + "loss/crossentropy": 2.288944959640503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2760937511920929, + "step": 7008 + }, + { + "epoch": 0.1402, + "grad_norm": 1.9609375, + "grad_norm_var": 0.06544570922851563, + "learning_rate": 0.0001, + "loss": 3.9326, + "loss/crossentropy": 1.790147304534912, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1905786320567131, + "step": 7010 + }, + { + "epoch": 0.14024, + "grad_norm": 2.609375, + "grad_norm_var": 0.07765884399414062, + "learning_rate": 0.0001, + "loss": 4.6364, + "loss/crossentropy": 2.008346378803253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23138166218996048, + "step": 7012 + }, + { + "epoch": 0.14028, + "grad_norm": 2.1875, + "grad_norm_var": 0.07974014282226563, + "learning_rate": 0.0001, + "loss": 4.2304, + "loss/crossentropy": 1.9694496393203735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21347863972187042, + "step": 7014 + }, + { + "epoch": 0.14032, + "grad_norm": 2.203125, + "grad_norm_var": 0.07948989868164062, + "learning_rate": 0.0001, + "loss": 4.3685, + "loss/crossentropy": 2.0907286405563354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23349857330322266, + "step": 7016 + }, + { + "epoch": 0.14036, + "grad_norm": 2.40625, + "grad_norm_var": 0.07850316365559896, + "learning_rate": 0.0001, + "loss": 4.6454, + "loss/crossentropy": 2.161414623260498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23841089010238647, + "step": 7018 + }, + { + "epoch": 0.1404, + "grad_norm": 2.25, + "grad_norm_var": 0.07315266927083333, + "learning_rate": 0.0001, + "loss": 4.4868, + "loss/crossentropy": 2.1402887105941772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25493840873241425, + "step": 7020 + }, + { + "epoch": 0.14044, + "grad_norm": 2.234375, + "grad_norm_var": 0.06809666951497396, + "learning_rate": 0.0001, + "loss": 4.1248, + "loss/crossentropy": 2.0703811049461365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24300381541252136, + "step": 7022 + }, + { + "epoch": 0.14048, + "grad_norm": 2.53125, + "grad_norm_var": 0.0716875712076823, + "learning_rate": 0.0001, + "loss": 4.6165, + "loss/crossentropy": 2.152569532394409, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23347805440425873, + "step": 7024 + }, + { + "epoch": 0.14052, + "grad_norm": 2.171875, + "grad_norm_var": 0.03453369140625, + "learning_rate": 0.0001, + "loss": 4.1642, + "loss/crossentropy": 2.0831095576286316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2198955938220024, + "step": 7026 + }, + { + "epoch": 0.14056, + "grad_norm": 2.234375, + "grad_norm_var": 0.019498697916666665, + "learning_rate": 0.0001, + "loss": 4.4929, + "loss/crossentropy": 2.1631508469581604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23971489816904068, + "step": 7028 + }, + { + "epoch": 0.1406, + "grad_norm": 2.421875, + "grad_norm_var": 0.019481404622395834, + "learning_rate": 0.0001, + "loss": 4.4687, + "loss/crossentropy": 2.1683043241500854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2628757208585739, + "step": 7030 + }, + { + "epoch": 0.14064, + "grad_norm": 2.1875, + "grad_norm_var": 0.07377827962239583, + "learning_rate": 0.0001, + "loss": 4.522, + "loss/crossentropy": 2.021001398563385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22922180593013763, + "step": 7032 + }, + { + "epoch": 0.14068, + "grad_norm": 2.265625, + "grad_norm_var": 0.0725494384765625, + "learning_rate": 0.0001, + "loss": 4.7729, + "loss/crossentropy": 2.267430543899536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27525072544813156, + "step": 7034 + }, + { + "epoch": 0.14072, + "grad_norm": 2.0625, + "grad_norm_var": 0.0788726806640625, + "learning_rate": 0.0001, + "loss": 4.3118, + "loss/crossentropy": 2.066729426383972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21959475427865982, + "step": 7036 + }, + { + "epoch": 0.14076, + "grad_norm": 2.0625, + "grad_norm_var": 0.08162333170572916, + "learning_rate": 0.0001, + "loss": 4.3203, + "loss/crossentropy": 1.7972697019577026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21773608028888702, + "step": 7038 + }, + { + "epoch": 0.1408, + "grad_norm": 2.171875, + "grad_norm_var": 0.08068745930989583, + "learning_rate": 0.0001, + "loss": 4.0745, + "loss/crossentropy": 1.751904845237732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19929176568984985, + "step": 7040 + }, + { + "epoch": 0.14084, + "grad_norm": 2.171875, + "grad_norm_var": 0.07649332682291667, + "learning_rate": 0.0001, + "loss": 4.4188, + "loss/crossentropy": 1.8432873487472534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21235650032758713, + "step": 7042 + }, + { + "epoch": 0.14088, + "grad_norm": 2.15625, + "grad_norm_var": 0.07618815104166667, + "learning_rate": 0.0001, + "loss": 4.2343, + "loss/crossentropy": 1.9589285850524902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21506928652524948, + "step": 7044 + }, + { + "epoch": 0.14092, + "grad_norm": 2.046875, + "grad_norm_var": 0.0788726806640625, + "learning_rate": 0.0001, + "loss": 4.229, + "loss/crossentropy": 2.3658028841018677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23405643552541733, + "step": 7046 + }, + { + "epoch": 0.14096, + "grad_norm": 2.140625, + "grad_norm_var": 0.008837890625, + "learning_rate": 0.0001, + "loss": 4.3922, + "loss/crossentropy": 2.088135540485382, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22823208570480347, + "step": 7048 + }, + { + "epoch": 0.141, + "grad_norm": 2.09375, + "grad_norm_var": 0.006810506184895833, + "learning_rate": 0.0001, + "loss": 4.2367, + "loss/crossentropy": 1.9309821724891663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21409911662340164, + "step": 7050 + }, + { + "epoch": 0.14104, + "grad_norm": 2.1875, + "grad_norm_var": 0.006636555989583333, + "learning_rate": 0.0001, + "loss": 4.5237, + "loss/crossentropy": 2.5411492586135864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25017087161540985, + "step": 7052 + }, + { + "epoch": 0.14108, + "grad_norm": 2.03125, + "grad_norm_var": 0.007352701822916667, + "learning_rate": 0.0001, + "loss": 4.4998, + "loss/crossentropy": 2.3210322856903076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23967693746089935, + "step": 7054 + }, + { + "epoch": 0.14112, + "grad_norm": 2.046875, + "grad_norm_var": 0.009370930989583333, + "learning_rate": 0.0001, + "loss": 4.4607, + "loss/crossentropy": 2.054674744606018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23270255327224731, + "step": 7056 + }, + { + "epoch": 0.14116, + "grad_norm": 2.125, + "grad_norm_var": 0.008968098958333334, + "learning_rate": 0.0001, + "loss": 4.5423, + "loss/crossentropy": 2.545789122581482, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25560182332992554, + "step": 7058 + }, + { + "epoch": 0.1412, + "grad_norm": 2.171875, + "grad_norm_var": 0.008861287434895834, + "learning_rate": 0.0001, + "loss": 4.2682, + "loss/crossentropy": 2.262348175048828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24763159453868866, + "step": 7060 + }, + { + "epoch": 0.14124, + "grad_norm": 2.1875, + "grad_norm_var": 0.008447265625, + "learning_rate": 0.0001, + "loss": 4.618, + "loss/crossentropy": 2.1045475602149963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2472379505634308, + "step": 7062 + }, + { + "epoch": 0.14128, + "grad_norm": 2.046875, + "grad_norm_var": 0.0093170166015625, + "learning_rate": 0.0001, + "loss": 4.2312, + "loss/crossentropy": 1.5632115006446838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19010942429304123, + "step": 7064 + }, + { + "epoch": 0.14132, + "grad_norm": 2.171875, + "grad_norm_var": 0.0116851806640625, + "learning_rate": 0.0001, + "loss": 4.2638, + "loss/crossentropy": 2.0847875475883484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21371345967054367, + "step": 7066 + }, + { + "epoch": 0.14136, + "grad_norm": 2.25, + "grad_norm_var": 0.018016560872395834, + "learning_rate": 0.0001, + "loss": 4.5171, + "loss/crossentropy": 2.2243804931640625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23065787553787231, + "step": 7068 + }, + { + "epoch": 0.1414, + "grad_norm": 2.171875, + "grad_norm_var": 0.016927083333333332, + "learning_rate": 0.0001, + "loss": 4.2812, + "loss/crossentropy": 1.9477753639221191, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21616356819868088, + "step": 7070 + }, + { + "epoch": 0.14144, + "grad_norm": 2.5625, + "grad_norm_var": 0.024169921875, + "learning_rate": 0.0001, + "loss": 4.7005, + "loss/crossentropy": 2.2598072290420532, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25795431435108185, + "step": 7072 + }, + { + "epoch": 0.14148, + "grad_norm": 2.171875, + "grad_norm_var": 0.022652180989583333, + "learning_rate": 0.0001, + "loss": 4.4345, + "loss/crossentropy": 1.8817242980003357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21974974125623703, + "step": 7074 + }, + { + "epoch": 0.14152, + "grad_norm": 2.0, + "grad_norm_var": 0.0256744384765625, + "learning_rate": 0.0001, + "loss": 4.5688, + "loss/crossentropy": 2.5275847911834717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24608048796653748, + "step": 7076 + }, + { + "epoch": 0.14156, + "grad_norm": 2.125, + "grad_norm_var": 0.0265045166015625, + "learning_rate": 0.0001, + "loss": 4.3937, + "loss/crossentropy": 2.400865852832794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22960034757852554, + "step": 7078 + }, + { + "epoch": 0.1416, + "grad_norm": 2.140625, + "grad_norm_var": 0.0237457275390625, + "learning_rate": 0.0001, + "loss": 4.4005, + "loss/crossentropy": 1.908901333808899, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22271078824996948, + "step": 7080 + }, + { + "epoch": 0.14164, + "grad_norm": 2.171875, + "grad_norm_var": 0.02213134765625, + "learning_rate": 0.0001, + "loss": 4.1384, + "loss/crossentropy": 2.3330780267715454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23944097012281418, + "step": 7082 + }, + { + "epoch": 0.14168, + "grad_norm": 2.140625, + "grad_norm_var": 0.018941243489583332, + "learning_rate": 0.0001, + "loss": 4.3516, + "loss/crossentropy": 2.332213521003723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23576530069112778, + "step": 7084 + }, + { + "epoch": 0.14172, + "grad_norm": 2.125, + "grad_norm_var": 0.022435506184895832, + "learning_rate": 0.0001, + "loss": 4.4654, + "loss/crossentropy": 2.2269067764282227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25341375917196274, + "step": 7086 + }, + { + "epoch": 0.14176, + "grad_norm": 2.203125, + "grad_norm_var": 0.01470947265625, + "learning_rate": 0.0001, + "loss": 4.2491, + "loss/crossentropy": 2.461983561515808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24491792172193527, + "step": 7088 + }, + { + "epoch": 0.1418, + "grad_norm": 1.9765625, + "grad_norm_var": 0.013952382405598958, + "learning_rate": 0.0001, + "loss": 4.2348, + "loss/crossentropy": 2.428719997406006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2394469603896141, + "step": 7090 + }, + { + "epoch": 0.14184, + "grad_norm": 2.109375, + "grad_norm_var": 0.011433664957682292, + "learning_rate": 0.0001, + "loss": 4.2242, + "loss/crossentropy": 2.32351291179657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23441501706838608, + "step": 7092 + }, + { + "epoch": 0.14188, + "grad_norm": 2.171875, + "grad_norm_var": 0.011482493082682291, + "learning_rate": 0.0001, + "loss": 4.4155, + "loss/crossentropy": 2.1165764331817627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2350979596376419, + "step": 7094 + }, + { + "epoch": 0.14192, + "grad_norm": 2.3125, + "grad_norm_var": 0.014288075764973958, + "learning_rate": 0.0001, + "loss": 4.4952, + "loss/crossentropy": 2.150681734085083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2475891336798668, + "step": 7096 + }, + { + "epoch": 0.14196, + "grad_norm": 2.046875, + "grad_norm_var": 0.013079579671223958, + "learning_rate": 0.0001, + "loss": 4.2753, + "loss/crossentropy": 2.038177013397217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2293965071439743, + "step": 7098 + }, + { + "epoch": 0.142, + "grad_norm": 2.0625, + "grad_norm_var": 0.014062245686848959, + "learning_rate": 0.0001, + "loss": 4.298, + "loss/crossentropy": 1.899521827697754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20856370776891708, + "step": 7100 + }, + { + "epoch": 0.14204, + "grad_norm": 2.296875, + "grad_norm_var": 0.010593414306640625, + "learning_rate": 0.0001, + "loss": 4.3449, + "loss/crossentropy": 1.9807924032211304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22756796330213547, + "step": 7102 + }, + { + "epoch": 0.14208, + "grad_norm": 2.0625, + "grad_norm_var": 0.009421539306640626, + "learning_rate": 0.0001, + "loss": 4.2893, + "loss/crossentropy": 2.158667206764221, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22594892233610153, + "step": 7104 + }, + { + "epoch": 0.14212, + "grad_norm": 2.15625, + "grad_norm_var": 0.007811482747395833, + "learning_rate": 0.0001, + "loss": 4.3751, + "loss/crossentropy": 2.3133270144462585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22925584018230438, + "step": 7106 + }, + { + "epoch": 0.14216, + "grad_norm": 2.015625, + "grad_norm_var": 0.010107421875, + "learning_rate": 0.0001, + "loss": 4.2762, + "loss/crossentropy": 1.9796301126480103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21650104224681854, + "step": 7108 + }, + { + "epoch": 0.1422, + "grad_norm": 2.09375, + "grad_norm_var": 0.010054524739583333, + "learning_rate": 0.0001, + "loss": 4.2778, + "loss/crossentropy": 2.092659056186676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23407263308763504, + "step": 7110 + }, + { + "epoch": 0.14224, + "grad_norm": 2.125, + "grad_norm_var": 0.009056599934895833, + "learning_rate": 0.0001, + "loss": 4.6078, + "loss/crossentropy": 1.970819890499115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2262839823961258, + "step": 7112 + }, + { + "epoch": 0.14228, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011923980712890626, + "learning_rate": 0.0001, + "loss": 4.0642, + "loss/crossentropy": 1.5877107381820679, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.182430237531662, + "step": 7114 + }, + { + "epoch": 0.14232, + "grad_norm": 2.21875, + "grad_norm_var": 0.020336659749348958, + "learning_rate": 0.0001, + "loss": 4.6997, + "loss/crossentropy": 2.3208755254745483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25018931180238724, + "step": 7116 + }, + { + "epoch": 0.14236, + "grad_norm": 2.203125, + "grad_norm_var": 0.01904271443684896, + "learning_rate": 0.0001, + "loss": 4.4483, + "loss/crossentropy": 2.1348973512649536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23516173660755157, + "step": 7118 + }, + { + "epoch": 0.1424, + "grad_norm": 2.46875, + "grad_norm_var": 0.02533543904622396, + "learning_rate": 0.0001, + "loss": 4.3443, + "loss/crossentropy": 2.1442995071411133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23094037175178528, + "step": 7120 + }, + { + "epoch": 0.14244, + "grad_norm": 2.1875, + "grad_norm_var": 0.025608062744140625, + "learning_rate": 0.0001, + "loss": 4.3297, + "loss/crossentropy": 2.24001145362854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2479296177625656, + "step": 7122 + }, + { + "epoch": 0.14248, + "grad_norm": 2.734375, + "grad_norm_var": 0.038917795817057295, + "learning_rate": 0.0001, + "loss": 4.4122, + "loss/crossentropy": 1.8284733891487122, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23670874536037445, + "step": 7124 + }, + { + "epoch": 0.14252, + "grad_norm": 2.375, + "grad_norm_var": 0.04146499633789062, + "learning_rate": 0.0001, + "loss": 4.6223, + "loss/crossentropy": 2.1003533601760864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24235840141773224, + "step": 7126 + }, + { + "epoch": 0.14256, + "grad_norm": 2.078125, + "grad_norm_var": 0.04201024373372396, + "learning_rate": 0.0001, + "loss": 4.2591, + "loss/crossentropy": 2.3661316633224487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.258208692073822, + "step": 7128 + }, + { + "epoch": 0.1426, + "grad_norm": 2.109375, + "grad_norm_var": 0.030989583333333334, + "learning_rate": 0.0001, + "loss": 4.3288, + "loss/crossentropy": 2.2374593019485474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2543400973081589, + "step": 7130 + }, + { + "epoch": 0.14264, + "grad_norm": 2.15625, + "grad_norm_var": 0.02945556640625, + "learning_rate": 0.0001, + "loss": 4.5939, + "loss/crossentropy": 1.9141342639923096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23257911205291748, + "step": 7132 + }, + { + "epoch": 0.14268, + "grad_norm": 2.4375, + "grad_norm_var": 0.031224568684895832, + "learning_rate": 0.0001, + "loss": 4.1893, + "loss/crossentropy": 1.992666780948639, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2298717424273491, + "step": 7134 + }, + { + "epoch": 0.14272, + "grad_norm": 2.03125, + "grad_norm_var": 0.033503214518229164, + "learning_rate": 0.0001, + "loss": 4.2665, + "loss/crossentropy": 1.9794283509254456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2112378552556038, + "step": 7136 + }, + { + "epoch": 0.14276, + "grad_norm": 2.5, + "grad_norm_var": 0.0382232666015625, + "learning_rate": 0.0001, + "loss": 4.3801, + "loss/crossentropy": 2.1011139154434204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27278490364551544, + "step": 7138 + }, + { + "epoch": 0.1428, + "grad_norm": 2.234375, + "grad_norm_var": 0.02340087890625, + "learning_rate": 0.0001, + "loss": 4.6989, + "loss/crossentropy": 2.3489880561828613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24445360153913498, + "step": 7140 + }, + { + "epoch": 0.14284, + "grad_norm": 2.28125, + "grad_norm_var": 0.0173736572265625, + "learning_rate": 0.0001, + "loss": 4.3418, + "loss/crossentropy": 2.011172831058502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.210151307284832, + "step": 7142 + }, + { + "epoch": 0.14288, + "grad_norm": 2.25, + "grad_norm_var": 0.017366536458333335, + "learning_rate": 0.0001, + "loss": 4.3488, + "loss/crossentropy": 1.963642418384552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22939135879278183, + "step": 7144 + }, + { + "epoch": 0.14292, + "grad_norm": 2.21875, + "grad_norm_var": 0.016194661458333332, + "learning_rate": 0.0001, + "loss": 4.5166, + "loss/crossentropy": 2.2739341259002686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23235367238521576, + "step": 7146 + }, + { + "epoch": 0.14296, + "grad_norm": 2.21875, + "grad_norm_var": 0.018973795572916667, + "learning_rate": 0.0001, + "loss": 4.5977, + "loss/crossentropy": 2.282576322555542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24144183099269867, + "step": 7148 + }, + { + "epoch": 0.143, + "grad_norm": 2.078125, + "grad_norm_var": 0.02329279581705729, + "learning_rate": 0.0001, + "loss": 4.141, + "loss/crossentropy": 1.7847901582717896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19184929877519608, + "step": 7150 + }, + { + "epoch": 0.14304, + "grad_norm": 2.15625, + "grad_norm_var": 0.021740468343098958, + "learning_rate": 0.0001, + "loss": 4.3379, + "loss/crossentropy": 2.165170907974243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.223430335521698, + "step": 7152 + }, + { + "epoch": 0.14308, + "grad_norm": 2.15625, + "grad_norm_var": 0.018873850504557293, + "learning_rate": 0.0001, + "loss": 4.3407, + "loss/crossentropy": 2.0395787954330444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21662698686122894, + "step": 7154 + }, + { + "epoch": 0.14312, + "grad_norm": 2.125, + "grad_norm_var": 0.01907323201497396, + "learning_rate": 0.0001, + "loss": 4.4936, + "loss/crossentropy": 2.014316141605377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23184175789356232, + "step": 7156 + }, + { + "epoch": 0.14316, + "grad_norm": 2.296875, + "grad_norm_var": 0.019419097900390626, + "learning_rate": 0.0001, + "loss": 4.5612, + "loss/crossentropy": 2.2581117153167725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24055174738168716, + "step": 7158 + }, + { + "epoch": 0.1432, + "grad_norm": 2.28125, + "grad_norm_var": 0.021022288004557292, + "learning_rate": 0.0001, + "loss": 4.3853, + "loss/crossentropy": 2.0905630588531494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2719188630580902, + "step": 7160 + }, + { + "epoch": 0.14324, + "grad_norm": 2.25, + "grad_norm_var": 0.025233713785807292, + "learning_rate": 0.0001, + "loss": 4.6485, + "loss/crossentropy": 2.414529800415039, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24188701063394547, + "step": 7162 + }, + { + "epoch": 0.14328, + "grad_norm": 2.328125, + "grad_norm_var": 0.02240778605143229, + "learning_rate": 0.0001, + "loss": 4.3605, + "loss/crossentropy": 2.028432607650757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2300800457596779, + "step": 7164 + }, + { + "epoch": 0.14332, + "grad_norm": 2.515625, + "grad_norm_var": 0.03299153645833333, + "learning_rate": 0.0001, + "loss": 4.2947, + "loss/crossentropy": 2.096144199371338, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23311930894851685, + "step": 7166 + }, + { + "epoch": 0.14336, + "grad_norm": 2.25, + "grad_norm_var": 0.029622395833333332, + "learning_rate": 0.0001, + "loss": 4.4375, + "loss/crossentropy": 2.259281277656555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25558799505233765, + "step": 7168 + }, + { + "epoch": 0.1434, + "grad_norm": 2.25, + "grad_norm_var": 0.0219390869140625, + "learning_rate": 0.0001, + "loss": 4.4364, + "loss/crossentropy": 2.0766254663467407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22725434601306915, + "step": 7170 + }, + { + "epoch": 0.14344, + "grad_norm": 2.15625, + "grad_norm_var": 0.024312337239583332, + "learning_rate": 0.0001, + "loss": 4.4695, + "loss/crossentropy": 2.26702618598938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22584324330091476, + "step": 7172 + }, + { + "epoch": 0.14348, + "grad_norm": 2.203125, + "grad_norm_var": 0.027534993489583333, + "learning_rate": 0.0001, + "loss": 4.3417, + "loss/crossentropy": 2.1933096647262573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23658160120248795, + "step": 7174 + }, + { + "epoch": 0.14352, + "grad_norm": 2.1875, + "grad_norm_var": 0.027372233072916665, + "learning_rate": 0.0001, + "loss": 4.351, + "loss/crossentropy": 2.2003660202026367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22349942475557327, + "step": 7176 + }, + { + "epoch": 0.14356, + "grad_norm": 2.28125, + "grad_norm_var": 0.025145467122395834, + "learning_rate": 0.0001, + "loss": 4.684, + "loss/crossentropy": 2.4630067348480225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.253003865480423, + "step": 7178 + }, + { + "epoch": 0.1436, + "grad_norm": 2.109375, + "grad_norm_var": 0.027082316080729165, + "learning_rate": 0.0001, + "loss": 4.6875, + "loss/crossentropy": 2.264480948448181, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27604997158050537, + "step": 7180 + }, + { + "epoch": 0.14364, + "grad_norm": 2.203125, + "grad_norm_var": 0.00712890625, + "learning_rate": 0.0001, + "loss": 4.3064, + "loss/crossentropy": 2.1641955375671387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22587595880031586, + "step": 7182 + }, + { + "epoch": 0.14368, + "grad_norm": 2.296875, + "grad_norm_var": 0.00943603515625, + "learning_rate": 0.0001, + "loss": 4.6137, + "loss/crossentropy": 2.1432350873947144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24324018508195877, + "step": 7184 + }, + { + "epoch": 0.14372, + "grad_norm": 1.9140625, + "grad_norm_var": 0.014611562093098959, + "learning_rate": 0.0001, + "loss": 4.3394, + "loss/crossentropy": 1.7448238134384155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1751260682940483, + "step": 7186 + }, + { + "epoch": 0.14376, + "grad_norm": 2.1875, + "grad_norm_var": 0.015295155843098958, + "learning_rate": 0.0001, + "loss": 4.3602, + "loss/crossentropy": 2.3202184438705444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2314094603061676, + "step": 7188 + }, + { + "epoch": 0.1438, + "grad_norm": 1.984375, + "grad_norm_var": 0.019681549072265624, + "learning_rate": 0.0001, + "loss": 4.187, + "loss/crossentropy": 1.970094919204712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24998464435338974, + "step": 7190 + }, + { + "epoch": 0.14384, + "grad_norm": 2.125, + "grad_norm_var": 0.02075780232747396, + "learning_rate": 0.0001, + "loss": 4.215, + "loss/crossentropy": 2.1331114768981934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2008385732769966, + "step": 7192 + }, + { + "epoch": 0.14388, + "grad_norm": 2.03125, + "grad_norm_var": 0.021345774332682293, + "learning_rate": 0.0001, + "loss": 4.3976, + "loss/crossentropy": 2.1659106016159058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22314336150884628, + "step": 7194 + }, + { + "epoch": 0.14392, + "grad_norm": 2.1875, + "grad_norm_var": 0.018507639567057293, + "learning_rate": 0.0001, + "loss": 4.6324, + "loss/crossentropy": 2.3382883071899414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26393643021583557, + "step": 7196 + }, + { + "epoch": 0.14396, + "grad_norm": 2.34375, + "grad_norm_var": 0.020499420166015626, + "learning_rate": 0.0001, + "loss": 4.7988, + "loss/crossentropy": 2.1325554847717285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2457222416996956, + "step": 7198 + }, + { + "epoch": 0.144, + "grad_norm": 2.078125, + "grad_norm_var": 0.03144709269205729, + "learning_rate": 0.0001, + "loss": 4.3175, + "loss/crossentropy": 1.7927106022834778, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20284093916416168, + "step": 7200 + }, + { + "epoch": 0.14404, + "grad_norm": 2.296875, + "grad_norm_var": 0.03050715128580729, + "learning_rate": 0.0001, + "loss": 4.1676, + "loss/crossentropy": 1.9799031615257263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23475942015647888, + "step": 7202 + }, + { + "epoch": 0.14408, + "grad_norm": 2.234375, + "grad_norm_var": 0.02939020792643229, + "learning_rate": 0.0001, + "loss": 4.2627, + "loss/crossentropy": 2.0590370893478394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2160736471414566, + "step": 7204 + }, + { + "epoch": 0.14412, + "grad_norm": 2.34375, + "grad_norm_var": 0.02800267537434896, + "learning_rate": 0.0001, + "loss": 4.4397, + "loss/crossentropy": 1.9866149425506592, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23413674533367157, + "step": 7206 + }, + { + "epoch": 0.14416, + "grad_norm": 2.5625, + "grad_norm_var": 0.03551610310872396, + "learning_rate": 0.0001, + "loss": 4.526, + "loss/crossentropy": 2.1320748925209045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24469739198684692, + "step": 7208 + }, + { + "epoch": 0.1442, + "grad_norm": 2.375, + "grad_norm_var": 0.034708404541015626, + "learning_rate": 0.0001, + "loss": 4.5834, + "loss/crossentropy": 2.225857973098755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23838083446025848, + "step": 7210 + }, + { + "epoch": 0.14424, + "grad_norm": 2.078125, + "grad_norm_var": 0.03794733683268229, + "learning_rate": 0.0001, + "loss": 3.9952, + "loss/crossentropy": 1.9118528962135315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2064301297068596, + "step": 7212 + }, + { + "epoch": 0.14428, + "grad_norm": 2.21875, + "grad_norm_var": 0.03806940714518229, + "learning_rate": 0.0001, + "loss": 4.182, + "loss/crossentropy": 1.8142234086990356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2172461450099945, + "step": 7214 + }, + { + "epoch": 0.14432, + "grad_norm": 2.203125, + "grad_norm_var": 0.02540868123372396, + "learning_rate": 0.0001, + "loss": 4.4446, + "loss/crossentropy": 1.9308255910873413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2236044555902481, + "step": 7216 + }, + { + "epoch": 0.14436, + "grad_norm": 2.3125, + "grad_norm_var": 0.022391764322916667, + "learning_rate": 0.0001, + "loss": 4.4597, + "loss/crossentropy": 1.9821211695671082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21110422909259796, + "step": 7218 + }, + { + "epoch": 0.1444, + "grad_norm": 2.328125, + "grad_norm_var": 0.023176066080729165, + "learning_rate": 0.0001, + "loss": 4.6732, + "loss/crossentropy": 2.216045379638672, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23433538526296616, + "step": 7220 + }, + { + "epoch": 0.14444, + "grad_norm": 2.5, + "grad_norm_var": 0.024442545572916665, + "learning_rate": 0.0001, + "loss": 4.6089, + "loss/crossentropy": 2.2303662300109863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2390453889966011, + "step": 7222 + }, + { + "epoch": 0.14448, + "grad_norm": 2.078125, + "grad_norm_var": 0.017020670572916667, + "learning_rate": 0.0001, + "loss": 4.0805, + "loss/crossentropy": 2.2152082920074463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22196897864341736, + "step": 7224 + }, + { + "epoch": 0.14452, + "grad_norm": 2.09375, + "grad_norm_var": 0.014860026041666667, + "learning_rate": 0.0001, + "loss": 4.2843, + "loss/crossentropy": 2.134513795375824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21964208781719208, + "step": 7226 + }, + { + "epoch": 0.14456, + "grad_norm": 2.0625, + "grad_norm_var": 0.015262858072916666, + "learning_rate": 0.0001, + "loss": 4.0289, + "loss/crossentropy": 1.6803861260414124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18217483162879944, + "step": 7228 + }, + { + "epoch": 0.1446, + "grad_norm": 2.078125, + "grad_norm_var": 0.0152252197265625, + "learning_rate": 0.0001, + "loss": 4.1353, + "loss/crossentropy": 1.6597792506217957, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20317095518112183, + "step": 7230 + }, + { + "epoch": 0.14464, + "grad_norm": 2.015625, + "grad_norm_var": 0.017704264322916666, + "learning_rate": 0.0001, + "loss": 4.2978, + "loss/crossentropy": 1.776586651802063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21835225820541382, + "step": 7232 + }, + { + "epoch": 0.14468, + "grad_norm": 2.03125, + "grad_norm_var": 0.017154947916666666, + "learning_rate": 0.0001, + "loss": 4.1092, + "loss/crossentropy": 1.7347259521484375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21964067220687866, + "step": 7234 + }, + { + "epoch": 0.14472, + "grad_norm": 2.046875, + "grad_norm_var": 0.015380859375, + "learning_rate": 0.0001, + "loss": 4.2069, + "loss/crossentropy": 1.79097181558609, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21776803582906723, + "step": 7236 + }, + { + "epoch": 0.14476, + "grad_norm": 1.984375, + "grad_norm_var": 0.0069244384765625, + "learning_rate": 0.0001, + "loss": 4.0335, + "loss/crossentropy": 2.051329553127289, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21669812500476837, + "step": 7238 + }, + { + "epoch": 0.1448, + "grad_norm": 2.15625, + "grad_norm_var": 0.0070220947265625, + "learning_rate": 0.0001, + "loss": 4.3206, + "loss/crossentropy": 1.965324580669403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2283879667520523, + "step": 7240 + }, + { + "epoch": 0.14484, + "grad_norm": 2.296875, + "grad_norm_var": 0.010692342122395834, + "learning_rate": 0.0001, + "loss": 4.5952, + "loss/crossentropy": 2.248784363269806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24906039983034134, + "step": 7242 + }, + { + "epoch": 0.14488, + "grad_norm": 2.125, + "grad_norm_var": 0.010529581705729167, + "learning_rate": 0.0001, + "loss": 4.3321, + "loss/crossentropy": 1.9946890473365784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22583268582820892, + "step": 7244 + }, + { + "epoch": 0.14492, + "grad_norm": 2.0625, + "grad_norm_var": 0.010660807291666666, + "learning_rate": 0.0001, + "loss": 4.1436, + "loss/crossentropy": 2.2306413650512695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22068945318460464, + "step": 7246 + }, + { + "epoch": 0.14496, + "grad_norm": 2.1875, + "grad_norm_var": 0.00758056640625, + "learning_rate": 0.0001, + "loss": 4.2159, + "loss/crossentropy": 2.110253095626831, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22045104205608368, + "step": 7248 + }, + { + "epoch": 0.145, + "grad_norm": 2.21875, + "grad_norm_var": 0.008210245768229167, + "learning_rate": 0.0001, + "loss": 4.0693, + "loss/crossentropy": 1.928157925605774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2043258175253868, + "step": 7250 + }, + { + "epoch": 0.14504, + "grad_norm": 2.0, + "grad_norm_var": 0.009110514322916667, + "learning_rate": 0.0001, + "loss": 4.2458, + "loss/crossentropy": 2.2674691677093506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23258862644433975, + "step": 7252 + }, + { + "epoch": 0.14508, + "grad_norm": 2.765625, + "grad_norm_var": 0.03052978515625, + "learning_rate": 0.0001, + "loss": 4.5107, + "loss/crossentropy": 2.2825024127960205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.263367660343647, + "step": 7254 + }, + { + "epoch": 0.14512, + "grad_norm": 2.265625, + "grad_norm_var": 0.040185546875, + "learning_rate": 0.0001, + "loss": 4.2394, + "loss/crossentropy": 2.1546168327331543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23073332011699677, + "step": 7256 + }, + { + "epoch": 0.14516, + "grad_norm": 2.34375, + "grad_norm_var": 0.040816243489583334, + "learning_rate": 0.0001, + "loss": 4.5504, + "loss/crossentropy": 2.0490044951438904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24161820113658905, + "step": 7258 + }, + { + "epoch": 0.1452, + "grad_norm": 2.234375, + "grad_norm_var": 0.04038798014322917, + "learning_rate": 0.0001, + "loss": 4.6468, + "loss/crossentropy": 2.115446150302887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2537624090909958, + "step": 7260 + }, + { + "epoch": 0.14524, + "grad_norm": 2.203125, + "grad_norm_var": 0.22388407389322917, + "learning_rate": 0.0001, + "loss": 4.1457, + "loss/crossentropy": 2.0302165746688843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21026766300201416, + "step": 7262 + }, + { + "epoch": 0.14528, + "grad_norm": 2.171875, + "grad_norm_var": 0.2228179931640625, + "learning_rate": 0.0001, + "loss": 4.4077, + "loss/crossentropy": 2.102527379989624, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2354147508740425, + "step": 7264 + }, + { + "epoch": 0.14532, + "grad_norm": 2.171875, + "grad_norm_var": 0.21614176432291668, + "learning_rate": 0.0001, + "loss": 4.1108, + "loss/crossentropy": 2.0095282793045044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22679834067821503, + "step": 7266 + }, + { + "epoch": 0.14536, + "grad_norm": 2.265625, + "grad_norm_var": 0.20706278483072918, + "learning_rate": 0.0001, + "loss": 4.3849, + "loss/crossentropy": 1.8988603353500366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21238050609827042, + "step": 7268 + }, + { + "epoch": 0.1454, + "grad_norm": 2.203125, + "grad_norm_var": 0.19975484212239583, + "learning_rate": 0.0001, + "loss": 4.6614, + "loss/crossentropy": 2.186660885810852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24453039467334747, + "step": 7270 + }, + { + "epoch": 0.14544, + "grad_norm": 2.0625, + "grad_norm_var": 0.20244852701822916, + "learning_rate": 0.0001, + "loss": 4.1409, + "loss/crossentropy": 1.8927155137062073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2147793024778366, + "step": 7272 + }, + { + "epoch": 0.14548, + "grad_norm": 2.046875, + "grad_norm_var": 0.20608317057291667, + "learning_rate": 0.0001, + "loss": 4.1375, + "loss/crossentropy": 1.8969642519950867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21910040825605392, + "step": 7274 + }, + { + "epoch": 0.14552, + "grad_norm": 2.21875, + "grad_norm_var": 0.20545247395833333, + "learning_rate": 0.0001, + "loss": 4.3325, + "loss/crossentropy": 2.090053617954254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22576116025447845, + "step": 7276 + }, + { + "epoch": 0.14556, + "grad_norm": 2.171875, + "grad_norm_var": 0.010480753580729167, + "learning_rate": 0.0001, + "loss": 4.7263, + "loss/crossentropy": 2.1606650352478027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22937766462564468, + "step": 7278 + }, + { + "epoch": 0.1456, + "grad_norm": 2.015625, + "grad_norm_var": 0.013451131184895833, + "learning_rate": 0.0001, + "loss": 4.0924, + "loss/crossentropy": 1.9946333765983582, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20145532488822937, + "step": 7280 + }, + { + "epoch": 0.14564, + "grad_norm": 2.296875, + "grad_norm_var": 0.013834635416666666, + "learning_rate": 0.0001, + "loss": 4.6519, + "loss/crossentropy": 2.0958545207977295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22669509798288345, + "step": 7282 + }, + { + "epoch": 0.14568, + "grad_norm": 2.328125, + "grad_norm_var": 0.016109212239583334, + "learning_rate": 0.0001, + "loss": 4.4224, + "loss/crossentropy": 2.0515894889831543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2520884945988655, + "step": 7284 + }, + { + "epoch": 0.14572, + "grad_norm": 2.171875, + "grad_norm_var": 0.016063435872395834, + "learning_rate": 0.0001, + "loss": 4.1572, + "loss/crossentropy": 2.034587264060974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21523287147283554, + "step": 7286 + }, + { + "epoch": 0.14576, + "grad_norm": 2.140625, + "grad_norm_var": 0.016080729166666665, + "learning_rate": 0.0001, + "loss": 3.8649, + "loss/crossentropy": 1.6578314900398254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19603776931762695, + "step": 7288 + }, + { + "epoch": 0.1458, + "grad_norm": 2.109375, + "grad_norm_var": 0.014167277018229167, + "learning_rate": 0.0001, + "loss": 4.2443, + "loss/crossentropy": 2.0019100308418274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22880420833826065, + "step": 7290 + }, + { + "epoch": 0.14584, + "grad_norm": 2.046875, + "grad_norm_var": 0.018723297119140624, + "learning_rate": 0.0001, + "loss": 3.981, + "loss/crossentropy": 2.068517565727234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19675085693597794, + "step": 7292 + }, + { + "epoch": 0.14588, + "grad_norm": 2.203125, + "grad_norm_var": 0.013734690348307292, + "learning_rate": 0.0001, + "loss": 4.4037, + "loss/crossentropy": 2.000797212123871, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20464950054883957, + "step": 7294 + }, + { + "epoch": 0.14592, + "grad_norm": 2.125, + "grad_norm_var": 0.013734690348307292, + "learning_rate": 0.0001, + "loss": 4.4132, + "loss/crossentropy": 2.1914668679237366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2406519278883934, + "step": 7296 + }, + { + "epoch": 0.14596, + "grad_norm": 2.390625, + "grad_norm_var": 0.01685358683268229, + "learning_rate": 0.0001, + "loss": 4.4737, + "loss/crossentropy": 2.123211979866028, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.250032439827919, + "step": 7298 + }, + { + "epoch": 0.146, + "grad_norm": 2.625, + "grad_norm_var": 0.028507232666015625, + "learning_rate": 0.0001, + "loss": 4.718, + "loss/crossentropy": 2.0686148405075073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22581970691680908, + "step": 7300 + }, + { + "epoch": 0.14604, + "grad_norm": 2.25, + "grad_norm_var": 0.029288482666015626, + "learning_rate": 0.0001, + "loss": 4.2295, + "loss/crossentropy": 2.141907751560211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21503636240959167, + "step": 7302 + }, + { + "epoch": 0.14608, + "grad_norm": 2.09375, + "grad_norm_var": 0.028436024983723957, + "learning_rate": 0.0001, + "loss": 4.1493, + "loss/crossentropy": 1.6741206645965576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20490698516368866, + "step": 7304 + }, + { + "epoch": 0.14612, + "grad_norm": 2.09375, + "grad_norm_var": 0.027854156494140626, + "learning_rate": 0.0001, + "loss": 4.1259, + "loss/crossentropy": 1.8094561696052551, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21328043192625046, + "step": 7306 + }, + { + "epoch": 0.14616, + "grad_norm": 2.109375, + "grad_norm_var": 0.020970662434895832, + "learning_rate": 0.0001, + "loss": 4.3715, + "loss/crossentropy": 2.204083800315857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24204879999160767, + "step": 7308 + }, + { + "epoch": 0.1462, + "grad_norm": 2.234375, + "grad_norm_var": 0.021126302083333333, + "learning_rate": 0.0001, + "loss": 4.2614, + "loss/crossentropy": 2.0166819095611572, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2219880372285843, + "step": 7310 + }, + { + "epoch": 0.14624, + "grad_norm": 2.25, + "grad_norm_var": 0.022684733072916668, + "learning_rate": 0.0001, + "loss": 4.3768, + "loss/crossentropy": 2.4667757749557495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2880199924111366, + "step": 7312 + }, + { + "epoch": 0.14628, + "grad_norm": 2.1875, + "grad_norm_var": 0.019287109375, + "learning_rate": 0.0001, + "loss": 4.2005, + "loss/crossentropy": 2.1497310400009155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21792854368686676, + "step": 7314 + }, + { + "epoch": 0.14632, + "grad_norm": 2.25, + "grad_norm_var": 0.0062896728515625, + "learning_rate": 0.0001, + "loss": 4.3911, + "loss/crossentropy": 2.179584264755249, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23200812935829163, + "step": 7316 + }, + { + "epoch": 0.14636, + "grad_norm": 6.0625, + "grad_norm_var": 0.9582590738932292, + "learning_rate": 0.0001, + "loss": 4.1545, + "loss/crossentropy": 1.4067250490188599, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.204126738011837, + "step": 7318 + }, + { + "epoch": 0.1464, + "grad_norm": 2.390625, + "grad_norm_var": 0.9473592122395833, + "learning_rate": 0.0001, + "loss": 4.2439, + "loss/crossentropy": 1.9879435896873474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.247541606426239, + "step": 7320 + }, + { + "epoch": 0.14644, + "grad_norm": 2.28125, + "grad_norm_var": 0.9377919514973958, + "learning_rate": 0.0001, + "loss": 4.1795, + "loss/crossentropy": 1.62649005651474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2054424211382866, + "step": 7322 + }, + { + "epoch": 0.14648, + "grad_norm": 2.15625, + "grad_norm_var": 0.9344228108723959, + "learning_rate": 0.0001, + "loss": 4.1479, + "loss/crossentropy": 1.9012435674667358, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2240126132965088, + "step": 7324 + }, + { + "epoch": 0.14652, + "grad_norm": 2.03125, + "grad_norm_var": 0.948193359375, + "learning_rate": 0.0001, + "loss": 3.6841, + "loss/crossentropy": 1.8239200115203857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19691675901412964, + "step": 7326 + }, + { + "epoch": 0.14656, + "grad_norm": 2.171875, + "grad_norm_var": 0.94068603515625, + "learning_rate": 0.0001, + "loss": 4.2223, + "loss/crossentropy": 1.9689037799835205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21301231533288956, + "step": 7328 + }, + { + "epoch": 0.1466, + "grad_norm": 2.09375, + "grad_norm_var": 0.9363433837890625, + "learning_rate": 0.0001, + "loss": 4.3733, + "loss/crossentropy": 2.1064560413360596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23990632593631744, + "step": 7330 + }, + { + "epoch": 0.14664, + "grad_norm": 2.109375, + "grad_norm_var": 0.9397125244140625, + "learning_rate": 0.0001, + "loss": 4.2765, + "loss/crossentropy": 2.0871987342834473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23108436167240143, + "step": 7332 + }, + { + "epoch": 0.14668, + "grad_norm": 2.265625, + "grad_norm_var": 0.0209869384765625, + "learning_rate": 0.0001, + "loss": 4.4517, + "loss/crossentropy": 2.3061007857322693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24406791478395462, + "step": 7334 + }, + { + "epoch": 0.14672, + "grad_norm": 2.296875, + "grad_norm_var": 0.016943359375, + "learning_rate": 0.0001, + "loss": 4.4278, + "loss/crossentropy": 1.9471244812011719, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20859277993440628, + "step": 7336 + }, + { + "epoch": 0.14676, + "grad_norm": 2.28125, + "grad_norm_var": 0.0131744384765625, + "learning_rate": 0.0001, + "loss": 4.188, + "loss/crossentropy": 2.0628740191459656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25501881539821625, + "step": 7338 + }, + { + "epoch": 0.1468, + "grad_norm": 1.9921875, + "grad_norm_var": 0.014481353759765624, + "learning_rate": 0.0001, + "loss": 4.4329, + "loss/crossentropy": 1.8065250515937805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20222238451242447, + "step": 7340 + }, + { + "epoch": 0.14684, + "grad_norm": 2.109375, + "grad_norm_var": 0.012910715738932292, + "learning_rate": 0.0001, + "loss": 4.5431, + "loss/crossentropy": 2.134244918823242, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22319403290748596, + "step": 7342 + }, + { + "epoch": 0.14688, + "grad_norm": 2.109375, + "grad_norm_var": 0.012359364827473959, + "learning_rate": 0.0001, + "loss": 4.3098, + "loss/crossentropy": 2.5807924270629883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26865454018116, + "step": 7344 + }, + { + "epoch": 0.14692, + "grad_norm": 2.203125, + "grad_norm_var": 0.015421295166015625, + "learning_rate": 0.0001, + "loss": 4.684, + "loss/crossentropy": 2.4128278493881226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25771988928318024, + "step": 7346 + }, + { + "epoch": 0.14696, + "grad_norm": 2.046875, + "grad_norm_var": 0.017561594645182293, + "learning_rate": 0.0001, + "loss": 4.0915, + "loss/crossentropy": 1.7323983907699585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2146565169095993, + "step": 7348 + }, + { + "epoch": 0.147, + "grad_norm": 2.125, + "grad_norm_var": 0.015553538004557292, + "learning_rate": 0.0001, + "loss": 4.5511, + "loss/crossentropy": 2.036192536354065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21692586690187454, + "step": 7350 + }, + { + "epoch": 0.14704, + "grad_norm": 2.09375, + "grad_norm_var": 0.012320709228515626, + "learning_rate": 0.0001, + "loss": 4.4173, + "loss/crossentropy": 1.9586528539657593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21543999761343002, + "step": 7352 + }, + { + "epoch": 0.14708, + "grad_norm": 2.234375, + "grad_norm_var": 0.011805979410807292, + "learning_rate": 0.0001, + "loss": 4.4506, + "loss/crossentropy": 2.3444113731384277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24080512672662735, + "step": 7354 + }, + { + "epoch": 0.14712, + "grad_norm": 2.15625, + "grad_norm_var": 0.0097076416015625, + "learning_rate": 0.0001, + "loss": 4.4865, + "loss/crossentropy": 2.3060439825057983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24567800760269165, + "step": 7356 + }, + { + "epoch": 0.14716, + "grad_norm": 2.359375, + "grad_norm_var": 0.01279296875, + "learning_rate": 0.0001, + "loss": 4.5975, + "loss/crossentropy": 2.2267106771469116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2345375493168831, + "step": 7358 + }, + { + "epoch": 0.1472, + "grad_norm": 2.359375, + "grad_norm_var": 0.014481608072916667, + "learning_rate": 0.0001, + "loss": 4.7052, + "loss/crossentropy": 2.2470518350601196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2417411357164383, + "step": 7360 + }, + { + "epoch": 0.14724, + "grad_norm": 1.9765625, + "grad_norm_var": 0.012953440348307291, + "learning_rate": 0.0001, + "loss": 4.3299, + "loss/crossentropy": 2.200543165206909, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24434109032154083, + "step": 7362 + }, + { + "epoch": 0.14728, + "grad_norm": 2.203125, + "grad_norm_var": 0.010227203369140625, + "learning_rate": 0.0001, + "loss": 4.3282, + "loss/crossentropy": 1.995844304561615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22591491788625717, + "step": 7364 + }, + { + "epoch": 0.14732, + "grad_norm": 2.046875, + "grad_norm_var": 0.011580149332682291, + "learning_rate": 0.0001, + "loss": 4.3354, + "loss/crossentropy": 1.9180658459663391, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22776676714420319, + "step": 7366 + }, + { + "epoch": 0.14736, + "grad_norm": 2.078125, + "grad_norm_var": 0.011840565999348959, + "learning_rate": 0.0001, + "loss": 4.3662, + "loss/crossentropy": 2.473931312561035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26284685730934143, + "step": 7368 + }, + { + "epoch": 0.1474, + "grad_norm": 2.109375, + "grad_norm_var": 0.011744944254557292, + "learning_rate": 0.0001, + "loss": 4.3167, + "loss/crossentropy": 2.0392738580703735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22323766350746155, + "step": 7370 + }, + { + "epoch": 0.14744, + "grad_norm": 2.203125, + "grad_norm_var": 0.012094879150390625, + "learning_rate": 0.0001, + "loss": 4.3032, + "loss/crossentropy": 1.9847410917282104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22324485331773758, + "step": 7372 + }, + { + "epoch": 0.14748, + "grad_norm": 2.171875, + "grad_norm_var": 0.008459218343098958, + "learning_rate": 0.0001, + "loss": 4.2794, + "loss/crossentropy": 1.930326521396637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24155349284410477, + "step": 7374 + }, + { + "epoch": 0.14752, + "grad_norm": 2.09375, + "grad_norm_var": 0.0061724344889322914, + "learning_rate": 0.0001, + "loss": 4.0648, + "loss/crossentropy": 1.825449824333191, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2087552770972252, + "step": 7376 + }, + { + "epoch": 0.14756, + "grad_norm": 2.15625, + "grad_norm_var": 0.004325358072916666, + "learning_rate": 0.0001, + "loss": 4.2405, + "loss/crossentropy": 2.156645655632019, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2519562169909477, + "step": 7378 + }, + { + "epoch": 0.1476, + "grad_norm": 2.40625, + "grad_norm_var": 0.008333333333333333, + "learning_rate": 0.0001, + "loss": 4.5316, + "loss/crossentropy": 2.255813479423523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23737946152687073, + "step": 7380 + }, + { + "epoch": 0.14764, + "grad_norm": 2.109375, + "grad_norm_var": 0.00797119140625, + "learning_rate": 0.0001, + "loss": 4.5795, + "loss/crossentropy": 2.47933566570282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23890959471464157, + "step": 7382 + }, + { + "epoch": 0.14768, + "grad_norm": 2.09375, + "grad_norm_var": 0.007710774739583333, + "learning_rate": 0.0001, + "loss": 4.5328, + "loss/crossentropy": 2.139566659927368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22334300726652145, + "step": 7384 + }, + { + "epoch": 0.14772, + "grad_norm": 2.125, + "grad_norm_var": 0.007673136393229167, + "learning_rate": 0.0001, + "loss": 4.1581, + "loss/crossentropy": 1.6182149052619934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19345169514417648, + "step": 7386 + }, + { + "epoch": 0.14776, + "grad_norm": 2.40625, + "grad_norm_var": 0.010758463541666667, + "learning_rate": 0.0001, + "loss": 4.4571, + "loss/crossentropy": 1.9984004497528076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24823245406150818, + "step": 7388 + }, + { + "epoch": 0.1478, + "grad_norm": 2.171875, + "grad_norm_var": 0.010445149739583333, + "learning_rate": 0.0001, + "loss": 4.3218, + "loss/crossentropy": 2.5892586708068848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2489030361175537, + "step": 7390 + }, + { + "epoch": 0.14784, + "grad_norm": 2.140625, + "grad_norm_var": 0.009598795572916667, + "learning_rate": 0.0001, + "loss": 4.6077, + "loss/crossentropy": 2.3723479509353638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23511765897274017, + "step": 7392 + }, + { + "epoch": 0.14788, + "grad_norm": 2.484375, + "grad_norm_var": 0.0145660400390625, + "learning_rate": 0.0001, + "loss": 4.3902, + "loss/crossentropy": 2.001940071582794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22256013005971909, + "step": 7394 + }, + { + "epoch": 0.14792, + "grad_norm": 2.0625, + "grad_norm_var": 0.013850911458333334, + "learning_rate": 0.0001, + "loss": 4.4422, + "loss/crossentropy": 2.2255555391311646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23905682563781738, + "step": 7396 + }, + { + "epoch": 0.14796, + "grad_norm": 2.28125, + "grad_norm_var": 0.013277180989583333, + "learning_rate": 0.0001, + "loss": 4.6217, + "loss/crossentropy": 2.5670583248138428, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25223178416490555, + "step": 7398 + }, + { + "epoch": 0.148, + "grad_norm": 2.203125, + "grad_norm_var": 2.3002278645833334, + "learning_rate": 0.0001, + "loss": 4.6365, + "loss/crossentropy": 2.16109037399292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2601509317755699, + "step": 7400 + }, + { + "epoch": 0.14804, + "grad_norm": 2.09375, + "grad_norm_var": 2.3012847900390625, + "learning_rate": 0.0001, + "loss": 4.4169, + "loss/crossentropy": 2.1484315395355225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21474219858646393, + "step": 7402 + }, + { + "epoch": 0.14808, + "grad_norm": 4.3125, + "grad_norm_var": 2.48385009765625, + "learning_rate": 0.0001, + "loss": 4.7452, + "loss/crossentropy": 2.183099091053009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25882233679294586, + "step": 7404 + }, + { + "epoch": 0.14812, + "grad_norm": 2.0625, + "grad_norm_var": 2.491097005208333, + "learning_rate": 0.0001, + "loss": 3.9739, + "loss/crossentropy": 1.795831561088562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20870305597782135, + "step": 7406 + }, + { + "epoch": 0.14816, + "grad_norm": 2.09375, + "grad_norm_var": 2.5058553059895834, + "learning_rate": 0.0001, + "loss": 4.3433, + "loss/crossentropy": 2.202280640602112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2150556966662407, + "step": 7408 + }, + { + "epoch": 0.1482, + "grad_norm": 2.03125, + "grad_norm_var": 2.5274251302083335, + "learning_rate": 0.0001, + "loss": 4.3063, + "loss/crossentropy": 2.2616937160491943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24934212118387222, + "step": 7410 + }, + { + "epoch": 0.14824, + "grad_norm": 2.171875, + "grad_norm_var": 2.51627197265625, + "learning_rate": 0.0001, + "loss": 4.3303, + "loss/crossentropy": 1.909091055393219, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23191271722316742, + "step": 7412 + }, + { + "epoch": 0.14828, + "grad_norm": 2.09375, + "grad_norm_var": 2.5269765218098956, + "learning_rate": 0.0001, + "loss": 4.4224, + "loss/crossentropy": 2.1383588314056396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22893162816762924, + "step": 7414 + }, + { + "epoch": 0.14832, + "grad_norm": 2.234375, + "grad_norm_var": 0.2983062744140625, + "learning_rate": 0.0001, + "loss": 4.3664, + "loss/crossentropy": 2.0628907680511475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23075110465288162, + "step": 7416 + }, + { + "epoch": 0.14836, + "grad_norm": 2.15625, + "grad_norm_var": 0.29704488118489586, + "learning_rate": 0.0001, + "loss": 4.2109, + "loss/crossentropy": 1.8996745347976685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21171879768371582, + "step": 7418 + }, + { + "epoch": 0.1484, + "grad_norm": 2.046875, + "grad_norm_var": 0.0032867431640625, + "learning_rate": 0.0001, + "loss": 4.4241, + "loss/crossentropy": 2.2367645502090454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2563689202070236, + "step": 7420 + }, + { + "epoch": 0.14844, + "grad_norm": 2.265625, + "grad_norm_var": 0.003934733072916667, + "learning_rate": 0.0001, + "loss": 4.2326, + "loss/crossentropy": 2.274489164352417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2462785318493843, + "step": 7422 + }, + { + "epoch": 0.14848, + "grad_norm": 2.328125, + "grad_norm_var": 0.00562744140625, + "learning_rate": 0.0001, + "loss": 4.5288, + "loss/crossentropy": 2.227620482444763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24729500710964203, + "step": 7424 + }, + { + "epoch": 0.14852, + "grad_norm": 2.0, + "grad_norm_var": 0.006864420572916667, + "learning_rate": 0.0001, + "loss": 4.0913, + "loss/crossentropy": 1.9911785125732422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.216292105615139, + "step": 7426 + }, + { + "epoch": 0.14856, + "grad_norm": 2.15625, + "grad_norm_var": 0.008199055989583334, + "learning_rate": 0.0001, + "loss": 4.4071, + "loss/crossentropy": 1.8715736865997314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2289058193564415, + "step": 7428 + }, + { + "epoch": 0.1486, + "grad_norm": 2.125, + "grad_norm_var": 0.007991536458333334, + "learning_rate": 0.0001, + "loss": 4.4671, + "loss/crossentropy": 2.0192378759384155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2327374964952469, + "step": 7430 + }, + { + "epoch": 0.14864, + "grad_norm": 2.046875, + "grad_norm_var": 0.008226521809895833, + "learning_rate": 0.0001, + "loss": 4.213, + "loss/crossentropy": 1.9407023191452026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22586089372634888, + "step": 7432 + }, + { + "epoch": 0.14868, + "grad_norm": 2.203125, + "grad_norm_var": 0.01011962890625, + "learning_rate": 0.0001, + "loss": 4.7816, + "loss/crossentropy": 2.6349592208862305, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2605717331171036, + "step": 7434 + }, + { + "epoch": 0.14872, + "grad_norm": 2.28125, + "grad_norm_var": 0.009989420572916666, + "learning_rate": 0.0001, + "loss": 4.5511, + "loss/crossentropy": 2.3015077114105225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2292959839105606, + "step": 7436 + }, + { + "epoch": 0.14876, + "grad_norm": 2.1875, + "grad_norm_var": 0.013231404622395833, + "learning_rate": 0.0001, + "loss": 4.1663, + "loss/crossentropy": 2.081148624420166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21435046195983887, + "step": 7438 + }, + { + "epoch": 0.1488, + "grad_norm": 2.109375, + "grad_norm_var": 0.011735026041666667, + "learning_rate": 0.0001, + "loss": 4.4342, + "loss/crossentropy": 2.235422372817993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23794984817504883, + "step": 7440 + }, + { + "epoch": 0.14884, + "grad_norm": 2.171875, + "grad_norm_var": 0.011295572916666666, + "learning_rate": 0.0001, + "loss": 4.4595, + "loss/crossentropy": 2.1423263549804688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2200494110584259, + "step": 7442 + }, + { + "epoch": 0.14888, + "grad_norm": 2.421875, + "grad_norm_var": 0.018342081705729166, + "learning_rate": 0.0001, + "loss": 4.514, + "loss/crossentropy": 2.3156672716140747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25282321125268936, + "step": 7444 + }, + { + "epoch": 0.14892, + "grad_norm": 2.15625, + "grad_norm_var": 0.020051066080729166, + "learning_rate": 0.0001, + "loss": 4.3107, + "loss/crossentropy": 1.7777396440505981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2104710191488266, + "step": 7446 + }, + { + "epoch": 0.14896, + "grad_norm": 2.09375, + "grad_norm_var": 0.023341623942057292, + "learning_rate": 0.0001, + "loss": 3.874, + "loss/crossentropy": 1.8015541434288025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1976182758808136, + "step": 7448 + }, + { + "epoch": 0.149, + "grad_norm": 2.578125, + "grad_norm_var": 0.04948298136393229, + "learning_rate": 0.0001, + "loss": 4.5647, + "loss/crossentropy": 2.36995792388916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23767317831516266, + "step": 7450 + }, + { + "epoch": 0.14904, + "grad_norm": 2.1875, + "grad_norm_var": 0.050142161051432294, + "learning_rate": 0.0001, + "loss": 4.7568, + "loss/crossentropy": 2.2867971062660217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2666979879140854, + "step": 7452 + }, + { + "epoch": 0.14908, + "grad_norm": 2.328125, + "grad_norm_var": 0.043794504801432294, + "learning_rate": 0.0001, + "loss": 4.773, + "loss/crossentropy": 2.247913956642151, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2122594192624092, + "step": 7454 + }, + { + "epoch": 0.14912, + "grad_norm": 2.109375, + "grad_norm_var": 0.04197362263997396, + "learning_rate": 0.0001, + "loss": 4.273, + "loss/crossentropy": 2.0020886063575745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22448039799928665, + "step": 7456 + }, + { + "epoch": 0.14916, + "grad_norm": 2.15625, + "grad_norm_var": 0.043702952067057294, + "learning_rate": 0.0001, + "loss": 4.3139, + "loss/crossentropy": 2.2290679216384888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21489766240119934, + "step": 7458 + }, + { + "epoch": 0.1492, + "grad_norm": 2.609375, + "grad_norm_var": 0.049344635009765624, + "learning_rate": 0.0001, + "loss": 4.5919, + "loss/crossentropy": 2.0447877049446106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22640856355428696, + "step": 7460 + }, + { + "epoch": 0.14924, + "grad_norm": 2.3125, + "grad_norm_var": 0.045904286702473956, + "learning_rate": 0.0001, + "loss": 4.4614, + "loss/crossentropy": 2.395468831062317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2395479902625084, + "step": 7462 + }, + { + "epoch": 0.14928, + "grad_norm": 2.0625, + "grad_norm_var": 0.04019775390625, + "learning_rate": 0.0001, + "loss": 4.1552, + "loss/crossentropy": 2.117182433605194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22302603721618652, + "step": 7464 + }, + { + "epoch": 0.14932, + "grad_norm": 2.125, + "grad_norm_var": 0.019498697916666665, + "learning_rate": 0.0001, + "loss": 4.4046, + "loss/crossentropy": 2.206045985221863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2548774778842926, + "step": 7466 + }, + { + "epoch": 0.14936, + "grad_norm": 2.046875, + "grad_norm_var": 0.0194244384765625, + "learning_rate": 0.0001, + "loss": 4.1531, + "loss/crossentropy": 1.9933450818061829, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22455794364213943, + "step": 7468 + }, + { + "epoch": 0.1494, + "grad_norm": 2.125, + "grad_norm_var": 0.017731730143229166, + "learning_rate": 0.0001, + "loss": 4.2744, + "loss/crossentropy": 2.216577649116516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24432705342769623, + "step": 7470 + }, + { + "epoch": 0.14944, + "grad_norm": 2.25, + "grad_norm_var": 0.018163045247395832, + "learning_rate": 0.0001, + "loss": 4.6072, + "loss/crossentropy": 2.4282405376434326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2599884122610092, + "step": 7472 + }, + { + "epoch": 0.14948, + "grad_norm": 2.140625, + "grad_norm_var": 0.018089803059895833, + "learning_rate": 0.0001, + "loss": 4.2833, + "loss/crossentropy": 2.2677053213119507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24509359151124954, + "step": 7474 + }, + { + "epoch": 0.14952, + "grad_norm": 2.046875, + "grad_norm_var": 0.007877604166666666, + "learning_rate": 0.0001, + "loss": 4.1271, + "loss/crossentropy": 1.9610475897789001, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19867657870054245, + "step": 7476 + }, + { + "epoch": 0.14956, + "grad_norm": 2.4375, + "grad_norm_var": 0.012565104166666667, + "learning_rate": 0.0001, + "loss": 4.6286, + "loss/crossentropy": 2.1379209756851196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24773475527763367, + "step": 7478 + }, + { + "epoch": 0.1496, + "grad_norm": 2.0625, + "grad_norm_var": 0.0125152587890625, + "learning_rate": 0.0001, + "loss": 4.3593, + "loss/crossentropy": 2.295411467552185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2417762354016304, + "step": 7480 + }, + { + "epoch": 0.14964, + "grad_norm": 2.28125, + "grad_norm_var": 0.013374837239583333, + "learning_rate": 0.0001, + "loss": 4.4839, + "loss/crossentropy": 2.0983279943466187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22220823168754578, + "step": 7482 + }, + { + "epoch": 0.14968, + "grad_norm": 2.34375, + "grad_norm_var": 0.015086873372395834, + "learning_rate": 0.0001, + "loss": 4.3977, + "loss/crossentropy": 2.130508065223694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23387905955314636, + "step": 7484 + }, + { + "epoch": 0.14972, + "grad_norm": 2.265625, + "grad_norm_var": 0.015869140625, + "learning_rate": 0.0001, + "loss": 4.545, + "loss/crossentropy": 2.0727924704551697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23198049515485764, + "step": 7486 + }, + { + "epoch": 0.14976, + "grad_norm": 2.28125, + "grad_norm_var": 0.015523274739583334, + "learning_rate": 0.0001, + "loss": 4.3722, + "loss/crossentropy": 2.1407171487808228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23349495232105255, + "step": 7488 + }, + { + "epoch": 0.1498, + "grad_norm": 2.0625, + "grad_norm_var": 0.016063435872395834, + "learning_rate": 0.0001, + "loss": 4.3483, + "loss/crossentropy": 2.123879909515381, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2230711579322815, + "step": 7490 + }, + { + "epoch": 0.14984, + "grad_norm": 1.8984375, + "grad_norm_var": 0.02194188435872396, + "learning_rate": 0.0001, + "loss": 3.8687, + "loss/crossentropy": 2.1362847685813904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22217128425836563, + "step": 7492 + }, + { + "epoch": 0.14988, + "grad_norm": 2.234375, + "grad_norm_var": 0.01625544230143229, + "learning_rate": 0.0001, + "loss": 4.4753, + "loss/crossentropy": 2.2171897292137146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.237472265958786, + "step": 7494 + }, + { + "epoch": 0.14992, + "grad_norm": 2.203125, + "grad_norm_var": 0.016841379801432292, + "learning_rate": 0.0001, + "loss": 4.2439, + "loss/crossentropy": 1.9622138142585754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19700734317302704, + "step": 7496 + }, + { + "epoch": 0.14996, + "grad_norm": 2.078125, + "grad_norm_var": 0.01666234334309896, + "learning_rate": 0.0001, + "loss": 4.1296, + "loss/crossentropy": 2.300232410430908, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26260019838809967, + "step": 7498 + }, + { + "epoch": 0.15, + "grad_norm": 2.21875, + "grad_norm_var": 0.013765207926432292, + "learning_rate": 0.0001, + "loss": 4.4466, + "loss/crossentropy": 1.9829946756362915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21770135313272476, + "step": 7500 + }, + { + "epoch": 0.15004, + "grad_norm": 2.109375, + "grad_norm_var": 0.010935211181640625, + "learning_rate": 0.0001, + "loss": 4.017, + "loss/crossentropy": 1.8421878218650818, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20965785533189774, + "step": 7502 + }, + { + "epoch": 0.15008, + "grad_norm": 2.203125, + "grad_norm_var": 0.010267893473307291, + "learning_rate": 0.0001, + "loss": 4.3111, + "loss/crossentropy": 2.0559566020965576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21752064675092697, + "step": 7504 + }, + { + "epoch": 0.15012, + "grad_norm": 2.25, + "grad_norm_var": 0.009834543863932291, + "learning_rate": 0.0001, + "loss": 4.5526, + "loss/crossentropy": 1.862777590751648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2026660442352295, + "step": 7506 + }, + { + "epoch": 0.15016, + "grad_norm": 2.046875, + "grad_norm_var": 0.0064198811848958336, + "learning_rate": 0.0001, + "loss": 4.0238, + "loss/crossentropy": 2.340041399002075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23489895462989807, + "step": 7508 + }, + { + "epoch": 0.1502, + "grad_norm": 2.21875, + "grad_norm_var": 0.006224568684895833, + "learning_rate": 0.0001, + "loss": 4.5434, + "loss/crossentropy": 2.3596150875091553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23775358498096466, + "step": 7510 + }, + { + "epoch": 0.15024, + "grad_norm": 2.015625, + "grad_norm_var": 0.005980428059895833, + "learning_rate": 0.0001, + "loss": 4.4637, + "loss/crossentropy": 1.883503019809723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22191710770130157, + "step": 7512 + }, + { + "epoch": 0.15028, + "grad_norm": 2.09375, + "grad_norm_var": 0.006723785400390625, + "learning_rate": 0.0001, + "loss": 4.1485, + "loss/crossentropy": 2.12862491607666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2177818939089775, + "step": 7514 + }, + { + "epoch": 0.15032, + "grad_norm": 2.078125, + "grad_norm_var": 0.005936431884765625, + "learning_rate": 0.0001, + "loss": 4.167, + "loss/crossentropy": 1.9326343536376953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20991001278162003, + "step": 7516 + }, + { + "epoch": 0.15036, + "grad_norm": 2.046875, + "grad_norm_var": 0.008070627848307291, + "learning_rate": 0.0001, + "loss": 4.4552, + "loss/crossentropy": 2.1532761454582214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2308126464486122, + "step": 7518 + }, + { + "epoch": 0.1504, + "grad_norm": 2.21875, + "grad_norm_var": 0.007867177327473959, + "learning_rate": 0.0001, + "loss": 4.42, + "loss/crossentropy": 1.876515507698059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1940685734152794, + "step": 7520 + }, + { + "epoch": 0.15044, + "grad_norm": 2.28125, + "grad_norm_var": 0.008937327067057292, + "learning_rate": 0.0001, + "loss": 4.3515, + "loss/crossentropy": 2.3677611351013184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24787116795778275, + "step": 7522 + }, + { + "epoch": 0.15048, + "grad_norm": 2.296875, + "grad_norm_var": 0.010892486572265625, + "learning_rate": 0.0001, + "loss": 4.0489, + "loss/crossentropy": 1.8500076532363892, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19271107017993927, + "step": 7524 + }, + { + "epoch": 0.15052, + "grad_norm": 2.171875, + "grad_norm_var": 0.010432688395182292, + "learning_rate": 0.0001, + "loss": 4.3298, + "loss/crossentropy": 1.8013980984687805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2181655913591385, + "step": 7526 + }, + { + "epoch": 0.15056, + "grad_norm": 2.09375, + "grad_norm_var": 0.010361480712890624, + "learning_rate": 0.0001, + "loss": 4.4958, + "loss/crossentropy": 2.6469568014144897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2441466525197029, + "step": 7528 + }, + { + "epoch": 0.1506, + "grad_norm": 2.203125, + "grad_norm_var": 0.009227498372395834, + "learning_rate": 0.0001, + "loss": 4.1317, + "loss/crossentropy": 1.7992960214614868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2209397256374359, + "step": 7530 + }, + { + "epoch": 0.15064, + "grad_norm": 2.625, + "grad_norm_var": 0.023258463541666666, + "learning_rate": 0.0001, + "loss": 4.9501, + "loss/crossentropy": 2.1914783120155334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23764144629240036, + "step": 7532 + }, + { + "epoch": 0.15068, + "grad_norm": 2.140625, + "grad_norm_var": 0.02906494140625, + "learning_rate": 0.0001, + "loss": 4.4833, + "loss/crossentropy": 2.0035120844841003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21483591943979263, + "step": 7534 + }, + { + "epoch": 0.15072, + "grad_norm": 1.921875, + "grad_norm_var": 0.03235677083333333, + "learning_rate": 0.0001, + "loss": 4.2674, + "loss/crossentropy": 2.0218639969825745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20919294655323029, + "step": 7536 + }, + { + "epoch": 0.15076, + "grad_norm": 2.015625, + "grad_norm_var": 0.03219401041666667, + "learning_rate": 0.0001, + "loss": 4.1091, + "loss/crossentropy": 2.006688416004181, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21247616410255432, + "step": 7538 + }, + { + "epoch": 0.1508, + "grad_norm": 2.484375, + "grad_norm_var": 0.03542378743489583, + "learning_rate": 0.0001, + "loss": 4.5424, + "loss/crossentropy": 2.1565613746643066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30109211802482605, + "step": 7540 + }, + { + "epoch": 0.15084, + "grad_norm": 2.359375, + "grad_norm_var": 0.03603413899739583, + "learning_rate": 0.0001, + "loss": 4.3621, + "loss/crossentropy": 1.9676685333251953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23605409264564514, + "step": 7542 + }, + { + "epoch": 0.15088, + "grad_norm": 2.34375, + "grad_norm_var": 0.051493326822916664, + "learning_rate": 0.0001, + "loss": 4.4178, + "loss/crossentropy": 2.1150137186050415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22903620451688766, + "step": 7544 + }, + { + "epoch": 0.15092, + "grad_norm": 2.34375, + "grad_norm_var": 0.051253255208333334, + "learning_rate": 0.0001, + "loss": 4.3186, + "loss/crossentropy": 2.1894554495811462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24362927675247192, + "step": 7546 + }, + { + "epoch": 0.15096, + "grad_norm": 2.21875, + "grad_norm_var": 0.0410308837890625, + "learning_rate": 0.0001, + "loss": 4.4529, + "loss/crossentropy": 1.9624019861221313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22628726810216904, + "step": 7548 + }, + { + "epoch": 0.151, + "grad_norm": 2.0, + "grad_norm_var": 0.04338277180989583, + "learning_rate": 0.0001, + "loss": 4.2795, + "loss/crossentropy": 2.1547625064849854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2295984849333763, + "step": 7550 + }, + { + "epoch": 0.15104, + "grad_norm": 2.125, + "grad_norm_var": 0.03857014973958333, + "learning_rate": 0.0001, + "loss": 4.4638, + "loss/crossentropy": 2.229305863380432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2427791878581047, + "step": 7552 + }, + { + "epoch": 0.15108, + "grad_norm": 2.4375, + "grad_norm_var": 0.037093098958333334, + "learning_rate": 0.0001, + "loss": 4.7319, + "loss/crossentropy": 2.4998362064361572, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23718395829200745, + "step": 7554 + }, + { + "epoch": 0.15112, + "grad_norm": 2.15625, + "grad_norm_var": 0.033722941080729166, + "learning_rate": 0.0001, + "loss": 4.2118, + "loss/crossentropy": 2.319428563117981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26024487614631653, + "step": 7556 + }, + { + "epoch": 0.15116, + "grad_norm": 2.015625, + "grad_norm_var": 0.0357818603515625, + "learning_rate": 0.0001, + "loss": 4.2451, + "loss/crossentropy": 1.861966609954834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2121218591928482, + "step": 7558 + }, + { + "epoch": 0.1512, + "grad_norm": 2.265625, + "grad_norm_var": 0.015751139322916666, + "learning_rate": 0.0001, + "loss": 4.5963, + "loss/crossentropy": 2.1688510179519653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22341617196798325, + "step": 7560 + }, + { + "epoch": 0.15124, + "grad_norm": 2.140625, + "grad_norm_var": 0.013932291666666667, + "learning_rate": 0.0001, + "loss": 4.4808, + "loss/crossentropy": 2.2833406925201416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22648481279611588, + "step": 7562 + }, + { + "epoch": 0.15128, + "grad_norm": 2.203125, + "grad_norm_var": 0.0138671875, + "learning_rate": 0.0001, + "loss": 4.4126, + "loss/crossentropy": 2.224393129348755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23414459079504013, + "step": 7564 + }, + { + "epoch": 0.15132, + "grad_norm": 2.25, + "grad_norm_var": 0.0098297119140625, + "learning_rate": 0.0001, + "loss": 4.253, + "loss/crossentropy": 2.003828763961792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2166791632771492, + "step": 7566 + }, + { + "epoch": 0.15136, + "grad_norm": 2.3125, + "grad_norm_var": 0.01168212890625, + "learning_rate": 0.0001, + "loss": 4.0904, + "loss/crossentropy": 2.1813005208969116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28873007744550705, + "step": 7568 + }, + { + "epoch": 0.1514, + "grad_norm": 1.9921875, + "grad_norm_var": 0.028696441650390626, + "learning_rate": 0.0001, + "loss": 4.3824, + "loss/crossentropy": 2.3818061351776123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24916332960128784, + "step": 7570 + }, + { + "epoch": 0.15144, + "grad_norm": 2.03125, + "grad_norm_var": 0.030460357666015625, + "learning_rate": 0.0001, + "loss": 4.1825, + "loss/crossentropy": 2.041518449783325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22352956235408783, + "step": 7572 + }, + { + "epoch": 0.15148, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0326904296875, + "learning_rate": 0.0001, + "loss": 3.8531, + "loss/crossentropy": 1.856759488582611, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20227209478616714, + "step": 7574 + }, + { + "epoch": 0.15152, + "grad_norm": 2.09375, + "grad_norm_var": 0.03157450358072917, + "learning_rate": 0.0001, + "loss": 4.237, + "loss/crossentropy": 1.9612281918525696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22188737243413925, + "step": 7576 + }, + { + "epoch": 0.15156, + "grad_norm": 2.015625, + "grad_norm_var": 0.0337554931640625, + "learning_rate": 0.0001, + "loss": 4.0405, + "loss/crossentropy": 1.9610649943351746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2282957062125206, + "step": 7578 + }, + { + "epoch": 0.1516, + "grad_norm": 2.078125, + "grad_norm_var": 0.03400472005208333, + "learning_rate": 0.0001, + "loss": 4.1253, + "loss/crossentropy": 1.9239189624786377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22926658391952515, + "step": 7580 + }, + { + "epoch": 0.15164, + "grad_norm": 2.171875, + "grad_norm_var": 0.033299763997395836, + "learning_rate": 0.0001, + "loss": 4.548, + "loss/crossentropy": 2.425737738609314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24260805547237396, + "step": 7582 + }, + { + "epoch": 0.15168, + "grad_norm": 2.28125, + "grad_norm_var": 0.03242085774739583, + "learning_rate": 0.0001, + "loss": 4.2894, + "loss/crossentropy": 1.9602521061897278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23231150209903717, + "step": 7584 + }, + { + "epoch": 0.15172, + "grad_norm": 2.03125, + "grad_norm_var": 0.007954661051432292, + "learning_rate": 0.0001, + "loss": 4.3131, + "loss/crossentropy": 2.3671375513076782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24326395988464355, + "step": 7586 + }, + { + "epoch": 0.15176, + "grad_norm": 2.015625, + "grad_norm_var": 0.007675933837890625, + "learning_rate": 0.0001, + "loss": 4.1297, + "loss/crossentropy": 2.0128119587898254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20082567632198334, + "step": 7588 + }, + { + "epoch": 0.1518, + "grad_norm": 2.265625, + "grad_norm_var": 0.0085845947265625, + "learning_rate": 0.0001, + "loss": 4.5813, + "loss/crossentropy": 2.3719125986099243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.243422269821167, + "step": 7590 + }, + { + "epoch": 0.15184, + "grad_norm": 2.28125, + "grad_norm_var": 0.010575358072916667, + "learning_rate": 0.0001, + "loss": 4.6177, + "loss/crossentropy": 2.088695764541626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23578013479709625, + "step": 7592 + }, + { + "epoch": 0.15188, + "grad_norm": 2.203125, + "grad_norm_var": 0.01099853515625, + "learning_rate": 0.0001, + "loss": 4.3719, + "loss/crossentropy": 2.151872456073761, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2496410757303238, + "step": 7594 + }, + { + "epoch": 0.15192, + "grad_norm": 2.265625, + "grad_norm_var": 0.017281087239583333, + "learning_rate": 0.0001, + "loss": 4.6041, + "loss/crossentropy": 2.020963430404663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25003430247306824, + "step": 7596 + }, + { + "epoch": 0.15196, + "grad_norm": 2.34375, + "grad_norm_var": 0.0199615478515625, + "learning_rate": 0.0001, + "loss": 4.4439, + "loss/crossentropy": 1.9290395379066467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20771078765392303, + "step": 7598 + }, + { + "epoch": 0.152, + "grad_norm": 2.09375, + "grad_norm_var": 0.020536295572916665, + "learning_rate": 0.0001, + "loss": 4.2461, + "loss/crossentropy": 2.0420188307762146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2375432327389717, + "step": 7600 + }, + { + "epoch": 0.15204, + "grad_norm": 2.21875, + "grad_norm_var": 0.019169108072916666, + "learning_rate": 0.0001, + "loss": 4.4291, + "loss/crossentropy": 2.474969744682312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2535504847764969, + "step": 7602 + }, + { + "epoch": 0.15208, + "grad_norm": 2.3125, + "grad_norm_var": 0.015550740559895833, + "learning_rate": 0.0001, + "loss": 4.3487, + "loss/crossentropy": 2.177125334739685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24919381737709045, + "step": 7604 + }, + { + "epoch": 0.15212, + "grad_norm": 2.3125, + "grad_norm_var": 0.015510050455729167, + "learning_rate": 0.0001, + "loss": 4.4719, + "loss/crossentropy": 2.348747491836548, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22336618602275848, + "step": 7606 + }, + { + "epoch": 0.15216, + "grad_norm": 2.28125, + "grad_norm_var": 0.014676920572916667, + "learning_rate": 0.0001, + "loss": 4.4502, + "loss/crossentropy": 1.718321442604065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19830843806266785, + "step": 7608 + }, + { + "epoch": 0.1522, + "grad_norm": 2.171875, + "grad_norm_var": 0.015901692708333335, + "learning_rate": 0.0001, + "loss": 4.4505, + "loss/crossentropy": 2.2954800128936768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23203244805335999, + "step": 7610 + }, + { + "epoch": 0.15224, + "grad_norm": 2.0625, + "grad_norm_var": 0.013570149739583334, + "learning_rate": 0.0001, + "loss": 4.1141, + "loss/crossentropy": 1.6918454766273499, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18312305957078934, + "step": 7612 + }, + { + "epoch": 0.15228, + "grad_norm": 2.015625, + "grad_norm_var": 0.012727864583333333, + "learning_rate": 0.0001, + "loss": 4.1312, + "loss/crossentropy": 1.887774109840393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2151220142841339, + "step": 7614 + }, + { + "epoch": 0.15232, + "grad_norm": 2.125, + "grad_norm_var": 0.01373291015625, + "learning_rate": 0.0001, + "loss": 4.1274, + "loss/crossentropy": 2.0903998613357544, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22486191242933273, + "step": 7616 + }, + { + "epoch": 0.15236, + "grad_norm": 2.0625, + "grad_norm_var": 0.016310373942057293, + "learning_rate": 0.0001, + "loss": 4.0659, + "loss/crossentropy": 1.929358720779419, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20021560043096542, + "step": 7618 + }, + { + "epoch": 0.1524, + "grad_norm": 2.703125, + "grad_norm_var": 0.03484064737955729, + "learning_rate": 0.0001, + "loss": 4.5726, + "loss/crossentropy": 1.9812661409378052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21395482122898102, + "step": 7620 + }, + { + "epoch": 0.15244, + "grad_norm": 2.390625, + "grad_norm_var": 0.03358942667643229, + "learning_rate": 0.0001, + "loss": 4.15, + "loss/crossentropy": 2.148552179336548, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22805052995681763, + "step": 7622 + }, + { + "epoch": 0.15248, + "grad_norm": 2.078125, + "grad_norm_var": 0.03416926066080729, + "learning_rate": 0.0001, + "loss": 4.3471, + "loss/crossentropy": 2.012804687023163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2248745858669281, + "step": 7624 + }, + { + "epoch": 0.15252, + "grad_norm": 2.046875, + "grad_norm_var": 0.03463312784830729, + "learning_rate": 0.0001, + "loss": 4.3948, + "loss/crossentropy": 2.3378156423568726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2457970678806305, + "step": 7626 + }, + { + "epoch": 0.15256, + "grad_norm": 2.296875, + "grad_norm_var": 0.036382802327473956, + "learning_rate": 0.0001, + "loss": 4.5414, + "loss/crossentropy": 2.0815274119377136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22083784639835358, + "step": 7628 + }, + { + "epoch": 0.1526, + "grad_norm": 1.984375, + "grad_norm_var": 0.039589182535807295, + "learning_rate": 0.0001, + "loss": 4.2609, + "loss/crossentropy": 2.172307014465332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25988608598709106, + "step": 7630 + }, + { + "epoch": 0.15264, + "grad_norm": 1.9921875, + "grad_norm_var": 0.03921305338541667, + "learning_rate": 0.0001, + "loss": 4.3711, + "loss/crossentropy": 1.973683476448059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20379862189292908, + "step": 7632 + }, + { + "epoch": 0.15268, + "grad_norm": 1.921875, + "grad_norm_var": 0.039406077067057295, + "learning_rate": 0.0001, + "loss": 4.3394, + "loss/crossentropy": 2.175020456314087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22480874508619308, + "step": 7634 + }, + { + "epoch": 0.15272, + "grad_norm": 2.1875, + "grad_norm_var": 0.022304026285807292, + "learning_rate": 0.0001, + "loss": 4.2176, + "loss/crossentropy": 2.063227415084839, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2324352264404297, + "step": 7636 + }, + { + "epoch": 0.15276, + "grad_norm": 2.296875, + "grad_norm_var": 0.019769032796223957, + "learning_rate": 0.0001, + "loss": 4.4654, + "loss/crossentropy": 1.9297555088996887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2171614021062851, + "step": 7638 + }, + { + "epoch": 0.1528, + "grad_norm": 2.296875, + "grad_norm_var": 0.019421132405598958, + "learning_rate": 0.0001, + "loss": 4.4698, + "loss/crossentropy": 1.9808542132377625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21765583008527756, + "step": 7640 + }, + { + "epoch": 0.15284, + "grad_norm": 2.09375, + "grad_norm_var": 0.018790435791015626, + "learning_rate": 0.0001, + "loss": 4.4145, + "loss/crossentropy": 2.009860336780548, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20963389426469803, + "step": 7642 + }, + { + "epoch": 0.15288, + "grad_norm": 2.015625, + "grad_norm_var": 0.017319488525390624, + "learning_rate": 0.0001, + "loss": 4.2649, + "loss/crossentropy": 1.912703514099121, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23796956986188889, + "step": 7644 + }, + { + "epoch": 0.15292, + "grad_norm": 2.25, + "grad_norm_var": 0.012341054280598958, + "learning_rate": 0.0001, + "loss": 4.4965, + "loss/crossentropy": 2.2945642471313477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2621624022722244, + "step": 7646 + }, + { + "epoch": 0.15296, + "grad_norm": 2.03125, + "grad_norm_var": 0.011067708333333334, + "learning_rate": 0.0001, + "loss": 4.2573, + "loss/crossentropy": 2.185902237892151, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23890018463134766, + "step": 7648 + }, + { + "epoch": 0.153, + "grad_norm": 2.546875, + "grad_norm_var": 0.018648274739583335, + "learning_rate": 0.0001, + "loss": 4.5815, + "loss/crossentropy": 1.8625503778457642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2206135168671608, + "step": 7650 + }, + { + "epoch": 0.15304, + "grad_norm": 1.9765625, + "grad_norm_var": 0.020684560139973957, + "learning_rate": 0.0001, + "loss": 3.9142, + "loss/crossentropy": 1.8815893530845642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21166741847991943, + "step": 7652 + }, + { + "epoch": 0.15308, + "grad_norm": 2.203125, + "grad_norm_var": 0.020979563395182293, + "learning_rate": 0.0001, + "loss": 4.2277, + "loss/crossentropy": 2.156697630882263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22994951903820038, + "step": 7654 + }, + { + "epoch": 0.15312, + "grad_norm": 2.25, + "grad_norm_var": 0.02005182902018229, + "learning_rate": 0.0001, + "loss": 4.366, + "loss/crossentropy": 1.6800576448440552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21905134618282318, + "step": 7656 + }, + { + "epoch": 0.15316, + "grad_norm": 2.046875, + "grad_norm_var": 0.02061945597330729, + "learning_rate": 0.0001, + "loss": 4.1446, + "loss/crossentropy": 1.9215145707130432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2034405767917633, + "step": 7658 + }, + { + "epoch": 0.1532, + "grad_norm": 2.390625, + "grad_norm_var": 0.02380956013997396, + "learning_rate": 0.0001, + "loss": 4.6475, + "loss/crossentropy": 2.153718650341034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25605182349681854, + "step": 7660 + }, + { + "epoch": 0.15324, + "grad_norm": 2.203125, + "grad_norm_var": 0.02466608683268229, + "learning_rate": 0.0001, + "loss": 4.2676, + "loss/crossentropy": 1.8782889246940613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20574549585580826, + "step": 7662 + }, + { + "epoch": 0.15328, + "grad_norm": 2.265625, + "grad_norm_var": 0.031040191650390625, + "learning_rate": 0.0001, + "loss": 4.2171, + "loss/crossentropy": 2.003354489803314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24152260273694992, + "step": 7664 + }, + { + "epoch": 0.15332, + "grad_norm": 2.203125, + "grad_norm_var": 0.022299957275390626, + "learning_rate": 0.0001, + "loss": 4.5828, + "loss/crossentropy": 2.217758059501648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24700726568698883, + "step": 7666 + }, + { + "epoch": 0.15336, + "grad_norm": 2.1875, + "grad_norm_var": 0.017757161458333334, + "learning_rate": 0.0001, + "loss": 4.3418, + "loss/crossentropy": 1.934537410736084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2163502648472786, + "step": 7668 + }, + { + "epoch": 0.1534, + "grad_norm": 2.03125, + "grad_norm_var": 0.020340983072916666, + "learning_rate": 0.0001, + "loss": 4.3, + "loss/crossentropy": 2.007661819458008, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21769292652606964, + "step": 7670 + }, + { + "epoch": 0.15344, + "grad_norm": 2.09375, + "grad_norm_var": 0.020653279622395833, + "learning_rate": 0.0001, + "loss": 4.3082, + "loss/crossentropy": 2.1586949825286865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2453690618276596, + "step": 7672 + }, + { + "epoch": 0.15348, + "grad_norm": 2.109375, + "grad_norm_var": 0.0197174072265625, + "learning_rate": 0.0001, + "loss": 4.4433, + "loss/crossentropy": 2.2201706171035767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22639702260494232, + "step": 7674 + }, + { + "epoch": 0.15352, + "grad_norm": 2.15625, + "grad_norm_var": 0.015104166666666667, + "learning_rate": 0.0001, + "loss": 4.2337, + "loss/crossentropy": 2.059146285057068, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22411910444498062, + "step": 7676 + }, + { + "epoch": 0.15356, + "grad_norm": 2.0625, + "grad_norm_var": 0.015397135416666667, + "learning_rate": 0.0001, + "loss": 4.2318, + "loss/crossentropy": 1.9768954515457153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21798508614301682, + "step": 7678 + }, + { + "epoch": 0.1536, + "grad_norm": 2.109375, + "grad_norm_var": 0.003413899739583333, + "learning_rate": 0.0001, + "loss": 4.4823, + "loss/crossentropy": 1.8555094003677368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19453337788581848, + "step": 7680 + }, + { + "epoch": 0.15364, + "grad_norm": 2.03125, + "grad_norm_var": 0.0036783854166666666, + "learning_rate": 0.0001, + "loss": 4.0379, + "loss/crossentropy": 1.5948917865753174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17746463418006897, + "step": 7682 + }, + { + "epoch": 0.15368, + "grad_norm": 2.1875, + "grad_norm_var": 0.0034464518229166668, + "learning_rate": 0.0001, + "loss": 4.2901, + "loss/crossentropy": 1.8917757868766785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2103574424982071, + "step": 7684 + }, + { + "epoch": 0.15372, + "grad_norm": 2.140625, + "grad_norm_var": 0.0030670166015625, + "learning_rate": 0.0001, + "loss": 4.3326, + "loss/crossentropy": 2.090232729911804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2199823334813118, + "step": 7686 + }, + { + "epoch": 0.15376, + "grad_norm": 2.03125, + "grad_norm_var": 0.004979451497395833, + "learning_rate": 0.0001, + "loss": 4.3695, + "loss/crossentropy": 1.8155178427696228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21044857800006866, + "step": 7688 + }, + { + "epoch": 0.1538, + "grad_norm": 2.09375, + "grad_norm_var": 0.0063629150390625, + "learning_rate": 0.0001, + "loss": 4.5003, + "loss/crossentropy": 2.31532621383667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2437051385641098, + "step": 7690 + }, + { + "epoch": 0.15384, + "grad_norm": 2.078125, + "grad_norm_var": 0.0065419514973958336, + "learning_rate": 0.0001, + "loss": 4.3815, + "loss/crossentropy": 1.9688079357147217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22686263918876648, + "step": 7692 + }, + { + "epoch": 0.15388, + "grad_norm": 2.046875, + "grad_norm_var": 0.010001373291015626, + "learning_rate": 0.0001, + "loss": 3.9937, + "loss/crossentropy": 1.9029017686843872, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20635761320590973, + "step": 7694 + }, + { + "epoch": 0.15392, + "grad_norm": 1.9140625, + "grad_norm_var": 0.010587565104166667, + "learning_rate": 0.0001, + "loss": 3.91, + "loss/crossentropy": 1.9817028641700745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2080235257744789, + "step": 7696 + }, + { + "epoch": 0.15396, + "grad_norm": 2.171875, + "grad_norm_var": 0.017463175455729167, + "learning_rate": 0.0001, + "loss": 4.3301, + "loss/crossentropy": 2.3392014503479004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26226382702589035, + "step": 7698 + }, + { + "epoch": 0.154, + "grad_norm": 1.9375, + "grad_norm_var": 0.021320597330729166, + "learning_rate": 0.0001, + "loss": 4.0381, + "loss/crossentropy": 1.7265403866767883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21790936589241028, + "step": 7700 + }, + { + "epoch": 0.15404, + "grad_norm": 2.203125, + "grad_norm_var": 0.02276585896809896, + "learning_rate": 0.0001, + "loss": 4.1725, + "loss/crossentropy": 2.024384081363678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21987473219633102, + "step": 7702 + }, + { + "epoch": 0.15408, + "grad_norm": 2.5, + "grad_norm_var": 0.03117650349934896, + "learning_rate": 0.0001, + "loss": 4.6702, + "loss/crossentropy": 2.1840893030166626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24231631308794022, + "step": 7704 + }, + { + "epoch": 0.15412, + "grad_norm": 2.171875, + "grad_norm_var": 0.03001683553059896, + "learning_rate": 0.0001, + "loss": 4.2197, + "loss/crossentropy": 2.1950928568840027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21808429062366486, + "step": 7706 + }, + { + "epoch": 0.15416, + "grad_norm": 2.140625, + "grad_norm_var": 0.03029352823893229, + "learning_rate": 0.0001, + "loss": 4.3215, + "loss/crossentropy": 1.9541537165641785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22109197825193405, + "step": 7708 + }, + { + "epoch": 0.1542, + "grad_norm": 2.03125, + "grad_norm_var": 0.0257476806640625, + "learning_rate": 0.0001, + "loss": 4.1966, + "loss/crossentropy": 2.0232877135276794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.214762382209301, + "step": 7710 + }, + { + "epoch": 0.15424, + "grad_norm": 1.9609375, + "grad_norm_var": 0.023859659830729168, + "learning_rate": 0.0001, + "loss": 4.0012, + "loss/crossentropy": 2.003768503665924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21820105612277985, + "step": 7712 + }, + { + "epoch": 0.15428, + "grad_norm": 2.21875, + "grad_norm_var": 0.020026652018229167, + "learning_rate": 0.0001, + "loss": 4.2471, + "loss/crossentropy": 2.007221221923828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2203196883201599, + "step": 7714 + }, + { + "epoch": 0.15432, + "grad_norm": 2.09375, + "grad_norm_var": 0.015900675455729166, + "learning_rate": 0.0001, + "loss": 4.3201, + "loss/crossentropy": 2.0134615898132324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21916545927524567, + "step": 7716 + }, + { + "epoch": 0.15436, + "grad_norm": 2.328125, + "grad_norm_var": 0.016013336181640626, + "learning_rate": 0.0001, + "loss": 4.3821, + "loss/crossentropy": 1.9012999534606934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25758640468120575, + "step": 7718 + }, + { + "epoch": 0.1544, + "grad_norm": 2.25, + "grad_norm_var": 0.008727773030598959, + "learning_rate": 0.0001, + "loss": 4.1555, + "loss/crossentropy": 2.074169874191284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22667942196130753, + "step": 7720 + }, + { + "epoch": 0.15444, + "grad_norm": 2.21875, + "grad_norm_var": 0.009059397379557292, + "learning_rate": 0.0001, + "loss": 4.4355, + "loss/crossentropy": 2.070925295352936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20353248715400696, + "step": 7722 + }, + { + "epoch": 0.15448, + "grad_norm": 2.359375, + "grad_norm_var": 0.011163075764973959, + "learning_rate": 0.0001, + "loss": 4.5676, + "loss/crossentropy": 2.289568066596985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23175117373466492, + "step": 7724 + }, + { + "epoch": 0.15452, + "grad_norm": 2.15625, + "grad_norm_var": 0.012827301025390625, + "learning_rate": 0.0001, + "loss": 4.6152, + "loss/crossentropy": 2.21374249458313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23585008084774017, + "step": 7726 + }, + { + "epoch": 0.15456, + "grad_norm": 2.328125, + "grad_norm_var": 0.010791015625, + "learning_rate": 0.0001, + "loss": 4.5575, + "loss/crossentropy": 2.15897136926651, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24143048375844955, + "step": 7728 + }, + { + "epoch": 0.1546, + "grad_norm": 2.359375, + "grad_norm_var": 0.01142578125, + "learning_rate": 0.0001, + "loss": 4.4722, + "loss/crossentropy": 2.134206771850586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24344487488269806, + "step": 7730 + }, + { + "epoch": 0.15464, + "grad_norm": 2.359375, + "grad_norm_var": 0.011180623372395834, + "learning_rate": 0.0001, + "loss": 4.5827, + "loss/crossentropy": 2.3832513093948364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24878299236297607, + "step": 7732 + }, + { + "epoch": 0.15468, + "grad_norm": 2.203125, + "grad_norm_var": 0.0102203369140625, + "learning_rate": 0.0001, + "loss": 4.0722, + "loss/crossentropy": 1.917544960975647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20134451985359192, + "step": 7734 + }, + { + "epoch": 0.15472, + "grad_norm": 2.40625, + "grad_norm_var": 0.011767578125, + "learning_rate": 0.0001, + "loss": 4.4499, + "loss/crossentropy": 2.1081286668777466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23577219247817993, + "step": 7736 + }, + { + "epoch": 0.15476, + "grad_norm": 2.4375, + "grad_norm_var": 0.0137603759765625, + "learning_rate": 0.0001, + "loss": 4.7942, + "loss/crossentropy": 2.214662790298462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23744845390319824, + "step": 7738 + }, + { + "epoch": 0.1548, + "grad_norm": 2.359375, + "grad_norm_var": 0.013895670572916666, + "learning_rate": 0.0001, + "loss": 4.4325, + "loss/crossentropy": 1.995256781578064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21037640422582626, + "step": 7740 + }, + { + "epoch": 0.15484, + "grad_norm": 1.890625, + "grad_norm_var": 0.024006144205729166, + "learning_rate": 0.0001, + "loss": 4.0701, + "loss/crossentropy": 2.2877765893936157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23090071976184845, + "step": 7742 + }, + { + "epoch": 0.15488, + "grad_norm": 2.125, + "grad_norm_var": 0.022652180989583333, + "learning_rate": 0.0001, + "loss": 4.6167, + "loss/crossentropy": 2.23935329914093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24617131054401398, + "step": 7744 + }, + { + "epoch": 0.15492, + "grad_norm": 2.125, + "grad_norm_var": 0.023005167643229168, + "learning_rate": 0.0001, + "loss": 4.4886, + "loss/crossentropy": 2.15006422996521, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2235095053911209, + "step": 7746 + }, + { + "epoch": 0.15496, + "grad_norm": 2.015625, + "grad_norm_var": 0.024144490559895832, + "learning_rate": 0.0001, + "loss": 4.1873, + "loss/crossentropy": 1.9917905926704407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21606986224651337, + "step": 7748 + }, + { + "epoch": 0.155, + "grad_norm": 2.203125, + "grad_norm_var": 0.025804646809895835, + "learning_rate": 0.0001, + "loss": 4.6347, + "loss/crossentropy": 2.303179979324341, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2541813999414444, + "step": 7750 + }, + { + "epoch": 0.15504, + "grad_norm": 2.109375, + "grad_norm_var": 0.022786458333333332, + "learning_rate": 0.0001, + "loss": 4.3647, + "loss/crossentropy": 2.231510281562805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2479737550020218, + "step": 7752 + }, + { + "epoch": 0.15508, + "grad_norm": 2.25, + "grad_norm_var": 0.01778132120768229, + "learning_rate": 0.0001, + "loss": 4.1714, + "loss/crossentropy": 2.0530437231063843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20220057666301727, + "step": 7754 + }, + { + "epoch": 0.15512, + "grad_norm": 2.171875, + "grad_norm_var": 0.015909830729166668, + "learning_rate": 0.0001, + "loss": 4.1978, + "loss/crossentropy": 2.0889222025871277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22389977425336838, + "step": 7756 + }, + { + "epoch": 0.15516, + "grad_norm": 2.28125, + "grad_norm_var": 0.0131256103515625, + "learning_rate": 0.0001, + "loss": 4.4748, + "loss/crossentropy": 2.3807711601257324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2397037297487259, + "step": 7758 + }, + { + "epoch": 0.1552, + "grad_norm": 2.171875, + "grad_norm_var": 0.01297607421875, + "learning_rate": 0.0001, + "loss": 4.3382, + "loss/crossentropy": 1.8144067525863647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19052604585886002, + "step": 7760 + }, + { + "epoch": 0.15524, + "grad_norm": 2.09375, + "grad_norm_var": 0.013240559895833334, + "learning_rate": 0.0001, + "loss": 4.5358, + "loss/crossentropy": 2.295349955558777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2168404459953308, + "step": 7762 + }, + { + "epoch": 0.15528, + "grad_norm": 11.8125, + "grad_norm_var": 5.868936920166016, + "learning_rate": 0.0001, + "loss": 4.1706, + "loss/crossentropy": 1.7281805276870728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.230300635099411, + "step": 7764 + }, + { + "epoch": 0.15532, + "grad_norm": 2.375, + "grad_norm_var": 5.861083730061849, + "learning_rate": 0.0001, + "loss": 4.3867, + "loss/crossentropy": 2.2153135538101196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24419061839580536, + "step": 7766 + }, + { + "epoch": 0.15536, + "grad_norm": 2.03125, + "grad_norm_var": 5.890169270833334, + "learning_rate": 0.0001, + "loss": 4.2047, + "loss/crossentropy": 2.0259060859680176, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2119704708456993, + "step": 7768 + }, + { + "epoch": 0.1554, + "grad_norm": 2.21875, + "grad_norm_var": 5.871726226806641, + "learning_rate": 0.0001, + "loss": 4.3405, + "loss/crossentropy": 2.2399297952651978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23252833634614944, + "step": 7770 + }, + { + "epoch": 0.15544, + "grad_norm": 2.109375, + "grad_norm_var": 5.86380615234375, + "learning_rate": 0.0001, + "loss": 4.3287, + "loss/crossentropy": 2.0974661111831665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2085232511162758, + "step": 7772 + }, + { + "epoch": 0.15548, + "grad_norm": 2.21875, + "grad_norm_var": 5.861717732747396, + "learning_rate": 0.0001, + "loss": 4.4141, + "loss/crossentropy": 2.0121108293533325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24354346096515656, + "step": 7774 + }, + { + "epoch": 0.15552, + "grad_norm": 2.171875, + "grad_norm_var": 5.843431599934896, + "learning_rate": 0.0001, + "loss": 4.3167, + "loss/crossentropy": 2.1463273763656616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22797952592372894, + "step": 7776 + }, + { + "epoch": 0.15556, + "grad_norm": 1.9140625, + "grad_norm_var": 5.875705718994141, + "learning_rate": 0.0001, + "loss": 3.9814, + "loss/crossentropy": 1.6362827122211456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18291430547833443, + "step": 7778 + }, + { + "epoch": 0.1556, + "grad_norm": 2.21875, + "grad_norm_var": 0.016283162434895835, + "learning_rate": 0.0001, + "loss": 4.3534, + "loss/crossentropy": 2.4894620180130005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24631594866514206, + "step": 7780 + }, + { + "epoch": 0.15564, + "grad_norm": 2.15625, + "grad_norm_var": 0.0119781494140625, + "learning_rate": 0.0001, + "loss": 4.5799, + "loss/crossentropy": 2.346967577934265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20964853465557098, + "step": 7782 + }, + { + "epoch": 0.15568, + "grad_norm": 2.03125, + "grad_norm_var": 0.010282135009765625, + "learning_rate": 0.0001, + "loss": 4.5036, + "loss/crossentropy": 2.0165189504623413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22410035878419876, + "step": 7784 + }, + { + "epoch": 0.15572, + "grad_norm": 2.140625, + "grad_norm_var": 0.009417470296223958, + "learning_rate": 0.0001, + "loss": 4.0735, + "loss/crossentropy": 1.7486848831176758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20792805403470993, + "step": 7786 + }, + { + "epoch": 0.15576, + "grad_norm": 2.03125, + "grad_norm_var": 0.009905751546223958, + "learning_rate": 0.0001, + "loss": 4.1321, + "loss/crossentropy": 1.9615037441253662, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22578728944063187, + "step": 7788 + }, + { + "epoch": 0.1558, + "grad_norm": 2.09375, + "grad_norm_var": 0.009069569905598958, + "learning_rate": 0.0001, + "loss": 4.326, + "loss/crossentropy": 1.8386783003807068, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21898606419563293, + "step": 7790 + }, + { + "epoch": 0.15584, + "grad_norm": 2.15625, + "grad_norm_var": 0.011533355712890625, + "learning_rate": 0.0001, + "loss": 4.6429, + "loss/crossentropy": 2.1383039951324463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2322411835193634, + "step": 7792 + }, + { + "epoch": 0.15588, + "grad_norm": 2.21875, + "grad_norm_var": 0.008918253580729167, + "learning_rate": 0.0001, + "loss": 4.3568, + "loss/crossentropy": 2.510488271713257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24672146886587143, + "step": 7794 + }, + { + "epoch": 0.15592, + "grad_norm": 2.125, + "grad_norm_var": 0.009309895833333333, + "learning_rate": 0.0001, + "loss": 4.4328, + "loss/crossentropy": 2.03993421792984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23064473271369934, + "step": 7796 + }, + { + "epoch": 0.15596, + "grad_norm": 2.09375, + "grad_norm_var": 0.007306925455729167, + "learning_rate": 0.0001, + "loss": 4.4833, + "loss/crossentropy": 2.305809736251831, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21916814893484116, + "step": 7798 + }, + { + "epoch": 0.156, + "grad_norm": 2.03125, + "grad_norm_var": 0.007796223958333333, + "learning_rate": 0.0001, + "loss": 4.1218, + "loss/crossentropy": 1.8330454230308533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20832987129688263, + "step": 7800 + }, + { + "epoch": 0.15604, + "grad_norm": 2.140625, + "grad_norm_var": 0.007420857747395833, + "learning_rate": 0.0001, + "loss": 4.2579, + "loss/crossentropy": 1.9194663166999817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22163349390029907, + "step": 7802 + }, + { + "epoch": 0.15608, + "grad_norm": 2.078125, + "grad_norm_var": 0.006917317708333333, + "learning_rate": 0.0001, + "loss": 4.2219, + "loss/crossentropy": 1.798878252506256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19532181322574615, + "step": 7804 + }, + { + "epoch": 0.15612, + "grad_norm": 2.140625, + "grad_norm_var": 0.0067942301432291664, + "learning_rate": 0.0001, + "loss": 4.2949, + "loss/crossentropy": 1.730432152748108, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20092248916625977, + "step": 7806 + }, + { + "epoch": 0.15616, + "grad_norm": 2.515625, + "grad_norm_var": 0.01304931640625, + "learning_rate": 0.0001, + "loss": 4.3109, + "loss/crossentropy": 2.1426968574523926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22185371816158295, + "step": 7808 + }, + { + "epoch": 0.1562, + "grad_norm": 2.078125, + "grad_norm_var": 0.012848917643229167, + "learning_rate": 0.0001, + "loss": 4.4308, + "loss/crossentropy": 1.983969271183014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22203146666288376, + "step": 7810 + }, + { + "epoch": 0.15624, + "grad_norm": 2.0625, + "grad_norm_var": 0.016532389322916667, + "learning_rate": 0.0001, + "loss": 4.2114, + "loss/crossentropy": 2.2948192954063416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23453453928232193, + "step": 7812 + }, + { + "epoch": 0.15628, + "grad_norm": 2.09375, + "grad_norm_var": 0.0175933837890625, + "learning_rate": 0.0001, + "loss": 4.287, + "loss/crossentropy": 1.9190048575401306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22457829862833023, + "step": 7814 + }, + { + "epoch": 0.15632, + "grad_norm": 2.171875, + "grad_norm_var": 0.01529541015625, + "learning_rate": 0.0001, + "loss": 4.1721, + "loss/crossentropy": 2.0268847346305847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21628276258707047, + "step": 7816 + }, + { + "epoch": 0.15636, + "grad_norm": 2.203125, + "grad_norm_var": 0.017411295572916666, + "learning_rate": 0.0001, + "loss": 4.3553, + "loss/crossentropy": 2.0050706267356873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20108875632286072, + "step": 7818 + }, + { + "epoch": 0.1564, + "grad_norm": 2.453125, + "grad_norm_var": 0.024470774332682292, + "learning_rate": 0.0001, + "loss": 4.2059, + "loss/crossentropy": 2.096635937690735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22623063623905182, + "step": 7820 + }, + { + "epoch": 0.15644, + "grad_norm": 2.21875, + "grad_norm_var": 0.024580637613932293, + "learning_rate": 0.0001, + "loss": 4.3901, + "loss/crossentropy": 1.9188768863677979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21088901162147522, + "step": 7822 + }, + { + "epoch": 0.15648, + "grad_norm": 2.203125, + "grad_norm_var": 0.018304189046223957, + "learning_rate": 0.0001, + "loss": 4.4062, + "loss/crossentropy": 2.3639817237854004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2608294039964676, + "step": 7824 + }, + { + "epoch": 0.15652, + "grad_norm": 2.1875, + "grad_norm_var": 0.017765045166015625, + "learning_rate": 0.0001, + "loss": 4.2123, + "loss/crossentropy": 1.9716956615447998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23327408730983734, + "step": 7826 + }, + { + "epoch": 0.15656, + "grad_norm": 2.265625, + "grad_norm_var": 0.08131688435872396, + "learning_rate": 0.0001, + "loss": 4.1377, + "loss/crossentropy": 2.004276990890503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22930586338043213, + "step": 7828 + }, + { + "epoch": 0.1566, + "grad_norm": 2.078125, + "grad_norm_var": 0.08247858683268229, + "learning_rate": 0.0001, + "loss": 4.4703, + "loss/crossentropy": 1.7996181845664978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21030305325984955, + "step": 7830 + }, + { + "epoch": 0.15664, + "grad_norm": 2.34375, + "grad_norm_var": 0.09555435180664062, + "learning_rate": 0.0001, + "loss": 4.7938, + "loss/crossentropy": 2.192178189754486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2232302725315094, + "step": 7832 + }, + { + "epoch": 0.15668, + "grad_norm": 2.109375, + "grad_norm_var": 0.09538345336914063, + "learning_rate": 0.0001, + "loss": 4.2381, + "loss/crossentropy": 1.7093925476074219, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19697313755750656, + "step": 7834 + }, + { + "epoch": 0.15672, + "grad_norm": 2.234375, + "grad_norm_var": 0.09130859375, + "learning_rate": 0.0001, + "loss": 4.2534, + "loss/crossentropy": 1.915247917175293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21003933250904083, + "step": 7836 + }, + { + "epoch": 0.15676, + "grad_norm": 2.375, + "grad_norm_var": 0.091162109375, + "learning_rate": 0.0001, + "loss": 4.3825, + "loss/crossentropy": 2.188641667366028, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23260314762592316, + "step": 7838 + }, + { + "epoch": 0.1568, + "grad_norm": 2.109375, + "grad_norm_var": 0.09501953125, + "learning_rate": 0.0001, + "loss": 4.7329, + "loss/crossentropy": 2.3316495418548584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24601806700229645, + "step": 7840 + }, + { + "epoch": 0.15684, + "grad_norm": 2.28125, + "grad_norm_var": 0.0918121337890625, + "learning_rate": 0.0001, + "loss": 4.2934, + "loss/crossentropy": 2.140946924686432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23532958328723907, + "step": 7842 + }, + { + "epoch": 0.15688, + "grad_norm": 2.21875, + "grad_norm_var": 0.031525675455729166, + "learning_rate": 0.0001, + "loss": 4.2738, + "loss/crossentropy": 2.372095465660095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2615740895271301, + "step": 7844 + }, + { + "epoch": 0.15692, + "grad_norm": 2.25, + "grad_norm_var": 0.028727213541666668, + "learning_rate": 0.0001, + "loss": 4.1954, + "loss/crossentropy": 1.8433185815811157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2160269021987915, + "step": 7846 + }, + { + "epoch": 0.15696, + "grad_norm": 2.140625, + "grad_norm_var": 0.016071573893229166, + "learning_rate": 0.0001, + "loss": 4.2489, + "loss/crossentropy": 2.012324333190918, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1964300200343132, + "step": 7848 + }, + { + "epoch": 0.157, + "grad_norm": 2.203125, + "grad_norm_var": 0.014090983072916667, + "learning_rate": 0.0001, + "loss": 4.486, + "loss/crossentropy": 2.0325884222984314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22080926597118378, + "step": 7850 + }, + { + "epoch": 0.15704, + "grad_norm": 2.296875, + "grad_norm_var": 0.013036092122395834, + "learning_rate": 0.0001, + "loss": 4.3784, + "loss/crossentropy": 2.3786104917526245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24448946118354797, + "step": 7852 + }, + { + "epoch": 0.15708, + "grad_norm": 2.21875, + "grad_norm_var": 0.011693318684895834, + "learning_rate": 0.0001, + "loss": 4.5173, + "loss/crossentropy": 2.304913640022278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24708375334739685, + "step": 7854 + }, + { + "epoch": 0.15712, + "grad_norm": 2.1875, + "grad_norm_var": 0.006371053059895834, + "learning_rate": 0.0001, + "loss": 4.5711, + "loss/crossentropy": 1.8640215396881104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19824489206075668, + "step": 7856 + }, + { + "epoch": 0.15716, + "grad_norm": 2.0625, + "grad_norm_var": 0.00625, + "learning_rate": 0.0001, + "loss": 4.6174, + "loss/crossentropy": 2.444548487663269, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2542492523789406, + "step": 7858 + }, + { + "epoch": 0.1572, + "grad_norm": 2.078125, + "grad_norm_var": 0.00592041015625, + "learning_rate": 0.0001, + "loss": 4.0522, + "loss/crossentropy": 1.991346299648285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2159758359193802, + "step": 7860 + }, + { + "epoch": 0.15724, + "grad_norm": 2.15625, + "grad_norm_var": 0.005101521809895833, + "learning_rate": 0.0001, + "loss": 4.2627, + "loss/crossentropy": 2.3172048926353455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2506742626428604, + "step": 7862 + }, + { + "epoch": 0.15728, + "grad_norm": 2.265625, + "grad_norm_var": 0.0050201416015625, + "learning_rate": 0.0001, + "loss": 4.5415, + "loss/crossentropy": 1.9302632212638855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2104542776942253, + "step": 7864 + }, + { + "epoch": 0.15732, + "grad_norm": 2.125, + "grad_norm_var": 0.020726521809895832, + "learning_rate": 0.0001, + "loss": 4.7831, + "loss/crossentropy": 2.457883358001709, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23538483679294586, + "step": 7866 + }, + { + "epoch": 0.15736, + "grad_norm": 2.125, + "grad_norm_var": 0.019677734375, + "learning_rate": 0.0001, + "loss": 4.1912, + "loss/crossentropy": 1.9857566952705383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2131117358803749, + "step": 7868 + }, + { + "epoch": 0.1574, + "grad_norm": 2.203125, + "grad_norm_var": 0.019140625, + "learning_rate": 0.0001, + "loss": 4.4264, + "loss/crossentropy": 2.0062127113342285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22806233912706375, + "step": 7870 + }, + { + "epoch": 0.15744, + "grad_norm": 2.15625, + "grad_norm_var": 0.019489542643229166, + "learning_rate": 0.0001, + "loss": 4.6181, + "loss/crossentropy": 2.283127784729004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23883548378944397, + "step": 7872 + }, + { + "epoch": 0.15748, + "grad_norm": 2.234375, + "grad_norm_var": 0.019136555989583335, + "learning_rate": 0.0001, + "loss": 4.4442, + "loss/crossentropy": 2.110979437828064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.209333136677742, + "step": 7874 + }, + { + "epoch": 0.15752, + "grad_norm": 2.140625, + "grad_norm_var": 0.019856770833333332, + "learning_rate": 0.0001, + "loss": 4.2107, + "loss/crossentropy": 1.8705166578292847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20791998505592346, + "step": 7876 + }, + { + "epoch": 0.15756, + "grad_norm": 2.140625, + "grad_norm_var": 0.019017537434895832, + "learning_rate": 0.0001, + "loss": 4.2956, + "loss/crossentropy": 2.133803129196167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23219672590494156, + "step": 7878 + }, + { + "epoch": 0.1576, + "grad_norm": 2.21875, + "grad_norm_var": 0.019627888997395832, + "learning_rate": 0.0001, + "loss": 4.5028, + "loss/crossentropy": 2.1062549352645874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2266012579202652, + "step": 7880 + }, + { + "epoch": 0.15764, + "grad_norm": 2.28125, + "grad_norm_var": 0.0058553059895833336, + "learning_rate": 0.0001, + "loss": 4.5078, + "loss/crossentropy": 2.0088155269622803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2079932913184166, + "step": 7882 + }, + { + "epoch": 0.15768, + "grad_norm": 2.234375, + "grad_norm_var": 0.0065582275390625, + "learning_rate": 0.0001, + "loss": 3.9204, + "loss/crossentropy": 1.6621176600456238, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19370710104703903, + "step": 7884 + }, + { + "epoch": 0.15772, + "grad_norm": 2.09375, + "grad_norm_var": 0.007136027018229167, + "learning_rate": 0.0001, + "loss": 4.3462, + "loss/crossentropy": 1.7729167938232422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18562481552362442, + "step": 7886 + }, + { + "epoch": 0.15776, + "grad_norm": 2.109375, + "grad_norm_var": 0.006843058268229166, + "learning_rate": 0.0001, + "loss": 4.3897, + "loss/crossentropy": 2.132485508918762, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23432063311338425, + "step": 7888 + }, + { + "epoch": 0.1578, + "grad_norm": 2.328125, + "grad_norm_var": 0.0091949462890625, + "learning_rate": 0.0001, + "loss": 4.3731, + "loss/crossentropy": 2.122144937515259, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23042195290327072, + "step": 7890 + }, + { + "epoch": 0.15784, + "grad_norm": 2.8125, + "grad_norm_var": 0.034956868489583334, + "learning_rate": 0.0001, + "loss": 4.4025, + "loss/crossentropy": 1.855578601360321, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20124691724777222, + "step": 7892 + }, + { + "epoch": 0.15788, + "grad_norm": 2.15625, + "grad_norm_var": 0.035054524739583336, + "learning_rate": 0.0001, + "loss": 4.172, + "loss/crossentropy": 2.02128005027771, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20766886323690414, + "step": 7894 + }, + { + "epoch": 0.15792, + "grad_norm": 1.984375, + "grad_norm_var": 0.03681233723958333, + "learning_rate": 0.0001, + "loss": 4.3096, + "loss/crossentropy": 2.112824857234955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22250613570213318, + "step": 7896 + }, + { + "epoch": 0.15796, + "grad_norm": 2.3125, + "grad_norm_var": 0.03662821451822917, + "learning_rate": 0.0001, + "loss": 4.5595, + "loss/crossentropy": 2.2290207147598267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2531846910715103, + "step": 7898 + }, + { + "epoch": 0.158, + "grad_norm": 1.96875, + "grad_norm_var": 0.03882548014322917, + "learning_rate": 0.0001, + "loss": 4.3586, + "loss/crossentropy": 2.135373592376709, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2266940325498581, + "step": 7900 + }, + { + "epoch": 0.15804, + "grad_norm": 2.125, + "grad_norm_var": 0.04052734375, + "learning_rate": 0.0001, + "loss": 4.1719, + "loss/crossentropy": 1.7298616170883179, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1986929401755333, + "step": 7902 + }, + { + "epoch": 0.15808, + "grad_norm": 2.3125, + "grad_norm_var": 0.04173075358072917, + "learning_rate": 0.0001, + "loss": 4.403, + "loss/crossentropy": 2.18759286403656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21622422337532043, + "step": 7904 + }, + { + "epoch": 0.15812, + "grad_norm": 2.140625, + "grad_norm_var": 0.03889058430989583, + "learning_rate": 0.0001, + "loss": 4.4765, + "loss/crossentropy": 2.0889216661453247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21440055221319199, + "step": 7906 + }, + { + "epoch": 0.15816, + "grad_norm": 2.1875, + "grad_norm_var": 0.010204060872395834, + "learning_rate": 0.0001, + "loss": 4.2231, + "loss/crossentropy": 1.7791658639907837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1967781037092209, + "step": 7908 + }, + { + "epoch": 0.1582, + "grad_norm": 1.953125, + "grad_norm_var": 0.012386067708333334, + "learning_rate": 0.0001, + "loss": 4.2314, + "loss/crossentropy": 2.2144845724105835, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22152486443519592, + "step": 7910 + }, + { + "epoch": 0.15824, + "grad_norm": 2.078125, + "grad_norm_var": 0.012723795572916667, + "learning_rate": 0.0001, + "loss": 4.6236, + "loss/crossentropy": 2.316117286682129, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24549901485443115, + "step": 7912 + }, + { + "epoch": 0.15828, + "grad_norm": 2.046875, + "grad_norm_var": 0.0115875244140625, + "learning_rate": 0.0001, + "loss": 4.0867, + "loss/crossentropy": 1.8642511367797852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22441796958446503, + "step": 7914 + }, + { + "epoch": 0.15832, + "grad_norm": 2.078125, + "grad_norm_var": 0.0098541259765625, + "learning_rate": 0.0001, + "loss": 4.1365, + "loss/crossentropy": 1.807969868183136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22138798981904984, + "step": 7916 + }, + { + "epoch": 0.15836, + "grad_norm": 2.390625, + "grad_norm_var": 0.013866170247395834, + "learning_rate": 0.0001, + "loss": 4.5625, + "loss/crossentropy": 2.266697645187378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22544697672128677, + "step": 7918 + }, + { + "epoch": 0.1584, + "grad_norm": 2.203125, + "grad_norm_var": 0.012262980143229166, + "learning_rate": 0.0001, + "loss": 4.5632, + "loss/crossentropy": 2.0871587991714478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21997228264808655, + "step": 7920 + }, + { + "epoch": 0.15844, + "grad_norm": 2.21875, + "grad_norm_var": 0.0126861572265625, + "learning_rate": 0.0001, + "loss": 4.5567, + "loss/crossentropy": 2.1993759870529175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2416691780090332, + "step": 7922 + }, + { + "epoch": 0.15848, + "grad_norm": 2.5, + "grad_norm_var": 0.020340983072916666, + "learning_rate": 0.0001, + "loss": 4.2997, + "loss/crossentropy": 1.8039653897285461, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21721573173999786, + "step": 7924 + }, + { + "epoch": 0.15852, + "grad_norm": 2.3125, + "grad_norm_var": 0.0225982666015625, + "learning_rate": 0.0001, + "loss": 4.4795, + "loss/crossentropy": 2.188117265701294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.231883242726326, + "step": 7926 + }, + { + "epoch": 0.15856, + "grad_norm": 2.171875, + "grad_norm_var": 0.021923828125, + "learning_rate": 0.0001, + "loss": 4.6694, + "loss/crossentropy": 2.3920425176620483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24655026197433472, + "step": 7928 + }, + { + "epoch": 0.1586, + "grad_norm": 2.125, + "grad_norm_var": 0.017561848958333334, + "learning_rate": 0.0001, + "loss": 4.1001, + "loss/crossentropy": 2.286831498146057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23398208618164062, + "step": 7930 + }, + { + "epoch": 0.15864, + "grad_norm": 2.0625, + "grad_norm_var": 0.016825358072916668, + "learning_rate": 0.0001, + "loss": 4.0007, + "loss/crossentropy": 2.075824797153473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22423933446407318, + "step": 7932 + }, + { + "epoch": 0.15868, + "grad_norm": 2.25, + "grad_norm_var": 0.0155426025390625, + "learning_rate": 0.0001, + "loss": 4.5697, + "loss/crossentropy": 2.197165012359619, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22215355187654495, + "step": 7934 + }, + { + "epoch": 0.15872, + "grad_norm": 2.078125, + "grad_norm_var": 0.017854817708333335, + "learning_rate": 0.0001, + "loss": 4.2899, + "loss/crossentropy": 1.8253535032272339, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20457974076271057, + "step": 7936 + }, + { + "epoch": 0.15876, + "grad_norm": 2.21875, + "grad_norm_var": 0.01783447265625, + "learning_rate": 0.0001, + "loss": 4.4774, + "loss/crossentropy": 1.842383086681366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20742817968130112, + "step": 7938 + }, + { + "epoch": 0.1588, + "grad_norm": 2.0625, + "grad_norm_var": 0.014208984375, + "learning_rate": 0.0001, + "loss": 4.3801, + "loss/crossentropy": 2.2086315155029297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2086438685655594, + "step": 7940 + }, + { + "epoch": 0.15884, + "grad_norm": 2.109375, + "grad_norm_var": 0.0067047119140625, + "learning_rate": 0.0001, + "loss": 4.3718, + "loss/crossentropy": 2.3381282091140747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2546956539154053, + "step": 7942 + }, + { + "epoch": 0.15888, + "grad_norm": 2.265625, + "grad_norm_var": 0.005248006184895833, + "learning_rate": 0.0001, + "loss": 4.3439, + "loss/crossentropy": 2.1159931421279907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23917824029922485, + "step": 7944 + }, + { + "epoch": 0.15892, + "grad_norm": 2.078125, + "grad_norm_var": 0.0069081624348958336, + "learning_rate": 0.0001, + "loss": 4.2056, + "loss/crossentropy": 1.7507159113883972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1870383694767952, + "step": 7946 + }, + { + "epoch": 0.15896, + "grad_norm": 2.140625, + "grad_norm_var": 0.0073964436848958336, + "learning_rate": 0.0001, + "loss": 4.2118, + "loss/crossentropy": 2.213807225227356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22211921215057373, + "step": 7948 + }, + { + "epoch": 0.159, + "grad_norm": 2.03125, + "grad_norm_var": 0.009598795572916667, + "learning_rate": 0.0001, + "loss": 4.4316, + "loss/crossentropy": 2.0570366978645325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2662791311740875, + "step": 7950 + }, + { + "epoch": 0.15904, + "grad_norm": 2.171875, + "grad_norm_var": 0.009748331705729167, + "learning_rate": 0.0001, + "loss": 4.2041, + "loss/crossentropy": 1.8775206208229065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22045740485191345, + "step": 7952 + }, + { + "epoch": 0.15908, + "grad_norm": 2.046875, + "grad_norm_var": 0.009601847330729166, + "learning_rate": 0.0001, + "loss": 4.2575, + "loss/crossentropy": 2.3483108282089233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22626785188913345, + "step": 7954 + }, + { + "epoch": 0.15912, + "grad_norm": 2.125, + "grad_norm_var": 0.009496053059895834, + "learning_rate": 0.0001, + "loss": 4.193, + "loss/crossentropy": 1.9531084895133972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22194421291351318, + "step": 7956 + }, + { + "epoch": 0.15916, + "grad_norm": 2.328125, + "grad_norm_var": 0.011213175455729167, + "learning_rate": 0.0001, + "loss": 4.4819, + "loss/crossentropy": 2.057813823223114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21116723865270615, + "step": 7958 + }, + { + "epoch": 0.1592, + "grad_norm": 2.28125, + "grad_norm_var": 0.01148681640625, + "learning_rate": 0.0001, + "loss": 4.3478, + "loss/crossentropy": 1.9398415088653564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2257898524403572, + "step": 7960 + }, + { + "epoch": 0.15924, + "grad_norm": 2.015625, + "grad_norm_var": 0.0111480712890625, + "learning_rate": 0.0001, + "loss": 4.1942, + "loss/crossentropy": 1.9200270175933838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2119666188955307, + "step": 7962 + }, + { + "epoch": 0.15928, + "grad_norm": 2.203125, + "grad_norm_var": 0.010640462239583334, + "learning_rate": 0.0001, + "loss": 4.4513, + "loss/crossentropy": 2.157149076461792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23271384835243225, + "step": 7964 + }, + { + "epoch": 0.15932, + "grad_norm": 2.25, + "grad_norm_var": 0.0083404541015625, + "learning_rate": 0.0001, + "loss": 4.2064, + "loss/crossentropy": 2.0308582186698914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23934553563594818, + "step": 7966 + }, + { + "epoch": 0.15936, + "grad_norm": 2.234375, + "grad_norm_var": 0.015543619791666666, + "learning_rate": 0.0001, + "loss": 4.3977, + "loss/crossentropy": 2.1855711936950684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.236568883061409, + "step": 7968 + }, + { + "epoch": 0.1594, + "grad_norm": 2.375, + "grad_norm_var": 0.07629292805989583, + "learning_rate": 0.0001, + "loss": 4.719, + "loss/crossentropy": 2.4479551315307617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24693088978528976, + "step": 7970 + }, + { + "epoch": 0.15944, + "grad_norm": 2.21875, + "grad_norm_var": 0.07333882649739583, + "learning_rate": 0.0001, + "loss": 4.5221, + "loss/crossentropy": 2.439974784851074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2502682954072952, + "step": 7972 + }, + { + "epoch": 0.15948, + "grad_norm": 2.375, + "grad_norm_var": 0.07625325520833333, + "learning_rate": 0.0001, + "loss": 4.4024, + "loss/crossentropy": 1.9899010062217712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23031124472618103, + "step": 7974 + }, + { + "epoch": 0.15952, + "grad_norm": 2.109375, + "grad_norm_var": 0.07517903645833333, + "learning_rate": 0.0001, + "loss": 4.2522, + "loss/crossentropy": 1.830255150794983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1984459012746811, + "step": 7976 + }, + { + "epoch": 0.15956, + "grad_norm": 2.21875, + "grad_norm_var": 0.07088114420572916, + "learning_rate": 0.0001, + "loss": 4.3832, + "loss/crossentropy": 1.9675705432891846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21336429566144943, + "step": 7978 + }, + { + "epoch": 0.1596, + "grad_norm": 2.1875, + "grad_norm_var": 0.06965738932291667, + "learning_rate": 0.0001, + "loss": 4.4784, + "loss/crossentropy": 2.030815005302429, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2102503478527069, + "step": 7980 + }, + { + "epoch": 0.15964, + "grad_norm": 2.109375, + "grad_norm_var": 0.07285054524739583, + "learning_rate": 0.0001, + "loss": 4.492, + "loss/crossentropy": 2.4932440519332886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.246867336332798, + "step": 7982 + }, + { + "epoch": 0.15968, + "grad_norm": 2.078125, + "grad_norm_var": 0.07330322265625, + "learning_rate": 0.0001, + "loss": 4.2085, + "loss/crossentropy": 1.5839802622795105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17231802642345428, + "step": 7984 + }, + { + "epoch": 0.15972, + "grad_norm": 2.359375, + "grad_norm_var": 0.0091217041015625, + "learning_rate": 0.0001, + "loss": 4.4045, + "loss/crossentropy": 1.821477472782135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2239791452884674, + "step": 7986 + }, + { + "epoch": 0.15976, + "grad_norm": 2.109375, + "grad_norm_var": 0.0093414306640625, + "learning_rate": 0.0001, + "loss": 4.0655, + "loss/crossentropy": 2.013838052749634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2335103377699852, + "step": 7988 + }, + { + "epoch": 0.1598, + "grad_norm": 2.03125, + "grad_norm_var": 0.006376139322916667, + "learning_rate": 0.0001, + "loss": 4.4989, + "loss/crossentropy": 1.9412779211997986, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20336110144853592, + "step": 7990 + }, + { + "epoch": 0.15984, + "grad_norm": 2.0, + "grad_norm_var": 0.008234659830729166, + "learning_rate": 0.0001, + "loss": 3.9015, + "loss/crossentropy": 1.6653677225112915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2025909200310707, + "step": 7992 + }, + { + "epoch": 0.15988, + "grad_norm": 2.125, + "grad_norm_var": 0.00865478515625, + "learning_rate": 0.0001, + "loss": 4.4299, + "loss/crossentropy": 2.0069726705551147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21300900727510452, + "step": 7994 + }, + { + "epoch": 0.15992, + "grad_norm": 2.046875, + "grad_norm_var": 0.009919230143229167, + "learning_rate": 0.0001, + "loss": 4.4395, + "loss/crossentropy": 2.1118472814559937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25853876769542694, + "step": 7996 + }, + { + "epoch": 0.15996, + "grad_norm": 2.34375, + "grad_norm_var": 0.013622029622395834, + "learning_rate": 0.0001, + "loss": 4.1922, + "loss/crossentropy": 1.608262836933136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1958206593990326, + "step": 7998 + }, + { + "epoch": 0.16, + "grad_norm": 2.203125, + "grad_norm_var": 0.0138580322265625, + "learning_rate": 0.0001, + "loss": 4.2896, + "loss/crossentropy": 1.6572073101997375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19102878868579865, + "step": 8000 + }, + { + "epoch": 0.16004, + "grad_norm": 2.015625, + "grad_norm_var": 0.0113677978515625, + "learning_rate": 0.0001, + "loss": 4.0811, + "loss/crossentropy": 1.9421688318252563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19628942012786865, + "step": 8002 + }, + { + "epoch": 0.16008, + "grad_norm": 2.265625, + "grad_norm_var": 0.0134918212890625, + "learning_rate": 0.0001, + "loss": 4.5344, + "loss/crossentropy": 2.2197489738464355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.248014435172081, + "step": 8004 + }, + { + "epoch": 0.16012, + "grad_norm": 2.203125, + "grad_norm_var": 0.0126617431640625, + "learning_rate": 0.0001, + "loss": 4.4625, + "loss/crossentropy": 2.200868308544159, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2476908192038536, + "step": 8006 + }, + { + "epoch": 0.16016, + "grad_norm": 2.34375, + "grad_norm_var": 0.01162109375, + "learning_rate": 0.0001, + "loss": 4.8151, + "loss/crossentropy": 2.3793649673461914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2356039509177208, + "step": 8008 + }, + { + "epoch": 0.1602, + "grad_norm": 2.140625, + "grad_norm_var": 0.0112945556640625, + "learning_rate": 0.0001, + "loss": 4.1849, + "loss/crossentropy": 2.130257308483124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.224358968436718, + "step": 8010 + }, + { + "epoch": 0.16024, + "grad_norm": 2.15625, + "grad_norm_var": 0.009566243489583333, + "learning_rate": 0.0001, + "loss": 4.5948, + "loss/crossentropy": 2.370365023612976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23811528086662292, + "step": 8012 + }, + { + "epoch": 0.16028, + "grad_norm": 2.0, + "grad_norm_var": 0.008812459309895833, + "learning_rate": 0.0001, + "loss": 4.4326, + "loss/crossentropy": 1.985486626625061, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19639353454113007, + "step": 8014 + }, + { + "epoch": 0.16032, + "grad_norm": 2.40625, + "grad_norm_var": 0.01129150390625, + "learning_rate": 0.0001, + "loss": 4.4335, + "loss/crossentropy": 2.2128632068634033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2340361252427101, + "step": 8016 + }, + { + "epoch": 0.16036, + "grad_norm": 2.0625, + "grad_norm_var": 0.009956868489583333, + "learning_rate": 0.0001, + "loss": 4.3688, + "loss/crossentropy": 1.830498456954956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19760622829198837, + "step": 8018 + }, + { + "epoch": 0.1604, + "grad_norm": 4.375, + "grad_norm_var": 0.3061757405598958, + "learning_rate": 0.0001, + "loss": 4.6443, + "loss/crossentropy": 1.9595977067947388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2138657420873642, + "step": 8020 + }, + { + "epoch": 0.16044, + "grad_norm": 2.1875, + "grad_norm_var": 0.30684305826822916, + "learning_rate": 0.0001, + "loss": 4.3101, + "loss/crossentropy": 2.400893449783325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2615668326616287, + "step": 8022 + }, + { + "epoch": 0.16048, + "grad_norm": 2.140625, + "grad_norm_var": 0.3080963134765625, + "learning_rate": 0.0001, + "loss": 4.6359, + "loss/crossentropy": 2.5079843997955322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.251235693693161, + "step": 8024 + }, + { + "epoch": 0.16052, + "grad_norm": 2.265625, + "grad_norm_var": 0.30686848958333335, + "learning_rate": 0.0001, + "loss": 4.4973, + "loss/crossentropy": 2.573891043663025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2646481841802597, + "step": 8026 + }, + { + "epoch": 0.16056, + "grad_norm": 2.1875, + "grad_norm_var": 0.3064605712890625, + "learning_rate": 0.0001, + "loss": 4.4177, + "loss/crossentropy": 2.0088363885879517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22888045758008957, + "step": 8028 + }, + { + "epoch": 0.1606, + "grad_norm": 2.109375, + "grad_norm_var": 0.31115697224934896, + "learning_rate": 0.0001, + "loss": 4.214, + "loss/crossentropy": 2.4596647024154663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2507014721632004, + "step": 8030 + }, + { + "epoch": 0.16064, + "grad_norm": 2.1875, + "grad_norm_var": 0.3093462626139323, + "learning_rate": 0.0001, + "loss": 4.6826, + "loss/crossentropy": 2.204255223274231, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22913406044244766, + "step": 8032 + }, + { + "epoch": 0.16068, + "grad_norm": 2.125, + "grad_norm_var": 0.30677261352539065, + "learning_rate": 0.0001, + "loss": 4.6603, + "loss/crossentropy": 2.285408139228821, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23185817897319794, + "step": 8034 + }, + { + "epoch": 0.16072, + "grad_norm": 1.9609375, + "grad_norm_var": 0.012520345052083333, + "learning_rate": 0.0001, + "loss": 3.9513, + "loss/crossentropy": 2.0788660645484924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20811481028795242, + "step": 8036 + }, + { + "epoch": 0.16076, + "grad_norm": 2.140625, + "grad_norm_var": 0.01307373046875, + "learning_rate": 0.0001, + "loss": 4.5368, + "loss/crossentropy": 2.398258686065674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2411409318447113, + "step": 8038 + }, + { + "epoch": 0.1608, + "grad_norm": 2.140625, + "grad_norm_var": 0.015579986572265624, + "learning_rate": 0.0001, + "loss": 4.0415, + "loss/crossentropy": 2.3101454973220825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22493668645620346, + "step": 8040 + }, + { + "epoch": 0.16084, + "grad_norm": 2.203125, + "grad_norm_var": 0.015134429931640625, + "learning_rate": 0.0001, + "loss": 4.0612, + "loss/crossentropy": 1.9015105962753296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22360052913427353, + "step": 8042 + }, + { + "epoch": 0.16088, + "grad_norm": 2.34375, + "grad_norm_var": 0.017704010009765625, + "learning_rate": 0.0001, + "loss": 4.216, + "loss/crossentropy": 2.0112481117248535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24062782526016235, + "step": 8044 + }, + { + "epoch": 0.16092, + "grad_norm": 2.171875, + "grad_norm_var": 0.0161773681640625, + "learning_rate": 0.0001, + "loss": 4.3638, + "loss/crossentropy": 2.2024285793304443, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21959003806114197, + "step": 8046 + }, + { + "epoch": 0.16096, + "grad_norm": 2.125, + "grad_norm_var": 0.014412434895833333, + "learning_rate": 0.0001, + "loss": 4.4825, + "loss/crossentropy": 2.1069058775901794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21980682760477066, + "step": 8048 + }, + { + "epoch": 0.161, + "grad_norm": 2.3125, + "grad_norm_var": 0.012540690104166667, + "learning_rate": 0.0001, + "loss": 4.2106, + "loss/crossentropy": 1.8380340337753296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2201671525835991, + "step": 8050 + }, + { + "epoch": 0.16104, + "grad_norm": 2.046875, + "grad_norm_var": 0.010965728759765625, + "learning_rate": 0.0001, + "loss": 4.3778, + "loss/crossentropy": 2.276741087436676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23787930607795715, + "step": 8052 + }, + { + "epoch": 0.16108, + "grad_norm": 2.140625, + "grad_norm_var": 0.010680898030598959, + "learning_rate": 0.0001, + "loss": 4.0784, + "loss/crossentropy": 1.631809651851654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19654212146997452, + "step": 8054 + }, + { + "epoch": 0.16112, + "grad_norm": 2.234375, + "grad_norm_var": 0.007453409830729166, + "learning_rate": 0.0001, + "loss": 4.5927, + "loss/crossentropy": 2.067444145679474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21420229971408844, + "step": 8056 + }, + { + "epoch": 0.16116, + "grad_norm": 2.21875, + "grad_norm_var": 0.011767578125, + "learning_rate": 0.0001, + "loss": 4.562, + "loss/crossentropy": 2.384890556335449, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2312106341123581, + "step": 8058 + }, + { + "epoch": 0.1612, + "grad_norm": 2.265625, + "grad_norm_var": 0.010986328125, + "learning_rate": 0.0001, + "loss": 4.363, + "loss/crossentropy": 2.3683160543441772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2527216002345085, + "step": 8060 + }, + { + "epoch": 0.16124, + "grad_norm": 2.171875, + "grad_norm_var": 0.011253865559895833, + "learning_rate": 0.0001, + "loss": 4.4576, + "loss/crossentropy": 2.1545952558517456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23928315192461014, + "step": 8062 + }, + { + "epoch": 0.16128, + "grad_norm": 1.96875, + "grad_norm_var": 0.013695271809895833, + "learning_rate": 0.0001, + "loss": 4.2285, + "loss/crossentropy": 2.0792208313941956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2226722463965416, + "step": 8064 + }, + { + "epoch": 0.16132, + "grad_norm": 2.09375, + "grad_norm_var": 0.013068644205729167, + "learning_rate": 0.0001, + "loss": 3.8788, + "loss/crossentropy": 2.181519627571106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2193506434559822, + "step": 8066 + }, + { + "epoch": 0.16136, + "grad_norm": 2.375, + "grad_norm_var": 0.015067545572916667, + "learning_rate": 0.0001, + "loss": 4.4445, + "loss/crossentropy": 1.7798657417297363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23413337767124176, + "step": 8068 + }, + { + "epoch": 0.1614, + "grad_norm": 2.125, + "grad_norm_var": 0.017235310872395833, + "learning_rate": 0.0001, + "loss": 4.0731, + "loss/crossentropy": 2.1232666969299316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23351240158081055, + "step": 8070 + }, + { + "epoch": 0.16144, + "grad_norm": 2.203125, + "grad_norm_var": 0.017609659830729166, + "learning_rate": 0.0001, + "loss": 4.6975, + "loss/crossentropy": 2.34002685546875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2533388137817383, + "step": 8072 + }, + { + "epoch": 0.16148, + "grad_norm": 2.21875, + "grad_norm_var": 0.011637369791666666, + "learning_rate": 0.0001, + "loss": 4.3487, + "loss/crossentropy": 2.066399872303009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21156150847673416, + "step": 8074 + }, + { + "epoch": 0.16152, + "grad_norm": 2.171875, + "grad_norm_var": 0.010677083333333334, + "learning_rate": 0.0001, + "loss": 4.3646, + "loss/crossentropy": 2.2298463582992554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2307809516787529, + "step": 8076 + }, + { + "epoch": 0.16156, + "grad_norm": 2.0625, + "grad_norm_var": 0.0105865478515625, + "learning_rate": 0.0001, + "loss": 4.1946, + "loss/crossentropy": 1.9858508110046387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21499747782945633, + "step": 8078 + }, + { + "epoch": 0.1616, + "grad_norm": 2.0625, + "grad_norm_var": 0.011149088541666666, + "learning_rate": 0.0001, + "loss": 3.9984, + "loss/crossentropy": 1.5669215321540833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1889330968260765, + "step": 8080 + }, + { + "epoch": 0.16164, + "grad_norm": 2.25, + "grad_norm_var": 0.011881510416666666, + "learning_rate": 0.0001, + "loss": 4.4743, + "loss/crossentropy": 2.296878218650818, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2405889928340912, + "step": 8082 + }, + { + "epoch": 0.16168, + "grad_norm": 2.203125, + "grad_norm_var": 0.007307942708333333, + "learning_rate": 0.0001, + "loss": 4.2205, + "loss/crossentropy": 1.954626441001892, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19809379428625107, + "step": 8084 + }, + { + "epoch": 0.16172, + "grad_norm": 2.3125, + "grad_norm_var": 0.008687337239583334, + "learning_rate": 0.0001, + "loss": 4.5923, + "loss/crossentropy": 1.8640353083610535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.226090669631958, + "step": 8086 + }, + { + "epoch": 0.16176, + "grad_norm": 2.28125, + "grad_norm_var": 0.009663899739583334, + "learning_rate": 0.0001, + "loss": 4.3363, + "loss/crossentropy": 2.1914591789245605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24432511627674103, + "step": 8088 + }, + { + "epoch": 0.1618, + "grad_norm": 2.21875, + "grad_norm_var": 0.009798177083333333, + "learning_rate": 0.0001, + "loss": 4.7291, + "loss/crossentropy": 2.192594051361084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23682628571987152, + "step": 8090 + }, + { + "epoch": 0.16184, + "grad_norm": 2.09375, + "grad_norm_var": 0.010993448893229167, + "learning_rate": 0.0001, + "loss": 4.4592, + "loss/crossentropy": 2.210235595703125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.230379119515419, + "step": 8092 + }, + { + "epoch": 0.16188, + "grad_norm": 2.140625, + "grad_norm_var": 0.011115519205729167, + "learning_rate": 0.0001, + "loss": 4.4111, + "loss/crossentropy": 2.214667320251465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26643867790699005, + "step": 8094 + }, + { + "epoch": 0.16192, + "grad_norm": 2.09375, + "grad_norm_var": 0.009586588541666666, + "learning_rate": 0.0001, + "loss": 4.3131, + "loss/crossentropy": 1.9808599948883057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20679324120283127, + "step": 8096 + }, + { + "epoch": 0.16196, + "grad_norm": 2.109375, + "grad_norm_var": 0.008333333333333333, + "learning_rate": 0.0001, + "loss": 4.4683, + "loss/crossentropy": 2.076589345932007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23430196940898895, + "step": 8098 + }, + { + "epoch": 0.162, + "grad_norm": 2.15625, + "grad_norm_var": 0.006883748372395833, + "learning_rate": 0.0001, + "loss": 4.5645, + "loss/crossentropy": 2.364492177963257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23023054748773575, + "step": 8100 + }, + { + "epoch": 0.16204, + "grad_norm": 2.140625, + "grad_norm_var": 0.005890909830729167, + "learning_rate": 0.0001, + "loss": 4.6595, + "loss/crossentropy": 2.2908111214637756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21239468455314636, + "step": 8102 + }, + { + "epoch": 0.16208, + "grad_norm": 2.40625, + "grad_norm_var": 0.008958943684895833, + "learning_rate": 0.0001, + "loss": 4.5503, + "loss/crossentropy": 1.8306183218955994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21511316299438477, + "step": 8104 + }, + { + "epoch": 0.16212, + "grad_norm": 2.0625, + "grad_norm_var": 0.010724894205729167, + "learning_rate": 0.0001, + "loss": 4.1582, + "loss/crossentropy": 2.1121758222579956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23471853882074356, + "step": 8106 + }, + { + "epoch": 0.16216, + "grad_norm": 2.140625, + "grad_norm_var": 0.012355295817057292, + "learning_rate": 0.0001, + "loss": 4.1464, + "loss/crossentropy": 1.7613067030906677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2025228664278984, + "step": 8108 + }, + { + "epoch": 0.1622, + "grad_norm": 2.0, + "grad_norm_var": 0.013844553629557292, + "learning_rate": 0.0001, + "loss": 4.0797, + "loss/crossentropy": 2.1413058042526245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21817607432603836, + "step": 8110 + }, + { + "epoch": 0.16224, + "grad_norm": 2.265625, + "grad_norm_var": 0.013641103108723959, + "learning_rate": 0.0001, + "loss": 4.5739, + "loss/crossentropy": 2.375948429107666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30982857942581177, + "step": 8112 + }, + { + "epoch": 0.16228, + "grad_norm": 2.1875, + "grad_norm_var": 0.013396962483723959, + "learning_rate": 0.0001, + "loss": 4.076, + "loss/crossentropy": 1.9669193029403687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21873307973146439, + "step": 8114 + }, + { + "epoch": 0.16232, + "grad_norm": 2.21875, + "grad_norm_var": 0.015592193603515625, + "learning_rate": 0.0001, + "loss": 4.4024, + "loss/crossentropy": 2.2028547525405884, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.229970782995224, + "step": 8116 + }, + { + "epoch": 0.16236, + "grad_norm": 2.078125, + "grad_norm_var": 0.01622289021809896, + "learning_rate": 0.0001, + "loss": 4.3359, + "loss/crossentropy": 2.082156002521515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2262335941195488, + "step": 8118 + }, + { + "epoch": 0.1624, + "grad_norm": 2.078125, + "grad_norm_var": 0.012379709879557292, + "learning_rate": 0.0001, + "loss": 4.2737, + "loss/crossentropy": 2.0538666248321533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22941745817661285, + "step": 8120 + }, + { + "epoch": 0.16244, + "grad_norm": 2.4375, + "grad_norm_var": 0.015750885009765625, + "learning_rate": 0.0001, + "loss": 4.3615, + "loss/crossentropy": 2.338989734649658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2434876710176468, + "step": 8122 + }, + { + "epoch": 0.16248, + "grad_norm": 2.375, + "grad_norm_var": 2.602311197916667, + "learning_rate": 0.0001, + "loss": 4.5472, + "loss/crossentropy": 2.277916193008423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29285988211631775, + "step": 8124 + }, + { + "epoch": 0.16252, + "grad_norm": 2.46875, + "grad_norm_var": 2.567577107747396, + "learning_rate": 0.0001, + "loss": 4.3437, + "loss/crossentropy": 2.1196334958076477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20728030800819397, + "step": 8126 + }, + { + "epoch": 0.16256, + "grad_norm": 2.15625, + "grad_norm_var": 2.5655558268229166, + "learning_rate": 0.0001, + "loss": 4.4216, + "loss/crossentropy": 2.0216450095176697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.229092076420784, + "step": 8128 + }, + { + "epoch": 0.1626, + "grad_norm": 2.171875, + "grad_norm_var": 2.5546834309895834, + "learning_rate": 0.0001, + "loss": 4.5163, + "loss/crossentropy": 2.098921537399292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22539281845092773, + "step": 8130 + }, + { + "epoch": 0.16264, + "grad_norm": 2.03125, + "grad_norm_var": 2.580052693684896, + "learning_rate": 0.0001, + "loss": 4.1066, + "loss/crossentropy": 2.0186068415641785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20286529511213303, + "step": 8132 + }, + { + "epoch": 0.16268, + "grad_norm": 2.28125, + "grad_norm_var": 2.5584706624348956, + "learning_rate": 0.0001, + "loss": 4.7847, + "loss/crossentropy": 2.157357335090637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23342902958393097, + "step": 8134 + }, + { + "epoch": 0.16272, + "grad_norm": 2.140625, + "grad_norm_var": 2.5516998291015627, + "learning_rate": 0.0001, + "loss": 4.3489, + "loss/crossentropy": 2.1885476112365723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22314801812171936, + "step": 8136 + }, + { + "epoch": 0.16276, + "grad_norm": 2.21875, + "grad_norm_var": 2.5796160380045574, + "learning_rate": 0.0001, + "loss": 4.3403, + "loss/crossentropy": 2.2208757400512695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2295266091823578, + "step": 8138 + }, + { + "epoch": 0.1628, + "grad_norm": 2.015625, + "grad_norm_var": 0.027522532145182292, + "learning_rate": 0.0001, + "loss": 4.0473, + "loss/crossentropy": 1.7885233163833618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2008970081806183, + "step": 8140 + }, + { + "epoch": 0.16284, + "grad_norm": 2.078125, + "grad_norm_var": 0.022989654541015626, + "learning_rate": 0.0001, + "loss": 4.0835, + "loss/crossentropy": 2.152435064315796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2311881259083748, + "step": 8142 + }, + { + "epoch": 0.16288, + "grad_norm": 2.296875, + "grad_norm_var": 0.016001129150390626, + "learning_rate": 0.0001, + "loss": 4.3384, + "loss/crossentropy": 2.0818406343460083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24117180705070496, + "step": 8144 + }, + { + "epoch": 0.16292, + "grad_norm": 2.140625, + "grad_norm_var": 0.02513402303059896, + "learning_rate": 0.0001, + "loss": 4.4246, + "loss/crossentropy": 2.1775856614112854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23246955126523972, + "step": 8146 + }, + { + "epoch": 0.16296, + "grad_norm": 2.578125, + "grad_norm_var": 0.033607737223307295, + "learning_rate": 0.0001, + "loss": 3.9993, + "loss/crossentropy": 1.7029682397842407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19564303010702133, + "step": 8148 + }, + { + "epoch": 0.163, + "grad_norm": 2.453125, + "grad_norm_var": 0.03468195597330729, + "learning_rate": 0.0001, + "loss": 4.5631, + "loss/crossentropy": 2.070194900035858, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2382609099149704, + "step": 8150 + }, + { + "epoch": 0.16304, + "grad_norm": 2.046875, + "grad_norm_var": 0.036043039957682294, + "learning_rate": 0.0001, + "loss": 4.19, + "loss/crossentropy": 1.9214876890182495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20894134789705276, + "step": 8152 + }, + { + "epoch": 0.16308, + "grad_norm": 2.21875, + "grad_norm_var": 0.03243815104166667, + "learning_rate": 0.0001, + "loss": 4.3622, + "loss/crossentropy": 1.7311474084854126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1962232068181038, + "step": 8154 + }, + { + "epoch": 0.16312, + "grad_norm": 2.0, + "grad_norm_var": 0.0323150634765625, + "learning_rate": 0.0001, + "loss": 4.4623, + "loss/crossentropy": 2.2161877155303955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22318138182163239, + "step": 8156 + }, + { + "epoch": 0.16316, + "grad_norm": 2.078125, + "grad_norm_var": 0.03186747233072917, + "learning_rate": 0.0001, + "loss": 4.7343, + "loss/crossentropy": 2.21865177154541, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.234280064702034, + "step": 8158 + }, + { + "epoch": 0.1632, + "grad_norm": 2.09375, + "grad_norm_var": 0.032835896809895834, + "learning_rate": 0.0001, + "loss": 4.3277, + "loss/crossentropy": 1.9517142176628113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1964063122868538, + "step": 8160 + }, + { + "epoch": 0.16324, + "grad_norm": 2.25, + "grad_norm_var": 0.023713175455729166, + "learning_rate": 0.0001, + "loss": 4.3878, + "loss/crossentropy": 2.261234760284424, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24549759924411774, + "step": 8162 + }, + { + "epoch": 0.16328, + "grad_norm": 2.109375, + "grad_norm_var": 0.012751261393229166, + "learning_rate": 0.0001, + "loss": 4.159, + "loss/crossentropy": 1.9791623950004578, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2018037736415863, + "step": 8164 + }, + { + "epoch": 0.16332, + "grad_norm": 2.1875, + "grad_norm_var": 0.005125935872395833, + "learning_rate": 0.0001, + "loss": 4.6231, + "loss/crossentropy": 2.3916029930114746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24446460604667664, + "step": 8166 + }, + { + "epoch": 0.16336, + "grad_norm": 2.328125, + "grad_norm_var": 0.007991536458333334, + "learning_rate": 0.0001, + "loss": 4.3251, + "loss/crossentropy": 2.204437553882599, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22334939241409302, + "step": 8168 + }, + { + "epoch": 0.1634, + "grad_norm": 2.0625, + "grad_norm_var": 0.007111612955729167, + "learning_rate": 0.0001, + "loss": 4.3236, + "loss/crossentropy": 2.2013272047042847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2068365067243576, + "step": 8170 + }, + { + "epoch": 0.16344, + "grad_norm": 2.03125, + "grad_norm_var": 0.0065826416015625, + "learning_rate": 0.0001, + "loss": 4.2597, + "loss/crossentropy": 1.8648701310157776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19690127670764923, + "step": 8172 + }, + { + "epoch": 0.16348, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008857981363932291, + "learning_rate": 0.0001, + "loss": 4.1324, + "loss/crossentropy": 2.221195936203003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1900760903954506, + "step": 8174 + }, + { + "epoch": 0.16352, + "grad_norm": 2.203125, + "grad_norm_var": 0.009445953369140624, + "learning_rate": 0.0001, + "loss": 4.4874, + "loss/crossentropy": 1.8648499846458435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1982845515012741, + "step": 8176 + }, + { + "epoch": 0.16356, + "grad_norm": 2.125, + "grad_norm_var": 0.010762278238932292, + "learning_rate": 0.0001, + "loss": 4.588, + "loss/crossentropy": 2.3032894134521484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25082532316446304, + "step": 8178 + }, + { + "epoch": 0.1636, + "grad_norm": 2.203125, + "grad_norm_var": 0.010931142171223958, + "learning_rate": 0.0001, + "loss": 4.3375, + "loss/crossentropy": 2.0634626150131226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23546822369098663, + "step": 8180 + }, + { + "epoch": 0.16364, + "grad_norm": 2.28125, + "grad_norm_var": 0.012737782796223958, + "learning_rate": 0.0001, + "loss": 4.2402, + "loss/crossentropy": 1.7406468391418457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20627902448177338, + "step": 8182 + }, + { + "epoch": 0.16368, + "grad_norm": 2.125, + "grad_norm_var": 0.012668609619140625, + "learning_rate": 0.0001, + "loss": 4.4103, + "loss/crossentropy": 2.3812272548675537, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25258868932724, + "step": 8184 + }, + { + "epoch": 0.16372, + "grad_norm": 2.1875, + "grad_norm_var": 0.012499745686848958, + "learning_rate": 0.0001, + "loss": 4.3584, + "loss/crossentropy": 2.20754611492157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21015550196170807, + "step": 8186 + }, + { + "epoch": 0.16376, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0140777587890625, + "learning_rate": 0.0001, + "loss": 4.1902, + "loss/crossentropy": 2.081672966480255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19573034346103668, + "step": 8188 + }, + { + "epoch": 0.1638, + "grad_norm": 2.140625, + "grad_norm_var": 0.009905751546223958, + "learning_rate": 0.0001, + "loss": 4.5267, + "loss/crossentropy": 2.184974491596222, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2477174997329712, + "step": 8190 + }, + { + "epoch": 0.16384, + "grad_norm": 2.125, + "grad_norm_var": 0.010322825113932291, + "learning_rate": 0.0001, + "loss": 4.3804, + "loss/crossentropy": 2.1048192977905273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22900478541851044, + "step": 8192 + }, + { + "epoch": 0.16388, + "grad_norm": 2.046875, + "grad_norm_var": 0.010135650634765625, + "learning_rate": 0.0001, + "loss": 4.1068, + "loss/crossentropy": 1.960956335067749, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23504969477653503, + "step": 8194 + }, + { + "epoch": 0.16392, + "grad_norm": 2.484375, + "grad_norm_var": 0.017114003499348957, + "learning_rate": 0.0001, + "loss": 4.7698, + "loss/crossentropy": 2.158856213092804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2200375646352768, + "step": 8196 + }, + { + "epoch": 0.16396, + "grad_norm": 2.015625, + "grad_norm_var": 0.017286936442057293, + "learning_rate": 0.0001, + "loss": 4.0533, + "loss/crossentropy": 2.0954058170318604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22729048877954483, + "step": 8198 + }, + { + "epoch": 0.164, + "grad_norm": 2.078125, + "grad_norm_var": 0.01599299112955729, + "learning_rate": 0.0001, + "loss": 4.3492, + "loss/crossentropy": 2.1452964544296265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21412881463766098, + "step": 8200 + }, + { + "epoch": 0.16404, + "grad_norm": 1.8359375, + "grad_norm_var": 0.02072321573893229, + "learning_rate": 0.0001, + "loss": 4.0204, + "loss/crossentropy": 1.9737866520881653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19449186325073242, + "step": 8202 + }, + { + "epoch": 0.16408, + "grad_norm": 2.109375, + "grad_norm_var": 0.020344034830729166, + "learning_rate": 0.0001, + "loss": 4.5119, + "loss/crossentropy": 2.213072657585144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2333887815475464, + "step": 8204 + }, + { + "epoch": 0.16412, + "grad_norm": 2.09375, + "grad_norm_var": 0.0203277587890625, + "learning_rate": 0.0001, + "loss": 4.2139, + "loss/crossentropy": 1.8702161312103271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21368274092674255, + "step": 8206 + }, + { + "epoch": 0.16416, + "grad_norm": 1.9765625, + "grad_norm_var": 0.02102635701497396, + "learning_rate": 0.0001, + "loss": 4.242, + "loss/crossentropy": 2.177275776863098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21248809248209, + "step": 8208 + }, + { + "epoch": 0.1642, + "grad_norm": 1.9375, + "grad_norm_var": 0.021945953369140625, + "learning_rate": 0.0001, + "loss": 4.3151, + "loss/crossentropy": 2.3422038555145264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2288341298699379, + "step": 8210 + }, + { + "epoch": 0.16424, + "grad_norm": 2.265625, + "grad_norm_var": 0.014212799072265626, + "learning_rate": 0.0001, + "loss": 4.2195, + "loss/crossentropy": 2.094432234764099, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2224871665239334, + "step": 8212 + }, + { + "epoch": 0.16428, + "grad_norm": 2.125, + "grad_norm_var": 0.020072174072265626, + "learning_rate": 0.0001, + "loss": 4.5045, + "loss/crossentropy": 2.3371682167053223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2710718363523483, + "step": 8214 + }, + { + "epoch": 0.16432, + "grad_norm": 2.25, + "grad_norm_var": 0.034242502848307294, + "learning_rate": 0.0001, + "loss": 4.6926, + "loss/crossentropy": 1.8700988292694092, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20795634388923645, + "step": 8216 + }, + { + "epoch": 0.16436, + "grad_norm": 2.046875, + "grad_norm_var": 0.02575658162434896, + "learning_rate": 0.0001, + "loss": 4.3266, + "loss/crossentropy": 2.080985188484192, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23497651517391205, + "step": 8218 + }, + { + "epoch": 0.1644, + "grad_norm": 2.046875, + "grad_norm_var": 0.026364898681640624, + "learning_rate": 0.0001, + "loss": 4.3873, + "loss/crossentropy": 2.3486984968185425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25393396615982056, + "step": 8220 + }, + { + "epoch": 0.16444, + "grad_norm": 2.15625, + "grad_norm_var": 0.028148396809895834, + "learning_rate": 0.0001, + "loss": 4.3446, + "loss/crossentropy": 2.0084245800971985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21738936007022858, + "step": 8222 + }, + { + "epoch": 0.16448, + "grad_norm": 2.03125, + "grad_norm_var": 0.02846247355143229, + "learning_rate": 0.0001, + "loss": 3.8584, + "loss/crossentropy": 1.536482572555542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17787320166826248, + "step": 8224 + }, + { + "epoch": 0.16452, + "grad_norm": 2.1875, + "grad_norm_var": 0.024857330322265624, + "learning_rate": 0.0001, + "loss": 4.494, + "loss/crossentropy": 2.1727080941200256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22967493534088135, + "step": 8226 + }, + { + "epoch": 0.16456, + "grad_norm": 2.015625, + "grad_norm_var": 0.02490208943684896, + "learning_rate": 0.0001, + "loss": 4.1895, + "loss/crossentropy": 1.6915069222450256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19113170355558395, + "step": 8228 + }, + { + "epoch": 0.1646, + "grad_norm": 2.046875, + "grad_norm_var": 0.022739410400390625, + "learning_rate": 0.0001, + "loss": 4.4378, + "loss/crossentropy": 2.1645957231521606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21431444585323334, + "step": 8230 + }, + { + "epoch": 0.16464, + "grad_norm": 2.09375, + "grad_norm_var": 0.00858154296875, + "learning_rate": 0.0001, + "loss": 3.8088, + "loss/crossentropy": 1.8797736763954163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19344569742679596, + "step": 8232 + }, + { + "epoch": 0.16468, + "grad_norm": 2.1875, + "grad_norm_var": 0.0148345947265625, + "learning_rate": 0.0001, + "loss": 4.3119, + "loss/crossentropy": 2.092893421649933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21476523578166962, + "step": 8234 + }, + { + "epoch": 0.16472, + "grad_norm": 2.125, + "grad_norm_var": 0.0143310546875, + "learning_rate": 0.0001, + "loss": 4.4145, + "loss/crossentropy": 2.336071252822876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25474119186401367, + "step": 8236 + }, + { + "epoch": 0.16476, + "grad_norm": 2.03125, + "grad_norm_var": 0.014410146077473958, + "learning_rate": 0.0001, + "loss": 4.3054, + "loss/crossentropy": 2.008872926235199, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22672003507614136, + "step": 8238 + }, + { + "epoch": 0.1648, + "grad_norm": 2.171875, + "grad_norm_var": 0.011822255452473958, + "learning_rate": 0.0001, + "loss": 4.4997, + "loss/crossentropy": 2.2229605317115784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2392737865447998, + "step": 8240 + }, + { + "epoch": 0.16484, + "grad_norm": 2.171875, + "grad_norm_var": 0.011525217692057292, + "learning_rate": 0.0001, + "loss": 4.5894, + "loss/crossentropy": 2.1929808855056763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22832375764846802, + "step": 8242 + }, + { + "epoch": 0.16488, + "grad_norm": 2.171875, + "grad_norm_var": 0.010625966389973958, + "learning_rate": 0.0001, + "loss": 4.642, + "loss/crossentropy": 2.1557860374450684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24640469253063202, + "step": 8244 + }, + { + "epoch": 0.16492, + "grad_norm": 2.109375, + "grad_norm_var": 0.009582265218098959, + "learning_rate": 0.0001, + "loss": 4.2508, + "loss/crossentropy": 2.2462236881256104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23639583587646484, + "step": 8246 + }, + { + "epoch": 0.16496, + "grad_norm": 2.03125, + "grad_norm_var": 0.009501139322916666, + "learning_rate": 0.0001, + "loss": 4.1476, + "loss/crossentropy": 2.1238350868225098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23066531121730804, + "step": 8248 + }, + { + "epoch": 0.165, + "grad_norm": 2.078125, + "grad_norm_var": 0.0049479166666666664, + "learning_rate": 0.0001, + "loss": 4.1666, + "loss/crossentropy": 1.6866248846054077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20677632093429565, + "step": 8250 + }, + { + "epoch": 0.16504, + "grad_norm": 2.03125, + "grad_norm_var": 0.005338541666666667, + "learning_rate": 0.0001, + "loss": 4.1494, + "loss/crossentropy": 2.0640709400177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2148914858698845, + "step": 8252 + }, + { + "epoch": 0.16508, + "grad_norm": 2.234375, + "grad_norm_var": 0.00513916015625, + "learning_rate": 0.0001, + "loss": 4.4138, + "loss/crossentropy": 2.003119468688965, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21817681193351746, + "step": 8254 + }, + { + "epoch": 0.16512, + "grad_norm": 2.328125, + "grad_norm_var": 0.007763671875, + "learning_rate": 0.0001, + "loss": 4.276, + "loss/crossentropy": 2.053581953048706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22257455438375473, + "step": 8256 + }, + { + "epoch": 0.16516, + "grad_norm": 2.046875, + "grad_norm_var": 0.008381144205729166, + "learning_rate": 0.0001, + "loss": 4.1314, + "loss/crossentropy": 1.788454830646515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21449437737464905, + "step": 8258 + }, + { + "epoch": 0.1652, + "grad_norm": 2.265625, + "grad_norm_var": 0.009032185872395833, + "learning_rate": 0.0001, + "loss": 4.3986, + "loss/crossentropy": 1.8791787028312683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1877290904521942, + "step": 8260 + }, + { + "epoch": 0.16524, + "grad_norm": 2.140625, + "grad_norm_var": 0.009761555989583334, + "learning_rate": 0.0001, + "loss": 4.4326, + "loss/crossentropy": 2.2346811294555664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.221164733171463, + "step": 8262 + }, + { + "epoch": 0.16528, + "grad_norm": 2.03125, + "grad_norm_var": 0.009227498372395834, + "learning_rate": 0.0001, + "loss": 4.3296, + "loss/crossentropy": 2.1032413244247437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21829386800527573, + "step": 8264 + }, + { + "epoch": 0.16532, + "grad_norm": 2.09375, + "grad_norm_var": 0.008690388997395833, + "learning_rate": 0.0001, + "loss": 4.3188, + "loss/crossentropy": 2.1452749967575073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24733971804380417, + "step": 8266 + }, + { + "epoch": 0.16536, + "grad_norm": 2.171875, + "grad_norm_var": 0.008373006184895834, + "learning_rate": 0.0001, + "loss": 4.2093, + "loss/crossentropy": 1.8318313956260681, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22139593213796616, + "step": 8268 + }, + { + "epoch": 0.1654, + "grad_norm": 2.09375, + "grad_norm_var": 0.0074045817057291664, + "learning_rate": 0.0001, + "loss": 4.3494, + "loss/crossentropy": 1.9865980744361877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22354473173618317, + "step": 8270 + }, + { + "epoch": 0.16544, + "grad_norm": 2.21875, + "grad_norm_var": 0.005464680989583333, + "learning_rate": 0.0001, + "loss": 4.318, + "loss/crossentropy": 2.140509843826294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22832269966602325, + "step": 8272 + }, + { + "epoch": 0.16548, + "grad_norm": 2.15625, + "grad_norm_var": 0.005052693684895833, + "learning_rate": 0.0001, + "loss": 4.1632, + "loss/crossentropy": 2.3763319849967957, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22866757214069366, + "step": 8274 + }, + { + "epoch": 0.16552, + "grad_norm": 2.03125, + "grad_norm_var": 0.004198201497395833, + "learning_rate": 0.0001, + "loss": 4.2366, + "loss/crossentropy": 1.785762071609497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19961480796337128, + "step": 8276 + }, + { + "epoch": 0.16556, + "grad_norm": 7.34375, + "grad_norm_var": 1.7181477864583334, + "learning_rate": 0.0001, + "loss": 4.5317, + "loss/crossentropy": 2.0500977635383606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2972872704267502, + "step": 8278 + }, + { + "epoch": 0.1656, + "grad_norm": 2.265625, + "grad_norm_var": 1.6987589518229167, + "learning_rate": 0.0001, + "loss": 4.4247, + "loss/crossentropy": 2.3665153980255127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24115745723247528, + "step": 8280 + }, + { + "epoch": 0.16564, + "grad_norm": 2.25, + "grad_norm_var": 1.6973592122395833, + "learning_rate": 0.0001, + "loss": 4.326, + "loss/crossentropy": 1.9636226892471313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2048879787325859, + "step": 8282 + }, + { + "epoch": 0.16568, + "grad_norm": 2.171875, + "grad_norm_var": 1.6948720296223958, + "learning_rate": 0.0001, + "loss": 4.4303, + "loss/crossentropy": 2.2624993324279785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22186043858528137, + "step": 8284 + }, + { + "epoch": 0.16572, + "grad_norm": 2.046875, + "grad_norm_var": 1.7048886617024739, + "learning_rate": 0.0001, + "loss": 4.0816, + "loss/crossentropy": 2.098397970199585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.219336099922657, + "step": 8286 + }, + { + "epoch": 0.16576, + "grad_norm": 2.140625, + "grad_norm_var": 1.6997393290201823, + "learning_rate": 0.0001, + "loss": 4.4705, + "loss/crossentropy": 1.9795190691947937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2145320549607277, + "step": 8288 + }, + { + "epoch": 0.1658, + "grad_norm": 2.203125, + "grad_norm_var": 1.6936927795410157, + "learning_rate": 0.0001, + "loss": 4.4625, + "loss/crossentropy": 2.1028788089752197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22656689584255219, + "step": 8290 + }, + { + "epoch": 0.16584, + "grad_norm": 1.984375, + "grad_norm_var": 1.7015398661295573, + "learning_rate": 0.0001, + "loss": 4.1901, + "loss/crossentropy": 2.2936136722564697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23688847571611404, + "step": 8292 + }, + { + "epoch": 0.16588, + "grad_norm": 2.234375, + "grad_norm_var": 0.01862360636393229, + "learning_rate": 0.0001, + "loss": 4.1665, + "loss/crossentropy": 1.870754897594452, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20914533734321594, + "step": 8294 + }, + { + "epoch": 0.16592, + "grad_norm": 2.109375, + "grad_norm_var": 0.007999674479166666, + "learning_rate": 0.0001, + "loss": 3.9884, + "loss/crossentropy": 2.034530520439148, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22234197705984116, + "step": 8296 + }, + { + "epoch": 0.16596, + "grad_norm": 2.203125, + "grad_norm_var": 0.0073811848958333336, + "learning_rate": 0.0001, + "loss": 4.3674, + "loss/crossentropy": 1.974400520324707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20479100942611694, + "step": 8298 + }, + { + "epoch": 0.166, + "grad_norm": 2.21875, + "grad_norm_var": 0.0078765869140625, + "learning_rate": 0.0001, + "loss": 4.3157, + "loss/crossentropy": 2.180828809738159, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2229994386434555, + "step": 8300 + }, + { + "epoch": 0.16604, + "grad_norm": 2.078125, + "grad_norm_var": 0.007624308268229167, + "learning_rate": 0.0001, + "loss": 4.0704, + "loss/crossentropy": 2.098487079143524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19709115475416183, + "step": 8302 + }, + { + "epoch": 0.16608, + "grad_norm": 2.0625, + "grad_norm_var": 0.007673136393229167, + "learning_rate": 0.0001, + "loss": 4.2292, + "loss/crossentropy": 1.999358892440796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20891200006008148, + "step": 8304 + }, + { + "epoch": 0.16612, + "grad_norm": 2.1875, + "grad_norm_var": 0.009022776285807292, + "learning_rate": 0.0001, + "loss": 4.4263, + "loss/crossentropy": 2.40866219997406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22355159372091293, + "step": 8306 + }, + { + "epoch": 0.16616, + "grad_norm": 2.171875, + "grad_norm_var": 0.008156077067057291, + "learning_rate": 0.0001, + "loss": 4.0532, + "loss/crossentropy": 1.774325966835022, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2058887928724289, + "step": 8308 + }, + { + "epoch": 0.1662, + "grad_norm": 2.125, + "grad_norm_var": 0.007352447509765625, + "learning_rate": 0.0001, + "loss": 4.4951, + "loss/crossentropy": 1.9870773553848267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21368569880723953, + "step": 8310 + }, + { + "epoch": 0.16624, + "grad_norm": 2.40625, + "grad_norm_var": 0.01121826171875, + "learning_rate": 0.0001, + "loss": 4.3496, + "loss/crossentropy": 1.9224175810813904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2279355376958847, + "step": 8312 + }, + { + "epoch": 0.16628, + "grad_norm": 2.171875, + "grad_norm_var": 0.012230428059895833, + "learning_rate": 0.0001, + "loss": 4.4074, + "loss/crossentropy": 2.011419177055359, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2363004833459854, + "step": 8314 + }, + { + "epoch": 0.16632, + "grad_norm": 2.328125, + "grad_norm_var": 0.01444091796875, + "learning_rate": 0.0001, + "loss": 4.6627, + "loss/crossentropy": 1.9777795672416687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21830055862665176, + "step": 8316 + }, + { + "epoch": 0.16636, + "grad_norm": 2.046875, + "grad_norm_var": 0.013508860270182292, + "learning_rate": 0.0001, + "loss": 4.3798, + "loss/crossentropy": 2.3334981203079224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21089734882116318, + "step": 8318 + }, + { + "epoch": 0.1664, + "grad_norm": 2.28125, + "grad_norm_var": 0.015148671468098958, + "learning_rate": 0.0001, + "loss": 4.3792, + "loss/crossentropy": 1.9137234687805176, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21954041719436646, + "step": 8320 + }, + { + "epoch": 0.16644, + "grad_norm": 2.15625, + "grad_norm_var": 0.013492838541666666, + "learning_rate": 0.0001, + "loss": 4.3826, + "loss/crossentropy": 2.1358155608177185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21321023255586624, + "step": 8322 + }, + { + "epoch": 0.16648, + "grad_norm": 2.203125, + "grad_norm_var": 0.013688151041666667, + "learning_rate": 0.0001, + "loss": 4.3454, + "loss/crossentropy": 2.1747822165489197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23682481050491333, + "step": 8324 + }, + { + "epoch": 0.16652, + "grad_norm": 2.078125, + "grad_norm_var": 0.013505045572916667, + "learning_rate": 0.0001, + "loss": 4.2685, + "loss/crossentropy": 2.2818257808685303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24226247519254684, + "step": 8326 + }, + { + "epoch": 0.16656, + "grad_norm": 2.1875, + "grad_norm_var": 0.010155232747395833, + "learning_rate": 0.0001, + "loss": 4.4806, + "loss/crossentropy": 1.9066791534423828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20610930025577545, + "step": 8328 + }, + { + "epoch": 0.1666, + "grad_norm": 2.078125, + "grad_norm_var": 0.008284505208333333, + "learning_rate": 0.0001, + "loss": 4.4799, + "loss/crossentropy": 2.2431830763816833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22407780587673187, + "step": 8330 + }, + { + "epoch": 0.16664, + "grad_norm": 2.125, + "grad_norm_var": 0.0067708333333333336, + "learning_rate": 0.0001, + "loss": 4.5894, + "loss/crossentropy": 2.569726347923279, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2647472620010376, + "step": 8332 + }, + { + "epoch": 0.16668, + "grad_norm": 2.140625, + "grad_norm_var": 0.006371053059895834, + "learning_rate": 0.0001, + "loss": 4.2307, + "loss/crossentropy": 1.8364137411117554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2113686427474022, + "step": 8334 + }, + { + "epoch": 0.16672, + "grad_norm": 2.078125, + "grad_norm_var": 0.007968902587890625, + "learning_rate": 0.0001, + "loss": 4.0369, + "loss/crossentropy": 2.0510441064834595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2268069088459015, + "step": 8336 + }, + { + "epoch": 0.16676, + "grad_norm": 2.046875, + "grad_norm_var": 0.007968902587890625, + "learning_rate": 0.0001, + "loss": 4.2528, + "loss/crossentropy": 1.7306728959083557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20341812074184418, + "step": 8338 + }, + { + "epoch": 0.1668, + "grad_norm": 2.578125, + "grad_norm_var": 0.023361968994140624, + "learning_rate": 0.0001, + "loss": 4.8146, + "loss/crossentropy": 2.398088574409485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2426001876592636, + "step": 8340 + }, + { + "epoch": 0.16684, + "grad_norm": 2.109375, + "grad_norm_var": 0.023128000895182292, + "learning_rate": 0.0001, + "loss": 4.0659, + "loss/crossentropy": 2.1259487867355347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22666773200035095, + "step": 8342 + }, + { + "epoch": 0.16688, + "grad_norm": 2.109375, + "grad_norm_var": 0.022141265869140624, + "learning_rate": 0.0001, + "loss": 4.2857, + "loss/crossentropy": 1.9744665026664734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2887374758720398, + "step": 8344 + }, + { + "epoch": 0.16692, + "grad_norm": 2.09375, + "grad_norm_var": 0.02316869099934896, + "learning_rate": 0.0001, + "loss": 4.1989, + "loss/crossentropy": 1.9841225743293762, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2008478343486786, + "step": 8346 + }, + { + "epoch": 0.16696, + "grad_norm": 2.125, + "grad_norm_var": 0.1913469950358073, + "learning_rate": 0.0001, + "loss": 4.5208, + "loss/crossentropy": 2.060486137866974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23954987525939941, + "step": 8348 + }, + { + "epoch": 0.167, + "grad_norm": 2.03125, + "grad_norm_var": 0.19230931599934895, + "learning_rate": 0.0001, + "loss": 4.2185, + "loss/crossentropy": 1.9356245398521423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1987425833940506, + "step": 8350 + }, + { + "epoch": 0.16704, + "grad_norm": 2.203125, + "grad_norm_var": 0.1835845947265625, + "learning_rate": 0.0001, + "loss": 4.1875, + "loss/crossentropy": 1.9968677163124084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21396416425704956, + "step": 8352 + }, + { + "epoch": 0.16708, + "grad_norm": 2.109375, + "grad_norm_var": 0.1820465087890625, + "learning_rate": 0.0001, + "loss": 4.1696, + "loss/crossentropy": 2.1678614616394043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22860293090343475, + "step": 8354 + }, + { + "epoch": 0.16712, + "grad_norm": 2.078125, + "grad_norm_var": 0.17534891764322916, + "learning_rate": 0.0001, + "loss": 4.1761, + "loss/crossentropy": 1.7752264142036438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20555391907691956, + "step": 8356 + }, + { + "epoch": 0.16716, + "grad_norm": 2.046875, + "grad_norm_var": 0.1767578125, + "learning_rate": 0.0001, + "loss": 4.2391, + "loss/crossentropy": 1.7073925137519836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19325412809848785, + "step": 8358 + }, + { + "epoch": 0.1672, + "grad_norm": 2.4375, + "grad_norm_var": 0.17827860514322916, + "learning_rate": 0.0001, + "loss": 4.7185, + "loss/crossentropy": 2.2200660705566406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21834757924079895, + "step": 8360 + }, + { + "epoch": 0.16724, + "grad_norm": 2.03125, + "grad_norm_var": 0.17594401041666666, + "learning_rate": 0.0001, + "loss": 4.3551, + "loss/crossentropy": 2.3598183393478394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25592371821403503, + "step": 8362 + }, + { + "epoch": 0.16728, + "grad_norm": 2.03125, + "grad_norm_var": 0.014644368489583334, + "learning_rate": 0.0001, + "loss": 4.3429, + "loss/crossentropy": 2.342926025390625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2480776235461235, + "step": 8364 + }, + { + "epoch": 0.16732, + "grad_norm": 2.046875, + "grad_norm_var": 0.014411417643229167, + "learning_rate": 0.0001, + "loss": 4.2802, + "loss/crossentropy": 1.8505961894989014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21376194059848785, + "step": 8366 + }, + { + "epoch": 0.16736, + "grad_norm": 2.078125, + "grad_norm_var": 0.0146636962890625, + "learning_rate": 0.0001, + "loss": 4.4199, + "loss/crossentropy": 2.2064108848571777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21334534883499146, + "step": 8368 + }, + { + "epoch": 0.1674, + "grad_norm": 2.6875, + "grad_norm_var": 0.0338043212890625, + "learning_rate": 0.0001, + "loss": 4.5344, + "loss/crossentropy": 2.1280709505081177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24501197040081024, + "step": 8370 + }, + { + "epoch": 0.16744, + "grad_norm": 2.203125, + "grad_norm_var": 0.03312886555989583, + "learning_rate": 0.0001, + "loss": 4.3382, + "loss/crossentropy": 2.2134695053100586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2368277609348297, + "step": 8372 + }, + { + "epoch": 0.16748, + "grad_norm": 2.03125, + "grad_norm_var": 0.03319905598958333, + "learning_rate": 0.0001, + "loss": 4.1197, + "loss/crossentropy": 1.6942040920257568, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21226602047681808, + "step": 8374 + }, + { + "epoch": 0.16752, + "grad_norm": 2.09375, + "grad_norm_var": 0.025755818684895834, + "learning_rate": 0.0001, + "loss": 4.5004, + "loss/crossentropy": 2.1554355025291443, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2655039578676224, + "step": 8376 + }, + { + "epoch": 0.16756, + "grad_norm": 2.078125, + "grad_norm_var": 0.024800618489583332, + "learning_rate": 0.0001, + "loss": 4.2446, + "loss/crossentropy": 1.8107115030288696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1756090670824051, + "step": 8378 + }, + { + "epoch": 0.1676, + "grad_norm": 1.921875, + "grad_norm_var": 0.026496378580729167, + "learning_rate": 0.0001, + "loss": 4.0745, + "loss/crossentropy": 1.655519425868988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1760415881872177, + "step": 8380 + }, + { + "epoch": 0.16764, + "grad_norm": 2.140625, + "grad_norm_var": 0.026460774739583335, + "learning_rate": 0.0001, + "loss": 4.2369, + "loss/crossentropy": 2.1618025302886963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21886380016803741, + "step": 8382 + }, + { + "epoch": 0.16768, + "grad_norm": 2.109375, + "grad_norm_var": 0.028270467122395834, + "learning_rate": 0.0001, + "loss": 4.0798, + "loss/crossentropy": 2.043319880962372, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21368451416492462, + "step": 8384 + }, + { + "epoch": 0.16772, + "grad_norm": 2.1875, + "grad_norm_var": 0.010724894205729167, + "learning_rate": 0.0001, + "loss": 4.4035, + "loss/crossentropy": 2.3043102025985718, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25452760607004166, + "step": 8386 + }, + { + "epoch": 0.16776, + "grad_norm": 2.21875, + "grad_norm_var": 0.011546834309895834, + "learning_rate": 0.0001, + "loss": 4.5633, + "loss/crossentropy": 2.3627192974090576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2438303530216217, + "step": 8388 + }, + { + "epoch": 0.1678, + "grad_norm": 2.109375, + "grad_norm_var": 0.011335245768229167, + "learning_rate": 0.0001, + "loss": 4.4339, + "loss/crossentropy": 2.0960012674331665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20813053101301193, + "step": 8390 + }, + { + "epoch": 0.16784, + "grad_norm": 1.921875, + "grad_norm_var": 0.013923136393229167, + "learning_rate": 0.0001, + "loss": 4.0401, + "loss/crossentropy": 1.9132550358772278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19716795533895493, + "step": 8392 + }, + { + "epoch": 0.16788, + "grad_norm": 2.078125, + "grad_norm_var": 0.014742024739583333, + "learning_rate": 0.0001, + "loss": 4.2371, + "loss/crossentropy": 2.1912059783935547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.233934685587883, + "step": 8394 + }, + { + "epoch": 0.16792, + "grad_norm": 2.0625, + "grad_norm_var": 0.012691243489583334, + "learning_rate": 0.0001, + "loss": 4.4501, + "loss/crossentropy": 2.165616512298584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23475389927625656, + "step": 8396 + }, + { + "epoch": 0.16796, + "grad_norm": 2.078125, + "grad_norm_var": 0.01265869140625, + "learning_rate": 0.0001, + "loss": 4.4841, + "loss/crossentropy": 2.209625542163849, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24280209839344025, + "step": 8398 + }, + { + "epoch": 0.168, + "grad_norm": 2.125, + "grad_norm_var": 0.013606516520182292, + "learning_rate": 0.0001, + "loss": 3.8959, + "loss/crossentropy": 1.972103476524353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2072778418660164, + "step": 8400 + }, + { + "epoch": 0.16804, + "grad_norm": 2.171875, + "grad_norm_var": 0.009663645426432292, + "learning_rate": 0.0001, + "loss": 4.4646, + "loss/crossentropy": 2.402593731880188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23028723895549774, + "step": 8402 + }, + { + "epoch": 0.16808, + "grad_norm": 2.125, + "grad_norm_var": 0.008754221598307292, + "learning_rate": 0.0001, + "loss": 4.392, + "loss/crossentropy": 2.218156576156616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24104444682598114, + "step": 8404 + }, + { + "epoch": 0.16812, + "grad_norm": 2.09375, + "grad_norm_var": 0.008722941080729166, + "learning_rate": 0.0001, + "loss": 4.2713, + "loss/crossentropy": 1.9802079796791077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2134247124195099, + "step": 8406 + }, + { + "epoch": 0.16816, + "grad_norm": 1.953125, + "grad_norm_var": 0.008577219645182292, + "learning_rate": 0.0001, + "loss": 3.9765, + "loss/crossentropy": 2.0322983264923096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21680974960327148, + "step": 8408 + }, + { + "epoch": 0.1682, + "grad_norm": 2.03125, + "grad_norm_var": 0.008194732666015624, + "learning_rate": 0.0001, + "loss": 4.2457, + "loss/crossentropy": 2.11838436126709, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2319432571530342, + "step": 8410 + }, + { + "epoch": 0.16824, + "grad_norm": 2.125, + "grad_norm_var": 0.013398996988932292, + "learning_rate": 0.0001, + "loss": 4.2204, + "loss/crossentropy": 2.3801279067993164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23981253802776337, + "step": 8412 + }, + { + "epoch": 0.16828, + "grad_norm": 2.078125, + "grad_norm_var": 0.013042958577473958, + "learning_rate": 0.0001, + "loss": 4.2766, + "loss/crossentropy": 2.0953307151794434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22835668921470642, + "step": 8414 + }, + { + "epoch": 0.16832, + "grad_norm": 2.15625, + "grad_norm_var": 0.011774698893229166, + "learning_rate": 0.0001, + "loss": 4.528, + "loss/crossentropy": 2.0654338598251343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2100541964173317, + "step": 8416 + }, + { + "epoch": 0.16836, + "grad_norm": 2.0625, + "grad_norm_var": 0.018184407552083334, + "learning_rate": 0.0001, + "loss": 4.5734, + "loss/crossentropy": 2.383318305015564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.246080219745636, + "step": 8418 + }, + { + "epoch": 0.1684, + "grad_norm": 2.078125, + "grad_norm_var": 0.018318684895833333, + "learning_rate": 0.0001, + "loss": 4.2764, + "loss/crossentropy": 2.103544294834137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20659767091274261, + "step": 8420 + }, + { + "epoch": 0.16844, + "grad_norm": 2.09375, + "grad_norm_var": 0.017437489827473958, + "learning_rate": 0.0001, + "loss": 4.2737, + "loss/crossentropy": 2.064394950866699, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23458171635866165, + "step": 8422 + }, + { + "epoch": 0.16848, + "grad_norm": 2.25, + "grad_norm_var": 0.014872233072916666, + "learning_rate": 0.0001, + "loss": 4.4133, + "loss/crossentropy": 2.087044835090637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23527196049690247, + "step": 8424 + }, + { + "epoch": 0.16852, + "grad_norm": 2.15625, + "grad_norm_var": 0.0133941650390625, + "learning_rate": 0.0001, + "loss": 4.5867, + "loss/crossentropy": 2.41584312915802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2425323873758316, + "step": 8426 + }, + { + "epoch": 0.16856, + "grad_norm": 2.09375, + "grad_norm_var": 0.010400390625, + "learning_rate": 0.0001, + "loss": 4.3292, + "loss/crossentropy": 2.2542352080345154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2330578714609146, + "step": 8428 + }, + { + "epoch": 0.1686, + "grad_norm": 2.09375, + "grad_norm_var": 0.010724894205729167, + "learning_rate": 0.0001, + "loss": 4.6628, + "loss/crossentropy": 2.453263282775879, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.280864879488945, + "step": 8430 + }, + { + "epoch": 0.16864, + "grad_norm": 2.140625, + "grad_norm_var": 0.01197509765625, + "learning_rate": 0.0001, + "loss": 4.0443, + "loss/crossentropy": 1.9185429811477661, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19230744242668152, + "step": 8432 + }, + { + "epoch": 0.16868, + "grad_norm": 2.140625, + "grad_norm_var": 0.005956013997395833, + "learning_rate": 0.0001, + "loss": 4.1683, + "loss/crossentropy": 2.020436644554138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21756108105182648, + "step": 8434 + }, + { + "epoch": 0.16872, + "grad_norm": 2.328125, + "grad_norm_var": 0.007616170247395833, + "learning_rate": 0.0001, + "loss": 4.7727, + "loss/crossentropy": 2.3050636053085327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24950820207595825, + "step": 8436 + }, + { + "epoch": 0.16876, + "grad_norm": 2.0625, + "grad_norm_var": 0.01510009765625, + "learning_rate": 0.0001, + "loss": 4.4091, + "loss/crossentropy": 2.2787232398986816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24267014116048813, + "step": 8438 + }, + { + "epoch": 0.1688, + "grad_norm": 2.078125, + "grad_norm_var": 0.015327962239583333, + "learning_rate": 0.0001, + "loss": 4.2927, + "loss/crossentropy": 2.185176372528076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2157401591539383, + "step": 8440 + }, + { + "epoch": 0.16884, + "grad_norm": 2.265625, + "grad_norm_var": 0.01558837890625, + "learning_rate": 0.0001, + "loss": 4.2087, + "loss/crossentropy": 2.0673694610595703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23145683109760284, + "step": 8442 + }, + { + "epoch": 0.16888, + "grad_norm": 2.171875, + "grad_norm_var": 0.01754150390625, + "learning_rate": 0.0001, + "loss": 4.0229, + "loss/crossentropy": 1.9011740684509277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21629629284143448, + "step": 8444 + }, + { + "epoch": 0.16892, + "grad_norm": 2.34375, + "grad_norm_var": 0.018651326497395832, + "learning_rate": 0.0001, + "loss": 4.4821, + "loss/crossentropy": 2.055977463722229, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2275683432817459, + "step": 8446 + }, + { + "epoch": 0.16896, + "grad_norm": 2.140625, + "grad_norm_var": 0.0171295166015625, + "learning_rate": 0.0001, + "loss": 4.2152, + "loss/crossentropy": 1.752756416797638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19437911361455917, + "step": 8448 + }, + { + "epoch": 0.169, + "grad_norm": 2.09375, + "grad_norm_var": 0.015706380208333332, + "learning_rate": 0.0001, + "loss": 4.3631, + "loss/crossentropy": 2.479012131690979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23593680560588837, + "step": 8450 + }, + { + "epoch": 0.16904, + "grad_norm": 2.1875, + "grad_norm_var": 0.015523274739583334, + "learning_rate": 0.0001, + "loss": 4.3245, + "loss/crossentropy": 2.065472185611725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2148493528366089, + "step": 8452 + }, + { + "epoch": 0.16908, + "grad_norm": 2.078125, + "grad_norm_var": 0.015901692708333335, + "learning_rate": 0.0001, + "loss": 4.3955, + "loss/crossentropy": 2.214062213897705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25444991141557693, + "step": 8454 + }, + { + "epoch": 0.16912, + "grad_norm": 2.296875, + "grad_norm_var": 0.0155426025390625, + "learning_rate": 0.0001, + "loss": 4.2059, + "loss/crossentropy": 1.7410383224487305, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21134207397699356, + "step": 8456 + }, + { + "epoch": 0.16916, + "grad_norm": 2.234375, + "grad_norm_var": 0.020542144775390625, + "learning_rate": 0.0001, + "loss": 4.196, + "loss/crossentropy": 1.9303107857704163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18110749125480652, + "step": 8458 + }, + { + "epoch": 0.1692, + "grad_norm": 2.203125, + "grad_norm_var": 0.01810480753580729, + "learning_rate": 0.0001, + "loss": 4.3916, + "loss/crossentropy": 1.9091919660568237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21320972591638565, + "step": 8460 + }, + { + "epoch": 0.16924, + "grad_norm": 2.0625, + "grad_norm_var": 0.025099436442057293, + "learning_rate": 0.0001, + "loss": 4.4075, + "loss/crossentropy": 1.8021087050437927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21294282376766205, + "step": 8462 + }, + { + "epoch": 0.16928, + "grad_norm": 2.0, + "grad_norm_var": 0.027186838785807292, + "learning_rate": 0.0001, + "loss": 3.97, + "loss/crossentropy": 2.018254518508911, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21781788766384125, + "step": 8464 + }, + { + "epoch": 0.16932, + "grad_norm": 2.0625, + "grad_norm_var": 0.02906061808268229, + "learning_rate": 0.0001, + "loss": 4.5665, + "loss/crossentropy": 2.0593737959861755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24223209917545319, + "step": 8466 + }, + { + "epoch": 0.16936, + "grad_norm": 2.125, + "grad_norm_var": 0.029504140218098957, + "learning_rate": 0.0001, + "loss": 4.3116, + "loss/crossentropy": 2.218974232673645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22679369151592255, + "step": 8468 + }, + { + "epoch": 0.1694, + "grad_norm": 2.078125, + "grad_norm_var": 0.022332509358723957, + "learning_rate": 0.0001, + "loss": 4.3313, + "loss/crossentropy": 2.182355046272278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23690057545900345, + "step": 8470 + }, + { + "epoch": 0.16944, + "grad_norm": 2.375, + "grad_norm_var": 0.024930572509765624, + "learning_rate": 0.0001, + "loss": 4.6241, + "loss/crossentropy": 2.1669063568115234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2458687722682953, + "step": 8472 + }, + { + "epoch": 0.16948, + "grad_norm": 2.171875, + "grad_norm_var": 0.021100870768229165, + "learning_rate": 0.0001, + "loss": 4.3181, + "loss/crossentropy": 2.0656558871269226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20421817898750305, + "step": 8474 + }, + { + "epoch": 0.16952, + "grad_norm": 2.03125, + "grad_norm_var": 0.024625651041666665, + "learning_rate": 0.0001, + "loss": 4.4916, + "loss/crossentropy": 2.1470741033554077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20798998326063156, + "step": 8476 + }, + { + "epoch": 0.16956, + "grad_norm": 2.1875, + "grad_norm_var": 0.014972941080729166, + "learning_rate": 0.0001, + "loss": 4.2616, + "loss/crossentropy": 2.206741452217102, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2327529340982437, + "step": 8478 + }, + { + "epoch": 0.1696, + "grad_norm": 2.171875, + "grad_norm_var": 0.012596638997395833, + "learning_rate": 0.0001, + "loss": 4.3338, + "loss/crossentropy": 2.4421777725219727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2385794073343277, + "step": 8480 + }, + { + "epoch": 0.16964, + "grad_norm": 2.015625, + "grad_norm_var": 0.0115234375, + "learning_rate": 0.0001, + "loss": 3.963, + "loss/crossentropy": 1.8545736074447632, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18871797621250153, + "step": 8482 + }, + { + "epoch": 0.16968, + "grad_norm": 2.078125, + "grad_norm_var": 0.0118072509765625, + "learning_rate": 0.0001, + "loss": 4.1352, + "loss/crossentropy": 1.807646930217743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19625309854745865, + "step": 8484 + }, + { + "epoch": 0.16972, + "grad_norm": 2.1875, + "grad_norm_var": 0.012202962239583334, + "learning_rate": 0.0001, + "loss": 4.5029, + "loss/crossentropy": 1.923595905303955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2233208492398262, + "step": 8486 + }, + { + "epoch": 0.16976, + "grad_norm": 2.0625, + "grad_norm_var": 0.0080718994140625, + "learning_rate": 0.0001, + "loss": 4.3251, + "loss/crossentropy": 2.1018277406692505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2556355446577072, + "step": 8488 + }, + { + "epoch": 0.1698, + "grad_norm": 1.90625, + "grad_norm_var": 0.01099853515625, + "learning_rate": 0.0001, + "loss": 4.1127, + "loss/crossentropy": 1.9638542532920837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2101784572005272, + "step": 8490 + }, + { + "epoch": 0.16984, + "grad_norm": 2.109375, + "grad_norm_var": 0.008703358968098958, + "learning_rate": 0.0001, + "loss": 3.9463, + "loss/crossentropy": 1.7770507335662842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2004244327545166, + "step": 8492 + }, + { + "epoch": 0.16988, + "grad_norm": 2.171875, + "grad_norm_var": 0.008573150634765625, + "learning_rate": 0.0001, + "loss": 4.3901, + "loss/crossentropy": 2.4064877033233643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23509501665830612, + "step": 8494 + }, + { + "epoch": 0.16992, + "grad_norm": 2.0, + "grad_norm_var": 0.008713531494140624, + "learning_rate": 0.0001, + "loss": 4.3197, + "loss/crossentropy": 2.183193802833557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22985967248678207, + "step": 8496 + }, + { + "epoch": 0.16996, + "grad_norm": 2.203125, + "grad_norm_var": 0.009126536051432292, + "learning_rate": 0.0001, + "loss": 4.4983, + "loss/crossentropy": 2.241714060306549, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22633583843708038, + "step": 8498 + }, + { + "epoch": 0.17, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0097808837890625, + "learning_rate": 0.0001, + "loss": 4.32, + "loss/crossentropy": 2.212075114250183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20900271832942963, + "step": 8500 + }, + { + "epoch": 0.17004, + "grad_norm": 2.09375, + "grad_norm_var": 0.0071685791015625, + "learning_rate": 0.0001, + "loss": 4.1885, + "loss/crossentropy": 2.1283940076828003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21419794112443924, + "step": 8502 + }, + { + "epoch": 0.17008, + "grad_norm": 2.15625, + "grad_norm_var": 0.00731201171875, + "learning_rate": 0.0001, + "loss": 4.3498, + "loss/crossentropy": 2.1958925127983093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23375380039215088, + "step": 8504 + }, + { + "epoch": 0.17012, + "grad_norm": 2.171875, + "grad_norm_var": 0.0065673828125, + "learning_rate": 0.0001, + "loss": 4.3396, + "loss/crossentropy": 1.7883376479148865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2097182646393776, + "step": 8506 + }, + { + "epoch": 0.17016, + "grad_norm": 2.296875, + "grad_norm_var": 0.006811269124348958, + "learning_rate": 0.0001, + "loss": 4.5606, + "loss/crossentropy": 2.114001750946045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2059268057346344, + "step": 8508 + }, + { + "epoch": 0.1702, + "grad_norm": 2.15625, + "grad_norm_var": 0.006929270426432292, + "learning_rate": 0.0001, + "loss": 4.3178, + "loss/crossentropy": 2.2413129806518555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2517316862940788, + "step": 8510 + }, + { + "epoch": 0.17024, + "grad_norm": 1.921875, + "grad_norm_var": 0.015083567301432291, + "learning_rate": 0.0001, + "loss": 4.4202, + "loss/crossentropy": 2.0968031883239746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2454070746898651, + "step": 8512 + }, + { + "epoch": 0.17028, + "grad_norm": 1.875, + "grad_norm_var": 0.01945978800455729, + "learning_rate": 0.0001, + "loss": 4.407, + "loss/crossentropy": 1.9813454151153564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21238256990909576, + "step": 8514 + }, + { + "epoch": 0.17032, + "grad_norm": 2.28125, + "grad_norm_var": 0.019755045572916668, + "learning_rate": 0.0001, + "loss": 4.4724, + "loss/crossentropy": 2.261451005935669, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2393329069018364, + "step": 8516 + }, + { + "epoch": 0.17036, + "grad_norm": 2.171875, + "grad_norm_var": 0.020654296875, + "learning_rate": 0.0001, + "loss": 4.5377, + "loss/crossentropy": 1.9661999344825745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2088441476225853, + "step": 8518 + }, + { + "epoch": 0.1704, + "grad_norm": 2.046875, + "grad_norm_var": 0.022163899739583333, + "learning_rate": 0.0001, + "loss": 3.9288, + "loss/crossentropy": 1.8275291323661804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20091407746076584, + "step": 8520 + }, + { + "epoch": 0.17044, + "grad_norm": 2.046875, + "grad_norm_var": 0.023346964518229166, + "learning_rate": 0.0001, + "loss": 4.3154, + "loss/crossentropy": 2.266944646835327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2514675632119179, + "step": 8522 + }, + { + "epoch": 0.17048, + "grad_norm": 2.28125, + "grad_norm_var": 0.02271728515625, + "learning_rate": 0.0001, + "loss": 4.3571, + "loss/crossentropy": 2.244120240211487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25534868240356445, + "step": 8524 + }, + { + "epoch": 0.17052, + "grad_norm": 2.015625, + "grad_norm_var": 0.025048828125, + "learning_rate": 0.0001, + "loss": 4.3037, + "loss/crossentropy": 1.8628552556037903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21675898134708405, + "step": 8526 + }, + { + "epoch": 0.17056, + "grad_norm": 2.109375, + "grad_norm_var": 0.01597900390625, + "learning_rate": 0.0001, + "loss": 4.3883, + "loss/crossentropy": 1.971808135509491, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2143295481801033, + "step": 8528 + }, + { + "epoch": 0.1706, + "grad_norm": 2.0625, + "grad_norm_var": 0.0114410400390625, + "learning_rate": 0.0001, + "loss": 4.3481, + "loss/crossentropy": 1.959191381931305, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24014096707105637, + "step": 8530 + }, + { + "epoch": 0.17064, + "grad_norm": 3.0, + "grad_norm_var": 0.05729878743489583, + "learning_rate": 0.0001, + "loss": 4.2633, + "loss/crossentropy": 1.836454451084137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20957449078559875, + "step": 8532 + }, + { + "epoch": 0.17068, + "grad_norm": 2.140625, + "grad_norm_var": 0.06005452473958333, + "learning_rate": 0.0001, + "loss": 4.1254, + "loss/crossentropy": 2.003947675228119, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23538918048143387, + "step": 8534 + }, + { + "epoch": 0.17072, + "grad_norm": 2.375, + "grad_norm_var": 0.05788472493489583, + "learning_rate": 0.0001, + "loss": 4.4412, + "loss/crossentropy": 2.0154194831848145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2278473973274231, + "step": 8536 + }, + { + "epoch": 0.17076, + "grad_norm": 2.015625, + "grad_norm_var": 0.05548502604166667, + "learning_rate": 0.0001, + "loss": 4.2426, + "loss/crossentropy": 2.1560275554656982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21601636707782745, + "step": 8538 + }, + { + "epoch": 0.1708, + "grad_norm": 2.078125, + "grad_norm_var": 0.05689697265625, + "learning_rate": 0.0001, + "loss": 4.2258, + "loss/crossentropy": 2.1494773626327515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22883395850658417, + "step": 8540 + }, + { + "epoch": 0.17084, + "grad_norm": 2.234375, + "grad_norm_var": 0.05607808430989583, + "learning_rate": 0.0001, + "loss": 4.5201, + "loss/crossentropy": 2.20908784866333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26363062113523483, + "step": 8542 + }, + { + "epoch": 0.17088, + "grad_norm": 2.0625, + "grad_norm_var": 0.0566802978515625, + "learning_rate": 0.0001, + "loss": 4.2477, + "loss/crossentropy": 1.7932087182998657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18538028001785278, + "step": 8544 + }, + { + "epoch": 0.17092, + "grad_norm": 2.078125, + "grad_norm_var": 0.05625, + "learning_rate": 0.0001, + "loss": 4.471, + "loss/crossentropy": 2.069899260997772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22498925775289536, + "step": 8546 + }, + { + "epoch": 0.17096, + "grad_norm": 2.46875, + "grad_norm_var": 0.019169108072916666, + "learning_rate": 0.0001, + "loss": 4.474, + "loss/crossentropy": 2.008604884147644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21895557641983032, + "step": 8548 + }, + { + "epoch": 0.171, + "grad_norm": 2.171875, + "grad_norm_var": 0.018798828125, + "learning_rate": 0.0001, + "loss": 4.1973, + "loss/crossentropy": 2.0652626156806946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23341115564107895, + "step": 8550 + }, + { + "epoch": 0.17104, + "grad_norm": 2.234375, + "grad_norm_var": 0.015607706705729167, + "learning_rate": 0.0001, + "loss": 4.383, + "loss/crossentropy": 1.9780349135398865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22891747951507568, + "step": 8552 + }, + { + "epoch": 0.17108, + "grad_norm": 2.5, + "grad_norm_var": 0.022980753580729166, + "learning_rate": 0.0001, + "loss": 4.4424, + "loss/crossentropy": 2.163089871406555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2447328343987465, + "step": 8554 + }, + { + "epoch": 0.17112, + "grad_norm": 2.21875, + "grad_norm_var": 0.022945149739583334, + "learning_rate": 0.0001, + "loss": 4.1503, + "loss/crossentropy": 2.2946064472198486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24353116750717163, + "step": 8556 + }, + { + "epoch": 0.17116, + "grad_norm": 2.078125, + "grad_norm_var": 0.022459920247395834, + "learning_rate": 0.0001, + "loss": 4.3049, + "loss/crossentropy": 2.0238161087036133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2153262495994568, + "step": 8558 + }, + { + "epoch": 0.1712, + "grad_norm": 2.09375, + "grad_norm_var": 0.02604955037434896, + "learning_rate": 0.0001, + "loss": 3.8636, + "loss/crossentropy": 1.5683120489120483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17940818518400192, + "step": 8560 + }, + { + "epoch": 0.17124, + "grad_norm": 2.296875, + "grad_norm_var": 0.027337392171223957, + "learning_rate": 0.0001, + "loss": 4.3867, + "loss/crossentropy": 1.9956589937210083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22829821705818176, + "step": 8562 + }, + { + "epoch": 0.17128, + "grad_norm": 2.640625, + "grad_norm_var": 0.036834462483723955, + "learning_rate": 0.0001, + "loss": 4.7055, + "loss/crossentropy": 2.2242285013198853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23567892611026764, + "step": 8564 + }, + { + "epoch": 0.17132, + "grad_norm": 2.046875, + "grad_norm_var": 0.033719635009765624, + "learning_rate": 0.0001, + "loss": 4.274, + "loss/crossentropy": 2.2143776416778564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2457498088479042, + "step": 8566 + }, + { + "epoch": 0.17136, + "grad_norm": 2.15625, + "grad_norm_var": 0.033782704671223955, + "learning_rate": 0.0001, + "loss": 4.354, + "loss/crossentropy": 1.852292537689209, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24004538357257843, + "step": 8568 + }, + { + "epoch": 0.1714, + "grad_norm": 2.21875, + "grad_norm_var": 0.02676976521809896, + "learning_rate": 0.0001, + "loss": 4.4429, + "loss/crossentropy": 2.311089515686035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22059186547994614, + "step": 8570 + }, + { + "epoch": 0.17144, + "grad_norm": 2.0625, + "grad_norm_var": 0.027675120035807292, + "learning_rate": 0.0001, + "loss": 4.1697, + "loss/crossentropy": 1.9324169754981995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21051711589097977, + "step": 8572 + }, + { + "epoch": 0.17148, + "grad_norm": 2.171875, + "grad_norm_var": 0.02904052734375, + "learning_rate": 0.0001, + "loss": 4.2457, + "loss/crossentropy": 1.982999861240387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2084646075963974, + "step": 8574 + }, + { + "epoch": 0.17152, + "grad_norm": 2.078125, + "grad_norm_var": 0.02505671183268229, + "learning_rate": 0.0001, + "loss": 4.2503, + "loss/crossentropy": 2.1837204694747925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22953644394874573, + "step": 8576 + }, + { + "epoch": 0.17156, + "grad_norm": 2.25, + "grad_norm_var": 0.024102528889973957, + "learning_rate": 0.0001, + "loss": 4.4448, + "loss/crossentropy": 2.2588841319084167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22851599752902985, + "step": 8578 + }, + { + "epoch": 0.1716, + "grad_norm": 2.390625, + "grad_norm_var": 0.011502838134765625, + "learning_rate": 0.0001, + "loss": 4.2033, + "loss/crossentropy": 1.982733964920044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21284686028957367, + "step": 8580 + }, + { + "epoch": 0.17164, + "grad_norm": 2.046875, + "grad_norm_var": 0.012143707275390625, + "learning_rate": 0.0001, + "loss": 4.1238, + "loss/crossentropy": 2.443873167037964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23966002464294434, + "step": 8582 + }, + { + "epoch": 0.17168, + "grad_norm": 5.09375, + "grad_norm_var": 0.5580645243326823, + "learning_rate": 0.0001, + "loss": 4.2654, + "loss/crossentropy": 2.462417483329773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2341725453734398, + "step": 8584 + }, + { + "epoch": 0.17172, + "grad_norm": 2.453125, + "grad_norm_var": 0.5521705627441407, + "learning_rate": 0.0001, + "loss": 4.183, + "loss/crossentropy": 1.8569464683532715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.224809430539608, + "step": 8586 + }, + { + "epoch": 0.17176, + "grad_norm": 2.609375, + "grad_norm_var": 0.55804443359375, + "learning_rate": 0.0001, + "loss": 4.2762, + "loss/crossentropy": 2.017254650592804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2259984165430069, + "step": 8588 + }, + { + "epoch": 0.1718, + "grad_norm": 2.1875, + "grad_norm_var": 0.550066884358724, + "learning_rate": 0.0001, + "loss": 4.463, + "loss/crossentropy": 1.9804525971412659, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20844170451164246, + "step": 8590 + }, + { + "epoch": 0.17184, + "grad_norm": 2.015625, + "grad_norm_var": 0.544781239827474, + "learning_rate": 0.0001, + "loss": 4.2676, + "loss/crossentropy": 2.2506592869758606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22404606640338898, + "step": 8592 + }, + { + "epoch": 0.17188, + "grad_norm": 2.0625, + "grad_norm_var": 0.5579335530598958, + "learning_rate": 0.0001, + "loss": 4.197, + "loss/crossentropy": 1.9515153765678406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.216490276157856, + "step": 8594 + }, + { + "epoch": 0.17192, + "grad_norm": 2.109375, + "grad_norm_var": 0.5598052978515625, + "learning_rate": 0.0001, + "loss": 4.4037, + "loss/crossentropy": 2.090883791446686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22496677190065384, + "step": 8596 + }, + { + "epoch": 0.17196, + "grad_norm": 2.828125, + "grad_norm_var": 0.5523844401041667, + "learning_rate": 0.0001, + "loss": 4.8429, + "loss/crossentropy": 2.4344359636306763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2748369127511978, + "step": 8598 + }, + { + "epoch": 0.172, + "grad_norm": 2.15625, + "grad_norm_var": 0.05840250651041667, + "learning_rate": 0.0001, + "loss": 4.4359, + "loss/crossentropy": 1.9280555844306946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20140594244003296, + "step": 8600 + }, + { + "epoch": 0.17204, + "grad_norm": 2.15625, + "grad_norm_var": 0.05537007649739583, + "learning_rate": 0.0001, + "loss": 4.2802, + "loss/crossentropy": 2.045006573200226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21741003543138504, + "step": 8602 + }, + { + "epoch": 0.17208, + "grad_norm": 2.0625, + "grad_norm_var": 0.04461034138997396, + "learning_rate": 0.0001, + "loss": 4.1622, + "loss/crossentropy": 2.092265546321869, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20971956849098206, + "step": 8604 + }, + { + "epoch": 0.17212, + "grad_norm": 1.9375, + "grad_norm_var": 0.048130035400390625, + "learning_rate": 0.0001, + "loss": 4.16, + "loss/crossentropy": 1.794768512248993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20680297911167145, + "step": 8606 + }, + { + "epoch": 0.17216, + "grad_norm": 2.109375, + "grad_norm_var": 0.05690078735351563, + "learning_rate": 0.0001, + "loss": 4.2213, + "loss/crossentropy": 1.9316805601119995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2163936346769333, + "step": 8608 + }, + { + "epoch": 0.1722, + "grad_norm": 2.359375, + "grad_norm_var": 0.05872294108072917, + "learning_rate": 0.0001, + "loss": 4.5981, + "loss/crossentropy": 2.2786675691604614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2411205694079399, + "step": 8610 + }, + { + "epoch": 0.17224, + "grad_norm": 2.234375, + "grad_norm_var": 0.05852457682291667, + "learning_rate": 0.0001, + "loss": 4.5688, + "loss/crossentropy": 1.9211469888687134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23195043951272964, + "step": 8612 + }, + { + "epoch": 0.17228, + "grad_norm": 1.9921875, + "grad_norm_var": 0.037393951416015626, + "learning_rate": 0.0001, + "loss": 4.1776, + "loss/crossentropy": 1.900360643863678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2339140772819519, + "step": 8614 + }, + { + "epoch": 0.17232, + "grad_norm": 2.125, + "grad_norm_var": 0.03743464152018229, + "learning_rate": 0.0001, + "loss": 4.4408, + "loss/crossentropy": 1.976994514465332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.218951515853405, + "step": 8616 + }, + { + "epoch": 0.17236, + "grad_norm": 2.09375, + "grad_norm_var": 0.035982004801432294, + "learning_rate": 0.0001, + "loss": 4.1988, + "loss/crossentropy": 2.045244038105011, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20886047929525375, + "step": 8618 + }, + { + "epoch": 0.1724, + "grad_norm": 2.25, + "grad_norm_var": 0.03240534464518229, + "learning_rate": 0.0001, + "loss": 4.5082, + "loss/crossentropy": 2.21256685256958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21809116005897522, + "step": 8620 + }, + { + "epoch": 0.17244, + "grad_norm": 2.171875, + "grad_norm_var": 0.02641779581705729, + "learning_rate": 0.0001, + "loss": 4.6127, + "loss/crossentropy": 2.300337314605713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22060109674930573, + "step": 8622 + }, + { + "epoch": 0.17248, + "grad_norm": 2.015625, + "grad_norm_var": 0.016932932535807292, + "learning_rate": 0.0001, + "loss": 4.2411, + "loss/crossentropy": 1.8734883666038513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20403072237968445, + "step": 8624 + }, + { + "epoch": 0.17252, + "grad_norm": 2.5625, + "grad_norm_var": 0.01962865193684896, + "learning_rate": 0.0001, + "loss": 4.6907, + "loss/crossentropy": 2.1382813453674316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27397096157073975, + "step": 8626 + }, + { + "epoch": 0.17256, + "grad_norm": 2.0625, + "grad_norm_var": 0.019760894775390624, + "learning_rate": 0.0001, + "loss": 4.0917, + "loss/crossentropy": 1.9718505144119263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20658842474222183, + "step": 8628 + }, + { + "epoch": 0.1726, + "grad_norm": 2.125, + "grad_norm_var": 0.016706339518229165, + "learning_rate": 0.0001, + "loss": 4.5233, + "loss/crossentropy": 2.0957319736480713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2187560573220253, + "step": 8630 + }, + { + "epoch": 0.17264, + "grad_norm": 1.984375, + "grad_norm_var": 0.019954427083333334, + "learning_rate": 0.0001, + "loss": 4.0986, + "loss/crossentropy": 2.0504234433174133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22376062721014023, + "step": 8632 + }, + { + "epoch": 0.17268, + "grad_norm": 2.21875, + "grad_norm_var": 0.05233968098958333, + "learning_rate": 0.0001, + "loss": 4.4513, + "loss/crossentropy": 2.057171046733856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23209689557552338, + "step": 8634 + }, + { + "epoch": 0.17272, + "grad_norm": 2.109375, + "grad_norm_var": 0.052611287434895834, + "learning_rate": 0.0001, + "loss": 4.3473, + "loss/crossentropy": 1.9635317921638489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21854296326637268, + "step": 8636 + }, + { + "epoch": 0.17276, + "grad_norm": 2.359375, + "grad_norm_var": 0.0546539306640625, + "learning_rate": 0.0001, + "loss": 4.3124, + "loss/crossentropy": 1.8973188400268555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24171262234449387, + "step": 8638 + }, + { + "epoch": 0.1728, + "grad_norm": 2.109375, + "grad_norm_var": 0.05347900390625, + "learning_rate": 0.0001, + "loss": 4.2068, + "loss/crossentropy": 1.6730469465255737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17975886166095734, + "step": 8640 + }, + { + "epoch": 0.17284, + "grad_norm": 2.265625, + "grad_norm_var": 0.04702123006184896, + "learning_rate": 0.0001, + "loss": 4.4222, + "loss/crossentropy": 2.2531689405441284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2170827016234398, + "step": 8642 + }, + { + "epoch": 0.17288, + "grad_norm": 2.171875, + "grad_norm_var": 0.045873769124348956, + "learning_rate": 0.0001, + "loss": 4.0249, + "loss/crossentropy": 2.0913639068603516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22586838155984879, + "step": 8644 + }, + { + "epoch": 0.17292, + "grad_norm": 2.140625, + "grad_norm_var": 0.04533869425455729, + "learning_rate": 0.0001, + "loss": 4.45, + "loss/crossentropy": 2.163489580154419, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2290281057357788, + "step": 8646 + }, + { + "epoch": 0.17296, + "grad_norm": 2.15625, + "grad_norm_var": 0.04267552693684896, + "learning_rate": 0.0001, + "loss": 4.3198, + "loss/crossentropy": 2.0669034719467163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22411519289016724, + "step": 8648 + }, + { + "epoch": 0.173, + "grad_norm": 2.046875, + "grad_norm_var": 0.008213043212890625, + "learning_rate": 0.0001, + "loss": 4.0474, + "loss/crossentropy": 1.9942336678504944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21262076497077942, + "step": 8650 + }, + { + "epoch": 0.17304, + "grad_norm": 2.1875, + "grad_norm_var": 0.009474436442057291, + "learning_rate": 0.0001, + "loss": 4.2701, + "loss/crossentropy": 2.046514868736267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2376151606440544, + "step": 8652 + }, + { + "epoch": 0.17308, + "grad_norm": 2.046875, + "grad_norm_var": 0.005995432535807292, + "learning_rate": 0.0001, + "loss": 4.3308, + "loss/crossentropy": 1.8385429382324219, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19642101973295212, + "step": 8654 + }, + { + "epoch": 0.17312, + "grad_norm": 2.125, + "grad_norm_var": 0.007252756754557292, + "learning_rate": 0.0001, + "loss": 4.446, + "loss/crossentropy": 2.259633481502533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24319174885749817, + "step": 8656 + }, + { + "epoch": 0.17316, + "grad_norm": 2.15625, + "grad_norm_var": 0.005301920572916666, + "learning_rate": 0.0001, + "loss": 4.2067, + "loss/crossentropy": 1.9811018109321594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21801364421844482, + "step": 8658 + }, + { + "epoch": 0.1732, + "grad_norm": 2.203125, + "grad_norm_var": 0.0059722900390625, + "learning_rate": 0.0001, + "loss": 4.2158, + "loss/crossentropy": 2.1726362705230713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23193368315696716, + "step": 8660 + }, + { + "epoch": 0.17324, + "grad_norm": 2.1875, + "grad_norm_var": 0.006086222330729167, + "learning_rate": 0.0001, + "loss": 4.2587, + "loss/crossentropy": 1.9915854930877686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2111910656094551, + "step": 8662 + }, + { + "epoch": 0.17328, + "grad_norm": 2.671875, + "grad_norm_var": 0.026851399739583334, + "learning_rate": 0.0001, + "loss": 4.5001, + "loss/crossentropy": 1.9651137590408325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.242890365421772, + "step": 8664 + }, + { + "epoch": 0.17332, + "grad_norm": 2.15625, + "grad_norm_var": 0.026008097330729167, + "learning_rate": 0.0001, + "loss": 4.3848, + "loss/crossentropy": 1.865262508392334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21065659821033478, + "step": 8666 + }, + { + "epoch": 0.17336, + "grad_norm": 2.0625, + "grad_norm_var": 0.02535400390625, + "learning_rate": 0.0001, + "loss": 4.3402, + "loss/crossentropy": 1.9073076248168945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1959603875875473, + "step": 8668 + }, + { + "epoch": 0.1734, + "grad_norm": 2.078125, + "grad_norm_var": 0.024332682291666668, + "learning_rate": 0.0001, + "loss": 4.2965, + "loss/crossentropy": 2.167983889579773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22176912426948547, + "step": 8670 + }, + { + "epoch": 0.17344, + "grad_norm": 2.34375, + "grad_norm_var": 0.5460896809895833, + "learning_rate": 0.0001, + "loss": 4.4551, + "loss/crossentropy": 1.7029761672019958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.222617506980896, + "step": 8672 + }, + { + "epoch": 0.17348, + "grad_norm": 2.046875, + "grad_norm_var": 0.5439849853515625, + "learning_rate": 0.0001, + "loss": 4.3414, + "loss/crossentropy": 2.053748309612274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23398682475090027, + "step": 8674 + }, + { + "epoch": 0.17352, + "grad_norm": 2.703125, + "grad_norm_var": 0.5408274332682291, + "learning_rate": 0.0001, + "loss": 4.7882, + "loss/crossentropy": 2.309812903404236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2961876690387726, + "step": 8676 + }, + { + "epoch": 0.17356, + "grad_norm": 1.9765625, + "grad_norm_var": 0.5421953837076823, + "learning_rate": 0.0001, + "loss": 4.4416, + "loss/crossentropy": 2.1045809984207153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22072184830904007, + "step": 8678 + }, + { + "epoch": 0.1736, + "grad_norm": 2.015625, + "grad_norm_var": 0.5490435282389323, + "learning_rate": 0.0001, + "loss": 4.4437, + "loss/crossentropy": 2.2114070653915405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2208957076072693, + "step": 8680 + }, + { + "epoch": 0.17364, + "grad_norm": 2.21875, + "grad_norm_var": 0.5490435282389323, + "learning_rate": 0.0001, + "loss": 4.5931, + "loss/crossentropy": 2.1773669719696045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22639526426792145, + "step": 8682 + }, + { + "epoch": 0.17368, + "grad_norm": 2.234375, + "grad_norm_var": 0.5398272196451823, + "learning_rate": 0.0001, + "loss": 3.9397, + "loss/crossentropy": 1.4213417768478394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17653951048851013, + "step": 8684 + }, + { + "epoch": 0.17372, + "grad_norm": 1.9609375, + "grad_norm_var": 0.5425374348958333, + "learning_rate": 0.0001, + "loss": 4.2711, + "loss/crossentropy": 1.968630075454712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2007492408156395, + "step": 8686 + }, + { + "epoch": 0.17376, + "grad_norm": 2.21875, + "grad_norm_var": 0.030304972330729166, + "learning_rate": 0.0001, + "loss": 4.2297, + "loss/crossentropy": 2.0826632976531982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21895240992307663, + "step": 8688 + }, + { + "epoch": 0.1738, + "grad_norm": 2.21875, + "grad_norm_var": 0.029002888997395834, + "learning_rate": 0.0001, + "loss": 4.4188, + "loss/crossentropy": 2.2756701707839966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22170037031173706, + "step": 8690 + }, + { + "epoch": 0.17384, + "grad_norm": 2.0625, + "grad_norm_var": 0.010773722330729167, + "learning_rate": 0.0001, + "loss": 4.51, + "loss/crossentropy": 2.329536557197571, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24717354029417038, + "step": 8692 + }, + { + "epoch": 0.17388, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010033162434895833, + "learning_rate": 0.0001, + "loss": 4.0776, + "loss/crossentropy": 2.077241063117981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.215146966278553, + "step": 8694 + }, + { + "epoch": 0.17392, + "grad_norm": 2.109375, + "grad_norm_var": 0.0091949462890625, + "learning_rate": 0.0001, + "loss": 4.3482, + "loss/crossentropy": 2.2363221645355225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23428452014923096, + "step": 8696 + }, + { + "epoch": 0.17396, + "grad_norm": 2.0, + "grad_norm_var": 0.01024169921875, + "learning_rate": 0.0001, + "loss": 4.1321, + "loss/crossentropy": 2.055815279483795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22674524784088135, + "step": 8698 + }, + { + "epoch": 0.174, + "grad_norm": 2.015625, + "grad_norm_var": 0.0098541259765625, + "learning_rate": 0.0001, + "loss": 4.2789, + "loss/crossentropy": 2.205570936203003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23787499964237213, + "step": 8700 + }, + { + "epoch": 0.17404, + "grad_norm": 2.0625, + "grad_norm_var": 0.009895579020182291, + "learning_rate": 0.0001, + "loss": 4.4636, + "loss/crossentropy": 2.262540578842163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23921719938516617, + "step": 8702 + }, + { + "epoch": 0.17408, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010114542643229167, + "learning_rate": 0.0001, + "loss": 4.3372, + "loss/crossentropy": 2.5464816093444824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23302219063043594, + "step": 8704 + }, + { + "epoch": 0.17412, + "grad_norm": 2.09375, + "grad_norm_var": 0.00897216796875, + "learning_rate": 0.0001, + "loss": 4.2355, + "loss/crossentropy": 2.050383508205414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23393510282039642, + "step": 8706 + }, + { + "epoch": 0.17416, + "grad_norm": 2.046875, + "grad_norm_var": 0.0076812744140625, + "learning_rate": 0.0001, + "loss": 3.9943, + "loss/crossentropy": 1.9034642577171326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20801686495542526, + "step": 8708 + }, + { + "epoch": 0.1742, + "grad_norm": 2.09375, + "grad_norm_var": 0.007045237223307291, + "learning_rate": 0.0001, + "loss": 4.2801, + "loss/crossentropy": 2.313044309616089, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24881915748119354, + "step": 8710 + }, + { + "epoch": 0.17424, + "grad_norm": 2.015625, + "grad_norm_var": 0.009388987223307292, + "learning_rate": 0.0001, + "loss": 4.3754, + "loss/crossentropy": 1.973829746246338, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21400053054094315, + "step": 8712 + }, + { + "epoch": 0.17428, + "grad_norm": 2.25, + "grad_norm_var": 0.010109202067057291, + "learning_rate": 0.0001, + "loss": 4.2936, + "loss/crossentropy": 1.831783950328827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20855721831321716, + "step": 8714 + }, + { + "epoch": 0.17432, + "grad_norm": 2.15625, + "grad_norm_var": 0.009683990478515625, + "learning_rate": 0.0001, + "loss": 4.2681, + "loss/crossentropy": 2.0173734426498413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22365443408489227, + "step": 8716 + }, + { + "epoch": 0.17436, + "grad_norm": 2.125, + "grad_norm_var": 0.007155100504557292, + "learning_rate": 0.0001, + "loss": 4.4177, + "loss/crossentropy": 1.6534234285354614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20528900623321533, + "step": 8718 + }, + { + "epoch": 0.1744, + "grad_norm": 2.125, + "grad_norm_var": 0.006396484375, + "learning_rate": 0.0001, + "loss": 4.3658, + "loss/crossentropy": 1.8113531470298767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21433213353157043, + "step": 8720 + }, + { + "epoch": 0.17444, + "grad_norm": 2.125, + "grad_norm_var": 0.0064280192057291664, + "learning_rate": 0.0001, + "loss": 4.5135, + "loss/crossentropy": 2.0750836730003357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22202756255865097, + "step": 8722 + }, + { + "epoch": 0.17448, + "grad_norm": 2.046875, + "grad_norm_var": 0.0056955973307291664, + "learning_rate": 0.0001, + "loss": 4.2512, + "loss/crossentropy": 2.1388206481933594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24137140065431595, + "step": 8724 + }, + { + "epoch": 0.17452, + "grad_norm": 2.0625, + "grad_norm_var": 0.006745402018229167, + "learning_rate": 0.0001, + "loss": 4.0401, + "loss/crossentropy": 2.068696141242981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2215234711766243, + "step": 8726 + }, + { + "epoch": 0.17456, + "grad_norm": 1.984375, + "grad_norm_var": 0.005280558268229167, + "learning_rate": 0.0001, + "loss": 4.0277, + "loss/crossentropy": 1.6970900893211365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21108710020780563, + "step": 8728 + }, + { + "epoch": 0.1746, + "grad_norm": 2.15625, + "grad_norm_var": 0.005582682291666667, + "learning_rate": 0.0001, + "loss": 3.8555, + "loss/crossentropy": 1.8847576975822449, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2151619866490364, + "step": 8730 + }, + { + "epoch": 0.17464, + "grad_norm": 2.328125, + "grad_norm_var": 0.009403483072916666, + "learning_rate": 0.0001, + "loss": 4.4088, + "loss/crossentropy": 2.4103721380233765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2589537426829338, + "step": 8732 + }, + { + "epoch": 0.17468, + "grad_norm": 2.203125, + "grad_norm_var": 0.01011962890625, + "learning_rate": 0.0001, + "loss": 4.1415, + "loss/crossentropy": 1.8340824842453003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22282177209854126, + "step": 8734 + }, + { + "epoch": 0.17472, + "grad_norm": 2.078125, + "grad_norm_var": 0.010302734375, + "learning_rate": 0.0001, + "loss": 4.2472, + "loss/crossentropy": 1.88236665725708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2193661779165268, + "step": 8736 + }, + { + "epoch": 0.17476, + "grad_norm": 2.28125, + "grad_norm_var": 0.05056050618489583, + "learning_rate": 0.0001, + "loss": 4.441, + "loss/crossentropy": 1.8121293783187866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21855003386735916, + "step": 8738 + }, + { + "epoch": 0.1748, + "grad_norm": 2.203125, + "grad_norm_var": 0.05090738932291667, + "learning_rate": 0.0001, + "loss": 4.1422, + "loss/crossentropy": 2.215694308280945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2405528798699379, + "step": 8740 + }, + { + "epoch": 0.17484, + "grad_norm": 2.0625, + "grad_norm_var": 0.05090738932291667, + "learning_rate": 0.0001, + "loss": 4.2431, + "loss/crossentropy": 2.124837279319763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22565175592899323, + "step": 8742 + }, + { + "epoch": 0.17488, + "grad_norm": 2.203125, + "grad_norm_var": 0.0476959228515625, + "learning_rate": 0.0001, + "loss": 4.6783, + "loss/crossentropy": 2.2531429529190063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.259593665599823, + "step": 8744 + }, + { + "epoch": 0.17492, + "grad_norm": 2.046875, + "grad_norm_var": 0.04537760416666667, + "learning_rate": 0.0001, + "loss": 4.3546, + "loss/crossentropy": 2.403178572654724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2475409209728241, + "step": 8746 + }, + { + "epoch": 0.17496, + "grad_norm": 2.328125, + "grad_norm_var": 0.04527587890625, + "learning_rate": 0.0001, + "loss": 4.5698, + "loss/crossentropy": 1.7886858582496643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21677076816558838, + "step": 8748 + }, + { + "epoch": 0.175, + "grad_norm": 2.109375, + "grad_norm_var": 0.04397379557291667, + "learning_rate": 0.0001, + "loss": 4.1374, + "loss/crossentropy": 1.9257569313049316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2098483294248581, + "step": 8750 + }, + { + "epoch": 0.17504, + "grad_norm": 2.078125, + "grad_norm_var": 0.04396870930989583, + "learning_rate": 0.0001, + "loss": 4.407, + "loss/crossentropy": 2.1609140634536743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22846446931362152, + "step": 8752 + }, + { + "epoch": 0.17508, + "grad_norm": 2.0625, + "grad_norm_var": 0.0081939697265625, + "learning_rate": 0.0001, + "loss": 4.2683, + "loss/crossentropy": 2.529700756072998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2515157088637352, + "step": 8754 + }, + { + "epoch": 0.17512, + "grad_norm": 1.9921875, + "grad_norm_var": 0.008548736572265625, + "learning_rate": 0.0001, + "loss": 4.4965, + "loss/crossentropy": 2.1920565366744995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24070476740598679, + "step": 8756 + }, + { + "epoch": 0.17516, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0097900390625, + "learning_rate": 0.0001, + "loss": 3.8416, + "loss/crossentropy": 1.7714558839797974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1787928193807602, + "step": 8758 + }, + { + "epoch": 0.1752, + "grad_norm": 2.171875, + "grad_norm_var": 0.009666951497395833, + "learning_rate": 0.0001, + "loss": 4.4457, + "loss/crossentropy": 1.986818790435791, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20736730098724365, + "step": 8760 + }, + { + "epoch": 0.17524, + "grad_norm": 2.015625, + "grad_norm_var": 0.010453287760416667, + "learning_rate": 0.0001, + "loss": 4.2731, + "loss/crossentropy": 1.8152282238006592, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22459527850151062, + "step": 8762 + }, + { + "epoch": 0.17528, + "grad_norm": 2.109375, + "grad_norm_var": 0.0069163004557291664, + "learning_rate": 0.0001, + "loss": 4.4058, + "loss/crossentropy": 2.2312777042388916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23017627000808716, + "step": 8764 + }, + { + "epoch": 0.17532, + "grad_norm": 2.375, + "grad_norm_var": 0.013252766927083333, + "learning_rate": 0.0001, + "loss": 4.4714, + "loss/crossentropy": 2.107849955558777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21224269270896912, + "step": 8766 + }, + { + "epoch": 0.17536, + "grad_norm": 2.15625, + "grad_norm_var": 0.013206990559895833, + "learning_rate": 0.0001, + "loss": 4.1641, + "loss/crossentropy": 2.1588711738586426, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2200954109430313, + "step": 8768 + }, + { + "epoch": 0.1754, + "grad_norm": 2.1875, + "grad_norm_var": 0.0130126953125, + "learning_rate": 0.0001, + "loss": 4.4056, + "loss/crossentropy": 2.1355313062667847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24048195779323578, + "step": 8770 + }, + { + "epoch": 0.17544, + "grad_norm": 2.125, + "grad_norm_var": 0.011557769775390626, + "learning_rate": 0.0001, + "loss": 4.3505, + "loss/crossentropy": 2.477591037750244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24459562450647354, + "step": 8772 + }, + { + "epoch": 0.17548, + "grad_norm": 2.046875, + "grad_norm_var": 0.00845947265625, + "learning_rate": 0.0001, + "loss": 4.4184, + "loss/crossentropy": 2.2577285766601562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23038798570632935, + "step": 8774 + }, + { + "epoch": 0.17552, + "grad_norm": 2.15625, + "grad_norm_var": 0.009813435872395833, + "learning_rate": 0.0001, + "loss": 4.2681, + "loss/crossentropy": 2.239536762237549, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22290733456611633, + "step": 8776 + }, + { + "epoch": 0.17556, + "grad_norm": 2.125, + "grad_norm_var": 0.007373046875, + "learning_rate": 0.0001, + "loss": 4.4351, + "loss/crossentropy": 1.9139958024024963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2175864800810814, + "step": 8778 + }, + { + "epoch": 0.1756, + "grad_norm": 2.0625, + "grad_norm_var": 0.00888671875, + "learning_rate": 0.0001, + "loss": 4.0756, + "loss/crossentropy": 2.0622661113739014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2223067432641983, + "step": 8780 + }, + { + "epoch": 0.17564, + "grad_norm": 2.15625, + "grad_norm_var": 0.004524739583333334, + "learning_rate": 0.0001, + "loss": 4.4552, + "loss/crossentropy": 2.222475051879883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21181221306324005, + "step": 8782 + }, + { + "epoch": 0.17568, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007002512613932292, + "learning_rate": 0.0001, + "loss": 4.1696, + "loss/crossentropy": 2.2612074613571167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24079158157110214, + "step": 8784 + }, + { + "epoch": 0.17572, + "grad_norm": 2.078125, + "grad_norm_var": 0.005863189697265625, + "learning_rate": 0.0001, + "loss": 4.0889, + "loss/crossentropy": 2.1629387736320496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21437199413776398, + "step": 8786 + }, + { + "epoch": 0.17576, + "grad_norm": 2.328125, + "grad_norm_var": 0.009492746988932292, + "learning_rate": 0.0001, + "loss": 4.6583, + "loss/crossentropy": 2.145151972770691, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23797442018985748, + "step": 8788 + }, + { + "epoch": 0.1758, + "grad_norm": 2.453125, + "grad_norm_var": 0.015457916259765624, + "learning_rate": 0.0001, + "loss": 4.5188, + "loss/crossentropy": 2.0366984605789185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2191104218363762, + "step": 8790 + }, + { + "epoch": 0.17584, + "grad_norm": 2.203125, + "grad_norm_var": 0.014422353108723958, + "learning_rate": 0.0001, + "loss": 4.1604, + "loss/crossentropy": 2.1049715280532837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.230033777654171, + "step": 8792 + }, + { + "epoch": 0.17588, + "grad_norm": 2.15625, + "grad_norm_var": 0.014338938395182292, + "learning_rate": 0.0001, + "loss": 4.4327, + "loss/crossentropy": 2.2549991607666016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23084092140197754, + "step": 8794 + }, + { + "epoch": 0.17592, + "grad_norm": 2.125, + "grad_norm_var": 0.015547688802083333, + "learning_rate": 0.0001, + "loss": 4.4653, + "loss/crossentropy": 1.9873813390731812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19879335910081863, + "step": 8796 + }, + { + "epoch": 0.17596, + "grad_norm": 2.140625, + "grad_norm_var": 0.015677897135416667, + "learning_rate": 0.0001, + "loss": 4.6371, + "loss/crossentropy": 2.0723283886909485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20538055896759033, + "step": 8798 + }, + { + "epoch": 0.176, + "grad_norm": 2.125, + "grad_norm_var": 0.013099924723307291, + "learning_rate": 0.0001, + "loss": 4.0313, + "loss/crossentropy": 2.090642750263214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2329270914196968, + "step": 8800 + }, + { + "epoch": 0.17604, + "grad_norm": 2.1875, + "grad_norm_var": 0.012672678629557291, + "learning_rate": 0.0001, + "loss": 4.4626, + "loss/crossentropy": 2.3432271480560303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2456662431359291, + "step": 8802 + }, + { + "epoch": 0.17608, + "grad_norm": 2.21875, + "grad_norm_var": 0.010465240478515625, + "learning_rate": 0.0001, + "loss": 4.5133, + "loss/crossentropy": 2.1210837364196777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22580894827842712, + "step": 8804 + }, + { + "epoch": 0.17612, + "grad_norm": 2.0, + "grad_norm_var": 0.005147043863932292, + "learning_rate": 0.0001, + "loss": 4.024, + "loss/crossentropy": 2.142494797706604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22426588833332062, + "step": 8806 + }, + { + "epoch": 0.17616, + "grad_norm": 2.0625, + "grad_norm_var": 0.0049435933430989586, + "learning_rate": 0.0001, + "loss": 4.2877, + "loss/crossentropy": 1.9163227677345276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21529845893383026, + "step": 8808 + }, + { + "epoch": 0.1762, + "grad_norm": 2.140625, + "grad_norm_var": 0.004937489827473958, + "learning_rate": 0.0001, + "loss": 4.4776, + "loss/crossentropy": 2.1478612422943115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2354428842663765, + "step": 8810 + }, + { + "epoch": 0.17624, + "grad_norm": 2.15625, + "grad_norm_var": 0.0033854166666666668, + "learning_rate": 0.0001, + "loss": 4.435, + "loss/crossentropy": 2.1546601057052612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22553270310163498, + "step": 8812 + }, + { + "epoch": 0.17628, + "grad_norm": 2.140625, + "grad_norm_var": 0.004233551025390625, + "learning_rate": 0.0001, + "loss": 4.1661, + "loss/crossentropy": 2.0559862852096558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19714127480983734, + "step": 8814 + }, + { + "epoch": 0.17632, + "grad_norm": 2.140625, + "grad_norm_var": 0.004078928629557292, + "learning_rate": 0.0001, + "loss": 4.3543, + "loss/crossentropy": 2.1340363025665283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22422882914543152, + "step": 8816 + }, + { + "epoch": 0.17636, + "grad_norm": 2.078125, + "grad_norm_var": 0.0038937886555989584, + "learning_rate": 0.0001, + "loss": 4.4464, + "loss/crossentropy": 2.265942335128784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23249086737632751, + "step": 8818 + }, + { + "epoch": 0.1764, + "grad_norm": 2.0625, + "grad_norm_var": 0.0031939188639322916, + "learning_rate": 0.0001, + "loss": 4.3187, + "loss/crossentropy": 2.245513081550598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22869569063186646, + "step": 8820 + }, + { + "epoch": 0.17644, + "grad_norm": 2.0, + "grad_norm_var": 0.006461334228515625, + "learning_rate": 0.0001, + "loss": 4.179, + "loss/crossentropy": 1.851025104522705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21989689767360687, + "step": 8822 + }, + { + "epoch": 0.17648, + "grad_norm": 2.09375, + "grad_norm_var": 0.006266021728515625, + "learning_rate": 0.0001, + "loss": 4.276, + "loss/crossentropy": 2.2972241640090942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22793132066726685, + "step": 8824 + }, + { + "epoch": 0.17652, + "grad_norm": 2.078125, + "grad_norm_var": 0.006276194254557292, + "learning_rate": 0.0001, + "loss": 4.3955, + "loss/crossentropy": 2.248735189437866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2133597657084465, + "step": 8826 + }, + { + "epoch": 0.17656, + "grad_norm": 2.15625, + "grad_norm_var": 0.008314768473307291, + "learning_rate": 0.0001, + "loss": 4.4423, + "loss/crossentropy": 2.4173099994659424, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23985996842384338, + "step": 8828 + }, + { + "epoch": 0.1766, + "grad_norm": 2.28125, + "grad_norm_var": 0.008698527018229167, + "learning_rate": 0.0001, + "loss": 4.5425, + "loss/crossentropy": 2.5017653703689575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26119648665189743, + "step": 8830 + }, + { + "epoch": 0.17664, + "grad_norm": 2.09375, + "grad_norm_var": 0.0090484619140625, + "learning_rate": 0.0001, + "loss": 4.2474, + "loss/crossentropy": 1.9006813764572144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19938994944095612, + "step": 8832 + }, + { + "epoch": 0.17668, + "grad_norm": 2.046875, + "grad_norm_var": 0.009496053059895834, + "learning_rate": 0.0001, + "loss": 4.1643, + "loss/crossentropy": 2.101746916770935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20544035732746124, + "step": 8834 + }, + { + "epoch": 0.17672, + "grad_norm": 2.15625, + "grad_norm_var": 0.010542805989583333, + "learning_rate": 0.0001, + "loss": 4.3211, + "loss/crossentropy": 2.1605160236358643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23061934113502502, + "step": 8836 + }, + { + "epoch": 0.17676, + "grad_norm": 2.296875, + "grad_norm_var": 0.008967081705729166, + "learning_rate": 0.0001, + "loss": 4.4357, + "loss/crossentropy": 1.963772177696228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20736530423164368, + "step": 8838 + }, + { + "epoch": 0.1768, + "grad_norm": 2.1875, + "grad_norm_var": 0.009733072916666667, + "learning_rate": 0.0001, + "loss": 4.3572, + "loss/crossentropy": 2.154300093650818, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22221273183822632, + "step": 8840 + }, + { + "epoch": 0.17684, + "grad_norm": 2.109375, + "grad_norm_var": 0.009501139322916666, + "learning_rate": 0.0001, + "loss": 4.4876, + "loss/crossentropy": 2.1576497554779053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21834726631641388, + "step": 8842 + }, + { + "epoch": 0.17688, + "grad_norm": 1.9375, + "grad_norm_var": 0.0115875244140625, + "learning_rate": 0.0001, + "loss": 4.1192, + "loss/crossentropy": 2.1316112279891968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2166184037923813, + "step": 8844 + }, + { + "epoch": 0.17692, + "grad_norm": 2.15625, + "grad_norm_var": 0.0098052978515625, + "learning_rate": 0.0001, + "loss": 4.2421, + "loss/crossentropy": 2.068525493144989, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23183748871088028, + "step": 8846 + }, + { + "epoch": 0.17696, + "grad_norm": 2.203125, + "grad_norm_var": 0.01051025390625, + "learning_rate": 0.0001, + "loss": 4.5128, + "loss/crossentropy": 2.185767650604248, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25881427526474, + "step": 8848 + }, + { + "epoch": 0.177, + "grad_norm": 2.125, + "grad_norm_var": 0.010619099934895833, + "learning_rate": 0.0001, + "loss": 4.3073, + "loss/crossentropy": 1.979454517364502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20669714361429214, + "step": 8850 + }, + { + "epoch": 0.17704, + "grad_norm": 2.109375, + "grad_norm_var": 0.009373982747395834, + "learning_rate": 0.0001, + "loss": 4.3356, + "loss/crossentropy": 2.3473092317581177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23232270777225494, + "step": 8852 + }, + { + "epoch": 0.17708, + "grad_norm": 2.046875, + "grad_norm_var": 0.007616170247395833, + "learning_rate": 0.0001, + "loss": 4.3732, + "loss/crossentropy": 2.461324691772461, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.242417573928833, + "step": 8854 + }, + { + "epoch": 0.17712, + "grad_norm": 2.125, + "grad_norm_var": 0.008385976155598959, + "learning_rate": 0.0001, + "loss": 4.1107, + "loss/crossentropy": 1.5953214168548584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17768454551696777, + "step": 8856 + }, + { + "epoch": 0.17716, + "grad_norm": 2.15625, + "grad_norm_var": 0.008377838134765624, + "learning_rate": 0.0001, + "loss": 4.4289, + "loss/crossentropy": 2.1969146728515625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21466109156608582, + "step": 8858 + }, + { + "epoch": 0.1772, + "grad_norm": 1.921875, + "grad_norm_var": 0.008459218343098958, + "learning_rate": 0.0001, + "loss": 4.2154, + "loss/crossentropy": 2.243234634399414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22729554027318954, + "step": 8860 + }, + { + "epoch": 0.17724, + "grad_norm": 2.296875, + "grad_norm_var": 0.010628000895182291, + "learning_rate": 0.0001, + "loss": 4.2062, + "loss/crossentropy": 2.1855397820472717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24905066192150116, + "step": 8862 + }, + { + "epoch": 0.17728, + "grad_norm": 2.203125, + "grad_norm_var": 0.010628000895182291, + "learning_rate": 0.0001, + "loss": 4.4075, + "loss/crossentropy": 2.320886254310608, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22135943174362183, + "step": 8864 + }, + { + "epoch": 0.17732, + "grad_norm": 1.96875, + "grad_norm_var": 0.011736806233723958, + "learning_rate": 0.0001, + "loss": 4.1444, + "loss/crossentropy": 2.140891909599304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2109990492463112, + "step": 8866 + }, + { + "epoch": 0.17736, + "grad_norm": 1.921875, + "grad_norm_var": 0.013038889567057291, + "learning_rate": 0.0001, + "loss": 4.1009, + "loss/crossentropy": 2.147824764251709, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2131204530596733, + "step": 8868 + }, + { + "epoch": 0.1774, + "grad_norm": 2.71875, + "grad_norm_var": 0.039249420166015625, + "learning_rate": 0.0001, + "loss": 4.7136, + "loss/crossentropy": 2.187807321548462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2335718423128128, + "step": 8870 + }, + { + "epoch": 0.17744, + "grad_norm": 2.1875, + "grad_norm_var": 0.037287394205729164, + "learning_rate": 0.0001, + "loss": 4.5265, + "loss/crossentropy": 2.3127458095550537, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2385788857936859, + "step": 8872 + }, + { + "epoch": 0.17748, + "grad_norm": 2.046875, + "grad_norm_var": 0.0369537353515625, + "learning_rate": 0.0001, + "loss": 4.1364, + "loss/crossentropy": 1.859586775302887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.204575777053833, + "step": 8874 + }, + { + "epoch": 0.17752, + "grad_norm": 2.0, + "grad_norm_var": 0.03585611979166667, + "learning_rate": 0.0001, + "loss": 4.1993, + "loss/crossentropy": 1.9626107215881348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21016598492860794, + "step": 8876 + }, + { + "epoch": 0.17756, + "grad_norm": 2.046875, + "grad_norm_var": 0.034077962239583336, + "learning_rate": 0.0001, + "loss": 4.462, + "loss/crossentropy": 2.1130539774894714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21995095163583755, + "step": 8878 + }, + { + "epoch": 0.1776, + "grad_norm": 2.203125, + "grad_norm_var": 0.035374959309895836, + "learning_rate": 0.0001, + "loss": 4.4677, + "loss/crossentropy": 1.8914743065834045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20437531173229218, + "step": 8880 + }, + { + "epoch": 0.17764, + "grad_norm": 2.125, + "grad_norm_var": 0.032763671875, + "learning_rate": 0.0001, + "loss": 4.4975, + "loss/crossentropy": 2.2135708332061768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2478664517402649, + "step": 8882 + }, + { + "epoch": 0.17768, + "grad_norm": 2.203125, + "grad_norm_var": 0.028922526041666667, + "learning_rate": 0.0001, + "loss": 4.3763, + "loss/crossentropy": 2.194110333919525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2366471290588379, + "step": 8884 + }, + { + "epoch": 0.17772, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009822336832682292, + "learning_rate": 0.0001, + "loss": 4.2844, + "loss/crossentropy": 2.4365748167037964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25842973589897156, + "step": 8886 + }, + { + "epoch": 0.17776, + "grad_norm": 2.078125, + "grad_norm_var": 0.010001373291015626, + "learning_rate": 0.0001, + "loss": 4.0574, + "loss/crossentropy": 1.9800177216529846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21982619166374207, + "step": 8888 + }, + { + "epoch": 0.1778, + "grad_norm": 2.15625, + "grad_norm_var": 0.009956614176432291, + "learning_rate": 0.0001, + "loss": 4.3465, + "loss/crossentropy": 2.1437748670578003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21494000405073166, + "step": 8890 + }, + { + "epoch": 0.17784, + "grad_norm": 2.1875, + "grad_norm_var": 0.008957672119140624, + "learning_rate": 0.0001, + "loss": 4.4051, + "loss/crossentropy": 2.0610267519950867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22473593801259995, + "step": 8892 + }, + { + "epoch": 0.17788, + "grad_norm": 1.953125, + "grad_norm_var": 0.013952382405598958, + "learning_rate": 0.0001, + "loss": 4.4637, + "loss/crossentropy": 2.261958599090576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2250354364514351, + "step": 8894 + }, + { + "epoch": 0.17792, + "grad_norm": 2.109375, + "grad_norm_var": 0.012237294514973959, + "learning_rate": 0.0001, + "loss": 4.3508, + "loss/crossentropy": 2.3689773082733154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22185539454221725, + "step": 8896 + }, + { + "epoch": 0.17796, + "grad_norm": 2.15625, + "grad_norm_var": 0.013034820556640625, + "learning_rate": 0.0001, + "loss": 4.7432, + "loss/crossentropy": 2.612341523170471, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2561237961053848, + "step": 8898 + }, + { + "epoch": 0.178, + "grad_norm": 2.0625, + "grad_norm_var": 0.010990142822265625, + "learning_rate": 0.0001, + "loss": 4.5876, + "loss/crossentropy": 2.1230576038360596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23115848749876022, + "step": 8900 + }, + { + "epoch": 0.17804, + "grad_norm": 2.109375, + "grad_norm_var": 0.009626261393229167, + "learning_rate": 0.0001, + "loss": 4.2154, + "loss/crossentropy": 2.200004458427429, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22002413868904114, + "step": 8902 + }, + { + "epoch": 0.17808, + "grad_norm": 2.03125, + "grad_norm_var": 0.009989420572916666, + "learning_rate": 0.0001, + "loss": 4.3438, + "loss/crossentropy": 2.323713779449463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22923698276281357, + "step": 8904 + }, + { + "epoch": 0.17812, + "grad_norm": 2.140625, + "grad_norm_var": 0.00982666015625, + "learning_rate": 0.0001, + "loss": 4.3079, + "loss/crossentropy": 2.038426458835602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21465667337179184, + "step": 8906 + }, + { + "epoch": 0.17816, + "grad_norm": 2.109375, + "grad_norm_var": 0.01217041015625, + "learning_rate": 0.0001, + "loss": 4.4498, + "loss/crossentropy": 2.3639097213745117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2378860041499138, + "step": 8908 + }, + { + "epoch": 0.1782, + "grad_norm": 2.046875, + "grad_norm_var": 0.00699462890625, + "learning_rate": 0.0001, + "loss": 4.261, + "loss/crossentropy": 1.8291080594062805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19894887506961823, + "step": 8910 + }, + { + "epoch": 0.17824, + "grad_norm": 2.265625, + "grad_norm_var": 0.011263020833333333, + "learning_rate": 0.0001, + "loss": 4.5606, + "loss/crossentropy": 2.3113714456558228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24526391178369522, + "step": 8912 + }, + { + "epoch": 0.17828, + "grad_norm": 2.109375, + "grad_norm_var": 0.011449178059895834, + "learning_rate": 0.0001, + "loss": 4.3016, + "loss/crossentropy": 2.114617943763733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21305133402347565, + "step": 8914 + }, + { + "epoch": 0.17832, + "grad_norm": 2.15625, + "grad_norm_var": 0.014975738525390626, + "learning_rate": 0.0001, + "loss": 3.9333, + "loss/crossentropy": 1.6893808841705322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19195494800806046, + "step": 8916 + }, + { + "epoch": 0.17836, + "grad_norm": 2.234375, + "grad_norm_var": 0.016721343994140624, + "learning_rate": 0.0001, + "loss": 4.2501, + "loss/crossentropy": 1.829396367073059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21181780844926834, + "step": 8918 + }, + { + "epoch": 0.1784, + "grad_norm": 2.03125, + "grad_norm_var": 0.01869481404622396, + "learning_rate": 0.0001, + "loss": 4.1542, + "loss/crossentropy": 1.8910154104232788, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20975399017333984, + "step": 8920 + }, + { + "epoch": 0.17844, + "grad_norm": 2.140625, + "grad_norm_var": 0.01953709920247396, + "learning_rate": 0.0001, + "loss": 4.3327, + "loss/crossentropy": 2.0501255989074707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22336408495903015, + "step": 8922 + }, + { + "epoch": 0.17848, + "grad_norm": 2.109375, + "grad_norm_var": 0.01740086873372396, + "learning_rate": 0.0001, + "loss": 4.3035, + "loss/crossentropy": 2.3023892641067505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23251917958259583, + "step": 8924 + }, + { + "epoch": 0.17852, + "grad_norm": 2.015625, + "grad_norm_var": 0.01822077433268229, + "learning_rate": 0.0001, + "loss": 4.0592, + "loss/crossentropy": 2.0030421018600464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19997069239616394, + "step": 8926 + }, + { + "epoch": 0.17856, + "grad_norm": 1.9921875, + "grad_norm_var": 0.013741048177083333, + "learning_rate": 0.0001, + "loss": 4.2568, + "loss/crossentropy": 2.2309017181396484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22145257890224457, + "step": 8928 + }, + { + "epoch": 0.1786, + "grad_norm": 2.171875, + "grad_norm_var": 0.013728841145833334, + "learning_rate": 0.0001, + "loss": 4.4366, + "loss/crossentropy": 2.1135157346725464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21580957621335983, + "step": 8930 + }, + { + "epoch": 0.17864, + "grad_norm": 2.015625, + "grad_norm_var": 0.011146799723307291, + "learning_rate": 0.0001, + "loss": 4.3473, + "loss/crossentropy": 2.098900556564331, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22469021379947662, + "step": 8932 + }, + { + "epoch": 0.17868, + "grad_norm": 2.21875, + "grad_norm_var": 0.0103179931640625, + "learning_rate": 0.0001, + "loss": 4.2702, + "loss/crossentropy": 2.1558337211608887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24765773862600327, + "step": 8934 + }, + { + "epoch": 0.17872, + "grad_norm": 2.09375, + "grad_norm_var": 0.006004842122395834, + "learning_rate": 0.0001, + "loss": 4.3318, + "loss/crossentropy": 2.141040623188019, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22490675747394562, + "step": 8936 + }, + { + "epoch": 0.17876, + "grad_norm": 2.171875, + "grad_norm_var": 0.0096099853515625, + "learning_rate": 0.0001, + "loss": 4.4126, + "loss/crossentropy": 2.1036806106567383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23260055482387543, + "step": 8938 + }, + { + "epoch": 0.1788, + "grad_norm": 1.984375, + "grad_norm_var": 0.013036092122395834, + "learning_rate": 0.0001, + "loss": 4.4151, + "loss/crossentropy": 1.9403663277626038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21941428631544113, + "step": 8940 + }, + { + "epoch": 0.17884, + "grad_norm": 2.03125, + "grad_norm_var": 0.014717610677083333, + "learning_rate": 0.0001, + "loss": 4.2, + "loss/crossentropy": 1.8589079976081848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21533852070569992, + "step": 8942 + }, + { + "epoch": 0.17888, + "grad_norm": 2.21875, + "grad_norm_var": 0.013816070556640626, + "learning_rate": 0.0001, + "loss": 4.2476, + "loss/crossentropy": 2.3136903643608093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23139026761054993, + "step": 8944 + }, + { + "epoch": 0.17892, + "grad_norm": 2.109375, + "grad_norm_var": 0.012835439046223958, + "learning_rate": 0.0001, + "loss": 4.2669, + "loss/crossentropy": 2.305663585662842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24263737350702286, + "step": 8946 + }, + { + "epoch": 0.17896, + "grad_norm": 2.5625, + "grad_norm_var": 0.02195002237955729, + "learning_rate": 0.0001, + "loss": 4.3059, + "loss/crossentropy": 2.0450612902641296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2124529778957367, + "step": 8948 + }, + { + "epoch": 0.179, + "grad_norm": 2.3125, + "grad_norm_var": 0.020393880208333333, + "learning_rate": 0.0001, + "loss": 4.4229, + "loss/crossentropy": 2.435065984725952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24603386223316193, + "step": 8950 + }, + { + "epoch": 0.17904, + "grad_norm": 2.234375, + "grad_norm_var": 0.02295099894205729, + "learning_rate": 0.0001, + "loss": 4.2604, + "loss/crossentropy": 2.092818021774292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20905248820781708, + "step": 8952 + }, + { + "epoch": 0.17908, + "grad_norm": 2.109375, + "grad_norm_var": 0.024621327718098957, + "learning_rate": 0.0001, + "loss": 4.2539, + "loss/crossentropy": 2.08588969707489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2142588049173355, + "step": 8954 + }, + { + "epoch": 0.17912, + "grad_norm": 2.25, + "grad_norm_var": 0.021144358317057292, + "learning_rate": 0.0001, + "loss": 4.3958, + "loss/crossentropy": 1.9161878824234009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20626582205295563, + "step": 8956 + }, + { + "epoch": 0.17916, + "grad_norm": 2.078125, + "grad_norm_var": 0.020182037353515626, + "learning_rate": 0.0001, + "loss": 4.3726, + "loss/crossentropy": 2.3072937726974487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22028075903654099, + "step": 8958 + }, + { + "epoch": 0.1792, + "grad_norm": 2.15625, + "grad_norm_var": 0.021109771728515626, + "learning_rate": 0.0001, + "loss": 4.4876, + "loss/crossentropy": 2.3053938150405884, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22212930023670197, + "step": 8960 + }, + { + "epoch": 0.17924, + "grad_norm": 2.0625, + "grad_norm_var": 0.02269261678059896, + "learning_rate": 0.0001, + "loss": 4.5137, + "loss/crossentropy": 1.9130414128303528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20036083459854126, + "step": 8962 + }, + { + "epoch": 0.17928, + "grad_norm": 2.0, + "grad_norm_var": 0.013242340087890625, + "learning_rate": 0.0001, + "loss": 4.1299, + "loss/crossentropy": 2.3808066844940186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2116570845246315, + "step": 8964 + }, + { + "epoch": 0.17932, + "grad_norm": 2.125, + "grad_norm_var": 0.009549713134765625, + "learning_rate": 0.0001, + "loss": 4.296, + "loss/crossentropy": 2.180716395378113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2275484874844551, + "step": 8966 + }, + { + "epoch": 0.17936, + "grad_norm": 1.984375, + "grad_norm_var": 0.008837890625, + "learning_rate": 0.0001, + "loss": 4.2597, + "loss/crossentropy": 2.027850866317749, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2011229619383812, + "step": 8968 + }, + { + "epoch": 0.1794, + "grad_norm": 2.171875, + "grad_norm_var": 0.017447916666666667, + "learning_rate": 0.0001, + "loss": 4.3167, + "loss/crossentropy": 2.077622890472412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21361806988716125, + "step": 8970 + }, + { + "epoch": 0.17944, + "grad_norm": 2.03125, + "grad_norm_var": 0.017513020833333334, + "learning_rate": 0.0001, + "loss": 4.1972, + "loss/crossentropy": 2.004905104637146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21110886335372925, + "step": 8972 + }, + { + "epoch": 0.17948, + "grad_norm": 2.21875, + "grad_norm_var": 0.019466145833333334, + "learning_rate": 0.0001, + "loss": 4.5026, + "loss/crossentropy": 2.1859925389289856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23011507838964462, + "step": 8974 + }, + { + "epoch": 0.17952, + "grad_norm": 2.21875, + "grad_norm_var": 0.019840494791666666, + "learning_rate": 0.0001, + "loss": 4.2908, + "loss/crossentropy": 1.8372295498847961, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2171105071902275, + "step": 8976 + }, + { + "epoch": 0.17956, + "grad_norm": 2.328125, + "grad_norm_var": 0.020197550455729168, + "learning_rate": 0.0001, + "loss": 4.3887, + "loss/crossentropy": 2.127313494682312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22017831355333328, + "step": 8978 + }, + { + "epoch": 0.1796, + "grad_norm": 2.421875, + "grad_norm_var": 0.022098795572916666, + "learning_rate": 0.0001, + "loss": 4.3389, + "loss/crossentropy": 2.121580421924591, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24898843467235565, + "step": 8980 + }, + { + "epoch": 0.17964, + "grad_norm": 2.109375, + "grad_norm_var": 0.0226470947265625, + "learning_rate": 0.0001, + "loss": 4.1127, + "loss/crossentropy": 2.0973563194274902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21355029940605164, + "step": 8982 + }, + { + "epoch": 0.17968, + "grad_norm": 2.125, + "grad_norm_var": 0.018896484375, + "learning_rate": 0.0001, + "loss": 4.3898, + "loss/crossentropy": 2.1109927892684937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2096879929304123, + "step": 8984 + }, + { + "epoch": 0.17972, + "grad_norm": 2.078125, + "grad_norm_var": 0.0126617431640625, + "learning_rate": 0.0001, + "loss": 4.1264, + "loss/crossentropy": 1.954129159450531, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20745816081762314, + "step": 8986 + }, + { + "epoch": 0.17976, + "grad_norm": 2.390625, + "grad_norm_var": 0.013483683268229166, + "learning_rate": 0.0001, + "loss": 4.5357, + "loss/crossentropy": 1.875806748867035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19150983542203903, + "step": 8988 + }, + { + "epoch": 0.1798, + "grad_norm": 2.109375, + "grad_norm_var": 0.01451416015625, + "learning_rate": 0.0001, + "loss": 4.3042, + "loss/crossentropy": 2.3101617097854614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2393202781677246, + "step": 8990 + }, + { + "epoch": 0.17984, + "grad_norm": 2.234375, + "grad_norm_var": 0.013374837239583333, + "learning_rate": 0.0001, + "loss": 4.4954, + "loss/crossentropy": 2.3156551122665405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23346291482448578, + "step": 8992 + }, + { + "epoch": 0.17988, + "grad_norm": 2.1875, + "grad_norm_var": 0.013966623942057292, + "learning_rate": 0.0001, + "loss": 4.3675, + "loss/crossentropy": 2.4437999725341797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2172817811369896, + "step": 8994 + }, + { + "epoch": 0.17992, + "grad_norm": 2.046875, + "grad_norm_var": 0.009437815348307291, + "learning_rate": 0.0001, + "loss": 4.3122, + "loss/crossentropy": 2.0451250076293945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22556670010089874, + "step": 8996 + }, + { + "epoch": 0.17996, + "grad_norm": 2.21875, + "grad_norm_var": 0.009852854410807292, + "learning_rate": 0.0001, + "loss": 4.5654, + "loss/crossentropy": 2.3135393857955933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2313593551516533, + "step": 8998 + }, + { + "epoch": 0.18, + "grad_norm": 2.125, + "grad_norm_var": 0.010530344645182292, + "learning_rate": 0.0001, + "loss": 4.3001, + "loss/crossentropy": 1.9934805035591125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20815817266702652, + "step": 9000 + }, + { + "epoch": 0.18004, + "grad_norm": 2.125, + "grad_norm_var": 0.011982981363932292, + "learning_rate": 0.0001, + "loss": 4.6193, + "loss/crossentropy": 2.1770662665367126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23489046096801758, + "step": 9002 + }, + { + "epoch": 0.18008, + "grad_norm": 2.3125, + "grad_norm_var": 0.009779612223307291, + "learning_rate": 0.0001, + "loss": 4.3708, + "loss/crossentropy": 1.9791364073753357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2139936238527298, + "step": 9004 + }, + { + "epoch": 0.18012, + "grad_norm": 2.015625, + "grad_norm_var": 0.011445871988932292, + "learning_rate": 0.0001, + "loss": 4.298, + "loss/crossentropy": 2.2092931270599365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22620443254709244, + "step": 9006 + }, + { + "epoch": 0.18016, + "grad_norm": 2.171875, + "grad_norm_var": 0.011034901936848958, + "learning_rate": 0.0001, + "loss": 4.3114, + "loss/crossentropy": 2.123443365097046, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21943332999944687, + "step": 9008 + }, + { + "epoch": 0.1802, + "grad_norm": 2.09375, + "grad_norm_var": 0.009212239583333334, + "learning_rate": 0.0001, + "loss": 4.4666, + "loss/crossentropy": 2.243329405784607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24108020961284637, + "step": 9010 + }, + { + "epoch": 0.18024, + "grad_norm": 2.1875, + "grad_norm_var": 0.008426920572916666, + "learning_rate": 0.0001, + "loss": 4.3687, + "loss/crossentropy": 2.366227388381958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21806316077709198, + "step": 9012 + }, + { + "epoch": 0.18028, + "grad_norm": 2.03125, + "grad_norm_var": 0.009663899739583334, + "learning_rate": 0.0001, + "loss": 3.8255, + "loss/crossentropy": 1.768812358379364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1861925944685936, + "step": 9014 + }, + { + "epoch": 0.18032, + "grad_norm": 1.9296875, + "grad_norm_var": 0.012225087483723958, + "learning_rate": 0.0001, + "loss": 4.1236, + "loss/crossentropy": 1.9376537799835205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18691913783550262, + "step": 9016 + }, + { + "epoch": 0.18036, + "grad_norm": 2.140625, + "grad_norm_var": 0.010709381103515625, + "learning_rate": 0.0001, + "loss": 4.2748, + "loss/crossentropy": 2.3026299476623535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23757526278495789, + "step": 9018 + }, + { + "epoch": 0.1804, + "grad_norm": 1.9453125, + "grad_norm_var": 0.010057576497395833, + "learning_rate": 0.0001, + "loss": 4.0026, + "loss/crossentropy": 1.9697216153144836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21453910320997238, + "step": 9020 + }, + { + "epoch": 0.18044, + "grad_norm": 2.0625, + "grad_norm_var": 0.011058553059895834, + "learning_rate": 0.0001, + "loss": 4.5457, + "loss/crossentropy": 2.257638931274414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24236120283603668, + "step": 9022 + }, + { + "epoch": 0.18048, + "grad_norm": 2.0625, + "grad_norm_var": 0.0115142822265625, + "learning_rate": 0.0001, + "loss": 4.3323, + "loss/crossentropy": 2.244320869445801, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25429578125476837, + "step": 9024 + }, + { + "epoch": 0.18052, + "grad_norm": 2.21875, + "grad_norm_var": 0.012214152018229167, + "learning_rate": 0.0001, + "loss": 4.2854, + "loss/crossentropy": 2.1105872988700867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23081901669502258, + "step": 9026 + }, + { + "epoch": 0.18056, + "grad_norm": 1.9921875, + "grad_norm_var": 0.012839508056640626, + "learning_rate": 0.0001, + "loss": 4.2561, + "loss/crossentropy": 1.9657647609710693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22111424803733826, + "step": 9028 + }, + { + "epoch": 0.1806, + "grad_norm": 2.125, + "grad_norm_var": 0.012308502197265625, + "learning_rate": 0.0001, + "loss": 4.4019, + "loss/crossentropy": 2.0759438276290894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22763221710920334, + "step": 9030 + }, + { + "epoch": 0.18064, + "grad_norm": 2.125, + "grad_norm_var": 0.010888417561848959, + "learning_rate": 0.0001, + "loss": 3.9999, + "loss/crossentropy": 2.0250572562217712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2279352843761444, + "step": 9032 + }, + { + "epoch": 0.18068, + "grad_norm": 2.09375, + "grad_norm_var": 0.010534413655598958, + "learning_rate": 0.0001, + "loss": 4.3772, + "loss/crossentropy": 2.270031213760376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23242096602916718, + "step": 9034 + }, + { + "epoch": 0.18072, + "grad_norm": 2.25, + "grad_norm_var": 0.011922200520833334, + "learning_rate": 0.0001, + "loss": 4.4304, + "loss/crossentropy": 2.10041344165802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23192601650953293, + "step": 9036 + }, + { + "epoch": 0.18076, + "grad_norm": 2.15625, + "grad_norm_var": 0.009227498372395834, + "learning_rate": 0.0001, + "loss": 4.1041, + "loss/crossentropy": 2.0255953073501587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23707614094018936, + "step": 9038 + }, + { + "epoch": 0.1808, + "grad_norm": 1.9453125, + "grad_norm_var": 0.010406239827473959, + "learning_rate": 0.0001, + "loss": 4.1399, + "loss/crossentropy": 1.8162729740142822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19466694444417953, + "step": 9040 + }, + { + "epoch": 0.18084, + "grad_norm": 2.203125, + "grad_norm_var": 0.010170237223307291, + "learning_rate": 0.0001, + "loss": 4.5543, + "loss/crossentropy": 2.1271599531173706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22750889509916306, + "step": 9042 + }, + { + "epoch": 0.18088, + "grad_norm": 2.171875, + "grad_norm_var": 0.009447224934895833, + "learning_rate": 0.0001, + "loss": 4.5377, + "loss/crossentropy": 2.3638603687286377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2456393539905548, + "step": 9044 + }, + { + "epoch": 0.18092, + "grad_norm": 2.125, + "grad_norm_var": 0.019437662760416665, + "learning_rate": 0.0001, + "loss": 4.3413, + "loss/crossentropy": 1.5851669907569885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20994101464748383, + "step": 9046 + }, + { + "epoch": 0.18096, + "grad_norm": 2.1875, + "grad_norm_var": 0.017789459228515624, + "learning_rate": 0.0001, + "loss": 4.2284, + "loss/crossentropy": 1.7990906834602356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18919725716114044, + "step": 9048 + }, + { + "epoch": 0.181, + "grad_norm": 2.28125, + "grad_norm_var": 0.017490386962890625, + "learning_rate": 0.0001, + "loss": 4.5415, + "loss/crossentropy": 2.0975595712661743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23837832361459732, + "step": 9050 + }, + { + "epoch": 0.18104, + "grad_norm": 2.0625, + "grad_norm_var": 0.018070475260416666, + "learning_rate": 0.0001, + "loss": 4.1304, + "loss/crossentropy": 2.1970856189727783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23219536244869232, + "step": 9052 + }, + { + "epoch": 0.18108, + "grad_norm": 2.15625, + "grad_norm_var": 0.017292277018229166, + "learning_rate": 0.0001, + "loss": 4.211, + "loss/crossentropy": 2.0846009850502014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20775136351585388, + "step": 9054 + }, + { + "epoch": 0.18112, + "grad_norm": 2.0625, + "grad_norm_var": 0.015075429280598959, + "learning_rate": 0.0001, + "loss": 4.0415, + "loss/crossentropy": 1.662496030330658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18223516643047333, + "step": 9056 + }, + { + "epoch": 0.18116, + "grad_norm": 2.046875, + "grad_norm_var": 0.016078440348307292, + "learning_rate": 0.0001, + "loss": 4.2561, + "loss/crossentropy": 2.126902401447296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2193778082728386, + "step": 9058 + }, + { + "epoch": 0.1812, + "grad_norm": 2.984375, + "grad_norm_var": 0.062459309895833336, + "learning_rate": 0.0001, + "loss": 4.0343, + "loss/crossentropy": 1.9529814720153809, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19026879966259003, + "step": 9060 + }, + { + "epoch": 0.18124, + "grad_norm": 1.953125, + "grad_norm_var": 0.05981852213541667, + "learning_rate": 0.0001, + "loss": 4.1911, + "loss/crossentropy": 2.140450179576874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23864319920539856, + "step": 9062 + }, + { + "epoch": 0.18128, + "grad_norm": 2.015625, + "grad_norm_var": 0.06083882649739583, + "learning_rate": 0.0001, + "loss": 4.3442, + "loss/crossentropy": 2.4139195680618286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2360195592045784, + "step": 9064 + }, + { + "epoch": 0.18132, + "grad_norm": 2.046875, + "grad_norm_var": 0.06083882649739583, + "learning_rate": 0.0001, + "loss": 4.5122, + "loss/crossentropy": 2.190356135368347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22671552002429962, + "step": 9066 + }, + { + "epoch": 0.18136, + "grad_norm": 2.078125, + "grad_norm_var": 0.05916315714518229, + "learning_rate": 0.0001, + "loss": 4.281, + "loss/crossentropy": 1.9428812861442566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22318705916404724, + "step": 9068 + }, + { + "epoch": 0.1814, + "grad_norm": 2.046875, + "grad_norm_var": 0.059242502848307295, + "learning_rate": 0.0001, + "loss": 4.4628, + "loss/crossentropy": 2.296473503112793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22005227208137512, + "step": 9070 + }, + { + "epoch": 0.18144, + "grad_norm": 2.1875, + "grad_norm_var": 0.05919774373372396, + "learning_rate": 0.0001, + "loss": 4.5848, + "loss/crossentropy": 2.2118901014328003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22189343720674515, + "step": 9072 + }, + { + "epoch": 0.18148, + "grad_norm": 2.15625, + "grad_norm_var": 0.05810114542643229, + "learning_rate": 0.0001, + "loss": 4.285, + "loss/crossentropy": 1.8919037580490112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1986190229654312, + "step": 9074 + }, + { + "epoch": 0.18152, + "grad_norm": 2.171875, + "grad_norm_var": 0.011161295572916667, + "learning_rate": 0.0001, + "loss": 4.3046, + "loss/crossentropy": 1.975312054157257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20403321832418442, + "step": 9076 + }, + { + "epoch": 0.18156, + "grad_norm": 2.265625, + "grad_norm_var": 0.0075032552083333336, + "learning_rate": 0.0001, + "loss": 4.2175, + "loss/crossentropy": 1.8076966404914856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19708115607500076, + "step": 9078 + }, + { + "epoch": 0.1816, + "grad_norm": 2.265625, + "grad_norm_var": 0.007112630208333333, + "learning_rate": 0.0001, + "loss": 4.2166, + "loss/crossentropy": 2.101171374320984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22123181074857712, + "step": 9080 + }, + { + "epoch": 0.18164, + "grad_norm": 2.1875, + "grad_norm_var": 0.007225545247395834, + "learning_rate": 0.0001, + "loss": 4.2292, + "loss/crossentropy": 2.09942090511322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21761803328990936, + "step": 9082 + }, + { + "epoch": 0.18168, + "grad_norm": 2.1875, + "grad_norm_var": 0.008072916666666667, + "learning_rate": 0.0001, + "loss": 4.4802, + "loss/crossentropy": 2.4418424367904663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24908769130706787, + "step": 9084 + }, + { + "epoch": 0.18172, + "grad_norm": 2.109375, + "grad_norm_var": 0.0087890625, + "learning_rate": 0.0001, + "loss": 4.3408, + "loss/crossentropy": 1.944950520992279, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20515284687280655, + "step": 9086 + }, + { + "epoch": 0.18176, + "grad_norm": 2.21875, + "grad_norm_var": 0.007991536458333334, + "learning_rate": 0.0001, + "loss": 4.3117, + "loss/crossentropy": 1.870418667793274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20427027344703674, + "step": 9088 + }, + { + "epoch": 0.1818, + "grad_norm": 2.171875, + "grad_norm_var": 0.007682291666666666, + "learning_rate": 0.0001, + "loss": 4.1323, + "loss/crossentropy": 1.9338520169258118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19928501546382904, + "step": 9090 + }, + { + "epoch": 0.18184, + "grad_norm": 1.9609375, + "grad_norm_var": 0.008876291910807292, + "learning_rate": 0.0001, + "loss": 3.9734, + "loss/crossentropy": 1.7826221585273743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19778436422348022, + "step": 9092 + }, + { + "epoch": 0.18188, + "grad_norm": 2.078125, + "grad_norm_var": 0.007458241780598959, + "learning_rate": 0.0001, + "loss": 4.3537, + "loss/crossentropy": 2.2922680377960205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23613491654396057, + "step": 9094 + }, + { + "epoch": 0.18192, + "grad_norm": 2.296875, + "grad_norm_var": 0.15102513631184897, + "learning_rate": 0.0001, + "loss": 4.6789, + "loss/crossentropy": 2.1802788972854614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1969536542892456, + "step": 9096 + }, + { + "epoch": 0.18196, + "grad_norm": 2.15625, + "grad_norm_var": 0.1498308817545573, + "learning_rate": 0.0001, + "loss": 4.1715, + "loss/crossentropy": 1.9129992723464966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1942882016301155, + "step": 9098 + }, + { + "epoch": 0.182, + "grad_norm": 2.203125, + "grad_norm_var": 0.14738337198893228, + "learning_rate": 0.0001, + "loss": 4.3374, + "loss/crossentropy": 1.8288249969482422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19879616051912308, + "step": 9100 + }, + { + "epoch": 0.18204, + "grad_norm": 2.3125, + "grad_norm_var": 0.14580663045247397, + "learning_rate": 0.0001, + "loss": 4.2824, + "loss/crossentropy": 2.039812684059143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21374420076608658, + "step": 9102 + }, + { + "epoch": 0.18208, + "grad_norm": 1.9609375, + "grad_norm_var": 0.15449193318684895, + "learning_rate": 0.0001, + "loss": 4.1862, + "loss/crossentropy": 2.1177414059638977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2193453460931778, + "step": 9104 + }, + { + "epoch": 0.18212, + "grad_norm": 2.28125, + "grad_norm_var": 0.16704076131184895, + "learning_rate": 0.0001, + "loss": 4.4869, + "loss/crossentropy": 2.171097159385681, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21211445331573486, + "step": 9106 + }, + { + "epoch": 0.18216, + "grad_norm": 2.359375, + "grad_norm_var": 0.16413548787434895, + "learning_rate": 0.0001, + "loss": 4.1422, + "loss/crossentropy": 1.9781638383865356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21307373046875, + "step": 9108 + }, + { + "epoch": 0.1822, + "grad_norm": 1.984375, + "grad_norm_var": 0.17157363891601562, + "learning_rate": 0.0001, + "loss": 4.2454, + "loss/crossentropy": 2.0868560075759888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20633937418460846, + "step": 9110 + }, + { + "epoch": 0.18224, + "grad_norm": 2.03125, + "grad_norm_var": 0.04592463175455729, + "learning_rate": 0.0001, + "loss": 4.1153, + "loss/crossentropy": 2.079172134399414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20709815621376038, + "step": 9112 + }, + { + "epoch": 0.18228, + "grad_norm": 2.359375, + "grad_norm_var": 0.04835383097330729, + "learning_rate": 0.0001, + "loss": 4.1836, + "loss/crossentropy": 2.100913643836975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21831409633159637, + "step": 9114 + }, + { + "epoch": 0.18232, + "grad_norm": 2.203125, + "grad_norm_var": 0.049478912353515626, + "learning_rate": 0.0001, + "loss": 4.4087, + "loss/crossentropy": 2.018395781517029, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22545243054628372, + "step": 9116 + }, + { + "epoch": 0.18236, + "grad_norm": 2.203125, + "grad_norm_var": 0.04278132120768229, + "learning_rate": 0.0001, + "loss": 4.3421, + "loss/crossentropy": 1.9177632331848145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23346271365880966, + "step": 9118 + }, + { + "epoch": 0.1824, + "grad_norm": 2.0625, + "grad_norm_var": 0.038358306884765624, + "learning_rate": 0.0001, + "loss": 3.9924, + "loss/crossentropy": 2.1122357845306396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22593770176172256, + "step": 9120 + }, + { + "epoch": 0.18244, + "grad_norm": 2.03125, + "grad_norm_var": 0.014021555582682291, + "learning_rate": 0.0001, + "loss": 4.1399, + "loss/crossentropy": 1.7920495867729187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2127150148153305, + "step": 9122 + }, + { + "epoch": 0.18248, + "grad_norm": 2.1875, + "grad_norm_var": 0.0096099853515625, + "learning_rate": 0.0001, + "loss": 4.2061, + "loss/crossentropy": 2.2180920839309692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25088224560022354, + "step": 9124 + }, + { + "epoch": 0.18252, + "grad_norm": 2.1875, + "grad_norm_var": 0.008185831705729167, + "learning_rate": 0.0001, + "loss": 4.3287, + "loss/crossentropy": 1.8999969959259033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21616832166910172, + "step": 9126 + }, + { + "epoch": 0.18256, + "grad_norm": 2.125, + "grad_norm_var": 0.0081451416015625, + "learning_rate": 0.0001, + "loss": 4.5351, + "loss/crossentropy": 1.9424527287483215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22016742825508118, + "step": 9128 + }, + { + "epoch": 0.1826, + "grad_norm": 2.0625, + "grad_norm_var": 0.0049468994140625, + "learning_rate": 0.0001, + "loss": 4.4432, + "loss/crossentropy": 2.0539366006851196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26818516850471497, + "step": 9130 + }, + { + "epoch": 0.18264, + "grad_norm": 2.0625, + "grad_norm_var": 0.00458984375, + "learning_rate": 0.0001, + "loss": 4.1634, + "loss/crossentropy": 1.9180519580841064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2067112922668457, + "step": 9132 + }, + { + "epoch": 0.18268, + "grad_norm": 2.078125, + "grad_norm_var": 0.005052693684895833, + "learning_rate": 0.0001, + "loss": 3.9913, + "loss/crossentropy": 1.7417545318603516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1970825269818306, + "step": 9134 + }, + { + "epoch": 0.18272, + "grad_norm": 2.09375, + "grad_norm_var": 0.004833984375, + "learning_rate": 0.0001, + "loss": 4.016, + "loss/crossentropy": 1.9498217701911926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21281737089157104, + "step": 9136 + }, + { + "epoch": 0.18276, + "grad_norm": 2.21875, + "grad_norm_var": 0.005615234375, + "learning_rate": 0.0001, + "loss": 4.3842, + "loss/crossentropy": 2.140692949295044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24291902035474777, + "step": 9138 + }, + { + "epoch": 0.1828, + "grad_norm": 1.984375, + "grad_norm_var": 0.0069976806640625, + "learning_rate": 0.0001, + "loss": 4.1785, + "loss/crossentropy": 2.3510342836380005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23326712846755981, + "step": 9140 + }, + { + "epoch": 0.18284, + "grad_norm": 2.140625, + "grad_norm_var": 0.0072743733723958336, + "learning_rate": 0.0001, + "loss": 4.0707, + "loss/crossentropy": 2.049591898918152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21118033677339554, + "step": 9142 + }, + { + "epoch": 0.18288, + "grad_norm": 2.109375, + "grad_norm_var": 0.005631510416666667, + "learning_rate": 0.0001, + "loss": 4.263, + "loss/crossentropy": 1.949703335762024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22161198407411575, + "step": 9144 + }, + { + "epoch": 0.18292, + "grad_norm": 2.109375, + "grad_norm_var": 0.005399576822916667, + "learning_rate": 0.0001, + "loss": 4.5602, + "loss/crossentropy": 2.1442413330078125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21281076222658157, + "step": 9146 + }, + { + "epoch": 0.18296, + "grad_norm": 2.078125, + "grad_norm_var": 0.007111612955729167, + "learning_rate": 0.0001, + "loss": 4.0848, + "loss/crossentropy": 2.103494882583618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20892268419265747, + "step": 9148 + }, + { + "epoch": 0.183, + "grad_norm": 2.25, + "grad_norm_var": 0.01060791015625, + "learning_rate": 0.0001, + "loss": 4.133, + "loss/crossentropy": 2.1544495224952698, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2008143737912178, + "step": 9150 + }, + { + "epoch": 0.18304, + "grad_norm": 1.984375, + "grad_norm_var": 0.01109619140625, + "learning_rate": 0.0001, + "loss": 4.2799, + "loss/crossentropy": 2.021821677684784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20975126326084137, + "step": 9152 + }, + { + "epoch": 0.18308, + "grad_norm": 1.921875, + "grad_norm_var": 0.0102203369140625, + "learning_rate": 0.0001, + "loss": 4.2858, + "loss/crossentropy": 2.109215199947357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19855067878961563, + "step": 9154 + }, + { + "epoch": 0.18312, + "grad_norm": 2.09375, + "grad_norm_var": 0.011165364583333334, + "learning_rate": 0.0001, + "loss": 4.2181, + "loss/crossentropy": 1.7631941437721252, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20584283769130707, + "step": 9156 + }, + { + "epoch": 0.18316, + "grad_norm": 2.171875, + "grad_norm_var": 0.0116851806640625, + "learning_rate": 0.0001, + "loss": 4.4005, + "loss/crossentropy": 2.131524443626404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2151477411389351, + "step": 9158 + }, + { + "epoch": 0.1832, + "grad_norm": 2.015625, + "grad_norm_var": 0.01334228515625, + "learning_rate": 0.0001, + "loss": 4.2729, + "loss/crossentropy": 2.018375277519226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21982873231172562, + "step": 9160 + }, + { + "epoch": 0.18324, + "grad_norm": 2.09375, + "grad_norm_var": 0.0154205322265625, + "learning_rate": 0.0001, + "loss": 4.3302, + "loss/crossentropy": 2.217617154121399, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2257251739501953, + "step": 9162 + }, + { + "epoch": 0.18328, + "grad_norm": 2.15625, + "grad_norm_var": 0.014606730143229166, + "learning_rate": 0.0001, + "loss": 4.227, + "loss/crossentropy": 1.8632460832595825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19728046655654907, + "step": 9164 + }, + { + "epoch": 0.18332, + "grad_norm": 2.0625, + "grad_norm_var": 0.012137858072916667, + "learning_rate": 0.0001, + "loss": 4.5335, + "loss/crossentropy": 2.2818111181259155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22397568821907043, + "step": 9166 + }, + { + "epoch": 0.18336, + "grad_norm": 2.109375, + "grad_norm_var": 0.01177978515625, + "learning_rate": 0.0001, + "loss": 4.4104, + "loss/crossentropy": 2.1209938526153564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22490206360816956, + "step": 9168 + }, + { + "epoch": 0.1834, + "grad_norm": 2.046875, + "grad_norm_var": 0.00924072265625, + "learning_rate": 0.0001, + "loss": 4.244, + "loss/crossentropy": 2.141623795032501, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21710850298404694, + "step": 9170 + }, + { + "epoch": 0.18344, + "grad_norm": 2.125, + "grad_norm_var": 0.007323201497395833, + "learning_rate": 0.0001, + "loss": 4.2063, + "loss/crossentropy": 2.165239691734314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.235738106071949, + "step": 9172 + }, + { + "epoch": 0.18348, + "grad_norm": 2.015625, + "grad_norm_var": 0.008103179931640624, + "learning_rate": 0.0001, + "loss": 4.0186, + "loss/crossentropy": 1.8649475574493408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2088451236486435, + "step": 9174 + }, + { + "epoch": 0.18352, + "grad_norm": 2.15625, + "grad_norm_var": 0.007012685139973958, + "learning_rate": 0.0001, + "loss": 4.2853, + "loss/crossentropy": 2.312218189239502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.236283540725708, + "step": 9176 + }, + { + "epoch": 0.18356, + "grad_norm": 2.109375, + "grad_norm_var": 0.008699544270833333, + "learning_rate": 0.0001, + "loss": 3.9626, + "loss/crossentropy": 2.0149282217025757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21434535831212997, + "step": 9178 + }, + { + "epoch": 0.1836, + "grad_norm": 2.125, + "grad_norm_var": 0.0086822509765625, + "learning_rate": 0.0001, + "loss": 4.5081, + "loss/crossentropy": 2.499966621398926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26318275928497314, + "step": 9180 + }, + { + "epoch": 0.18364, + "grad_norm": 2.140625, + "grad_norm_var": 0.0084136962890625, + "learning_rate": 0.0001, + "loss": 4.4498, + "loss/crossentropy": 2.0454984307289124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2202616035938263, + "step": 9182 + }, + { + "epoch": 0.18368, + "grad_norm": 1.9609375, + "grad_norm_var": 0.010835520426432292, + "learning_rate": 0.0001, + "loss": 4.0754, + "loss/crossentropy": 2.1708725094795227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20958629250526428, + "step": 9184 + }, + { + "epoch": 0.18372, + "grad_norm": 2.140625, + "grad_norm_var": 0.010792795817057292, + "learning_rate": 0.0001, + "loss": 4.2104, + "loss/crossentropy": 1.8261350989341736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1952563151717186, + "step": 9186 + }, + { + "epoch": 0.18376, + "grad_norm": 2.046875, + "grad_norm_var": 0.010009511311848959, + "learning_rate": 0.0001, + "loss": 4.1247, + "loss/crossentropy": 2.036627769470215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22490298002958298, + "step": 9188 + }, + { + "epoch": 0.1838, + "grad_norm": 1.9765625, + "grad_norm_var": 0.010501861572265625, + "learning_rate": 0.0001, + "loss": 4.0088, + "loss/crossentropy": 1.7977086305618286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19282807409763336, + "step": 9190 + }, + { + "epoch": 0.18384, + "grad_norm": 2.21875, + "grad_norm_var": 0.011211903889973958, + "learning_rate": 0.0001, + "loss": 4.1774, + "loss/crossentropy": 2.170135021209717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23149186372756958, + "step": 9192 + }, + { + "epoch": 0.18388, + "grad_norm": 2.328125, + "grad_norm_var": 0.012035115559895834, + "learning_rate": 0.0001, + "loss": 4.4155, + "loss/crossentropy": 2.1453020572662354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25120319426059723, + "step": 9194 + }, + { + "epoch": 0.18392, + "grad_norm": 2.125, + "grad_norm_var": 0.01224365234375, + "learning_rate": 0.0001, + "loss": 4.3628, + "loss/crossentropy": 1.8794063925743103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20883548259735107, + "step": 9196 + }, + { + "epoch": 0.18396, + "grad_norm": 2.3125, + "grad_norm_var": 0.0141998291015625, + "learning_rate": 0.0001, + "loss": 4.3384, + "loss/crossentropy": 2.021254241466522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21939975768327713, + "step": 9198 + }, + { + "epoch": 0.184, + "grad_norm": 2.09375, + "grad_norm_var": 0.009557851155598958, + "learning_rate": 0.0001, + "loss": 4.158, + "loss/crossentropy": 2.195580303668976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2108924463391304, + "step": 9200 + }, + { + "epoch": 0.18404, + "grad_norm": 2.21875, + "grad_norm_var": 0.014085896809895833, + "learning_rate": 0.0001, + "loss": 4.0781, + "loss/crossentropy": 1.6260902881622314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20428159832954407, + "step": 9202 + }, + { + "epoch": 0.18408, + "grad_norm": 1.96875, + "grad_norm_var": 0.015363566080729167, + "learning_rate": 0.0001, + "loss": 4.2525, + "loss/crossentropy": 2.138678550720215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21641074120998383, + "step": 9204 + }, + { + "epoch": 0.18412, + "grad_norm": 2.125, + "grad_norm_var": 0.01825129191080729, + "learning_rate": 0.0001, + "loss": 4.4706, + "loss/crossentropy": 2.047194480895996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20891964435577393, + "step": 9206 + }, + { + "epoch": 0.18416, + "grad_norm": 2.171875, + "grad_norm_var": 0.019606272379557293, + "learning_rate": 0.0001, + "loss": 4.2312, + "loss/crossentropy": 2.189277768135071, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2150397077202797, + "step": 9208 + }, + { + "epoch": 0.1842, + "grad_norm": 2.140625, + "grad_norm_var": 0.017618560791015626, + "learning_rate": 0.0001, + "loss": 4.2792, + "loss/crossentropy": 2.184122920036316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23278063535690308, + "step": 9210 + }, + { + "epoch": 0.18424, + "grad_norm": 1.9921875, + "grad_norm_var": 0.019139607747395832, + "learning_rate": 0.0001, + "loss": 4.3471, + "loss/crossentropy": 2.413718104362488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24333150684833527, + "step": 9212 + }, + { + "epoch": 0.18428, + "grad_norm": 2.078125, + "grad_norm_var": 0.019017537434895832, + "learning_rate": 0.0001, + "loss": 3.8486, + "loss/crossentropy": 1.8086814880371094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19976364076137543, + "step": 9214 + }, + { + "epoch": 0.18432, + "grad_norm": 2.09375, + "grad_norm_var": 0.019978841145833332, + "learning_rate": 0.0001, + "loss": 4.5222, + "loss/crossentropy": 2.2418206930160522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.229690782725811, + "step": 9216 + }, + { + "epoch": 0.18436, + "grad_norm": 2.015625, + "grad_norm_var": 0.016355133056640624, + "learning_rate": 0.0001, + "loss": 4.0817, + "loss/crossentropy": 1.8083258867263794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20999443531036377, + "step": 9218 + }, + { + "epoch": 0.1844, + "grad_norm": 1.953125, + "grad_norm_var": 0.01693115234375, + "learning_rate": 0.0001, + "loss": 3.772, + "loss/crossentropy": 1.8117709755897522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20436270534992218, + "step": 9220 + }, + { + "epoch": 0.18444, + "grad_norm": 2.15625, + "grad_norm_var": 0.010550944010416667, + "learning_rate": 0.0001, + "loss": 4.2149, + "loss/crossentropy": 2.024270534515381, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20097267627716064, + "step": 9222 + }, + { + "epoch": 0.18448, + "grad_norm": 2.25, + "grad_norm_var": 0.011449178059895834, + "learning_rate": 0.0001, + "loss": 4.4756, + "loss/crossentropy": 2.2385981678962708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21131044626235962, + "step": 9224 + }, + { + "epoch": 0.18452, + "grad_norm": 2.140625, + "grad_norm_var": 0.011271158854166666, + "learning_rate": 0.0001, + "loss": 4.3745, + "loss/crossentropy": 2.127749502658844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22855369001626968, + "step": 9226 + }, + { + "epoch": 0.18456, + "grad_norm": 2.15625, + "grad_norm_var": 0.011237589518229167, + "learning_rate": 0.0001, + "loss": 4.2645, + "loss/crossentropy": 2.1877033710479736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2339663878083229, + "step": 9228 + }, + { + "epoch": 0.1846, + "grad_norm": 2.25, + "grad_norm_var": 0.011139933268229167, + "learning_rate": 0.0001, + "loss": 4.2642, + "loss/crossentropy": 1.931507408618927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20912021398544312, + "step": 9230 + }, + { + "epoch": 0.18464, + "grad_norm": 2.234375, + "grad_norm_var": 0.011579386393229167, + "learning_rate": 0.0001, + "loss": 4.3309, + "loss/crossentropy": 1.8101251125335693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23724676668643951, + "step": 9232 + }, + { + "epoch": 0.18468, + "grad_norm": 2.125, + "grad_norm_var": 0.0111480712890625, + "learning_rate": 0.0001, + "loss": 4.201, + "loss/crossentropy": 2.015208065509796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23482007533311844, + "step": 9234 + }, + { + "epoch": 0.18472, + "grad_norm": 2.1875, + "grad_norm_var": 0.010282135009765625, + "learning_rate": 0.0001, + "loss": 4.5357, + "loss/crossentropy": 2.2742738723754883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23632052540779114, + "step": 9236 + }, + { + "epoch": 0.18476, + "grad_norm": 2.46875, + "grad_norm_var": 0.016806793212890626, + "learning_rate": 0.0001, + "loss": 4.3574, + "loss/crossentropy": 1.7254774570465088, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2104022428393364, + "step": 9238 + }, + { + "epoch": 0.1848, + "grad_norm": 2.09375, + "grad_norm_var": 0.016585032145182293, + "learning_rate": 0.0001, + "loss": 4.3477, + "loss/crossentropy": 2.181770443916321, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22411521524190903, + "step": 9240 + }, + { + "epoch": 0.18484, + "grad_norm": 1.953125, + "grad_norm_var": 0.020157877604166666, + "learning_rate": 0.0001, + "loss": 3.9517, + "loss/crossentropy": 1.8449691534042358, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2091139778494835, + "step": 9242 + }, + { + "epoch": 0.18488, + "grad_norm": 2.1875, + "grad_norm_var": 0.017195383707682293, + "learning_rate": 0.0001, + "loss": 4.3389, + "loss/crossentropy": 2.0917609333992004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2644127458333969, + "step": 9244 + }, + { + "epoch": 0.18492, + "grad_norm": 2.28125, + "grad_norm_var": 0.01793390909830729, + "learning_rate": 0.0001, + "loss": 4.083, + "loss/crossentropy": 2.060012102127075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21294714510440826, + "step": 9246 + }, + { + "epoch": 0.18496, + "grad_norm": 2.078125, + "grad_norm_var": 0.018387603759765624, + "learning_rate": 0.0001, + "loss": 4.6598, + "loss/crossentropy": 2.059940278530121, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22648683190345764, + "step": 9248 + }, + { + "epoch": 0.185, + "grad_norm": 2.078125, + "grad_norm_var": 0.01862360636393229, + "learning_rate": 0.0001, + "loss": 4.4696, + "loss/crossentropy": 1.8423291444778442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22509171068668365, + "step": 9250 + }, + { + "epoch": 0.18504, + "grad_norm": 2.125, + "grad_norm_var": 0.016410064697265626, + "learning_rate": 0.0001, + "loss": 4.4629, + "loss/crossentropy": 2.2559698820114136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23896963894367218, + "step": 9252 + }, + { + "epoch": 0.18508, + "grad_norm": 2.03125, + "grad_norm_var": 0.01579767862955729, + "learning_rate": 0.0001, + "loss": 4.6027, + "loss/crossentropy": 2.0583658814430237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22763275355100632, + "step": 9254 + }, + { + "epoch": 0.18512, + "grad_norm": 2.015625, + "grad_norm_var": 0.016658274332682292, + "learning_rate": 0.0001, + "loss": 4.2654, + "loss/crossentropy": 1.816649854183197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18608735501766205, + "step": 9256 + }, + { + "epoch": 0.18516, + "grad_norm": 4.65625, + "grad_norm_var": 0.4073150634765625, + "learning_rate": 0.0001, + "loss": 4.2837, + "loss/crossentropy": 2.0920958518981934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22859029471874237, + "step": 9258 + }, + { + "epoch": 0.1852, + "grad_norm": 2.125, + "grad_norm_var": 0.4108306884765625, + "learning_rate": 0.0001, + "loss": 4.2517, + "loss/crossentropy": 2.1475982666015625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21678777784109116, + "step": 9260 + }, + { + "epoch": 0.18524, + "grad_norm": 2.21875, + "grad_norm_var": 0.4100901285807292, + "learning_rate": 0.0001, + "loss": 4.4258, + "loss/crossentropy": 2.22346031665802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2333592176437378, + "step": 9262 + }, + { + "epoch": 0.18528, + "grad_norm": 2.09375, + "grad_norm_var": 0.41646703084309894, + "learning_rate": 0.0001, + "loss": 4.2134, + "loss/crossentropy": 1.7652028799057007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18461769074201584, + "step": 9264 + }, + { + "epoch": 0.18532, + "grad_norm": 2.109375, + "grad_norm_var": 0.4225006103515625, + "learning_rate": 0.0001, + "loss": 4.2362, + "loss/crossentropy": 2.1549625396728516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22428707033395767, + "step": 9266 + }, + { + "epoch": 0.18536, + "grad_norm": 2.0, + "grad_norm_var": 0.4281972249348958, + "learning_rate": 0.0001, + "loss": 4.3685, + "loss/crossentropy": 2.1677842140197754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22887174785137177, + "step": 9268 + }, + { + "epoch": 0.1854, + "grad_norm": 2.09375, + "grad_norm_var": 0.42428792317708336, + "learning_rate": 0.0001, + "loss": 4.1517, + "loss/crossentropy": 1.8352131247520447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19628287106752396, + "step": 9270 + }, + { + "epoch": 0.18544, + "grad_norm": 2.109375, + "grad_norm_var": 0.43038304646809894, + "learning_rate": 0.0001, + "loss": 4.101, + "loss/crossentropy": 2.104279100894928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20481227338314056, + "step": 9272 + }, + { + "epoch": 0.18548, + "grad_norm": 2.21875, + "grad_norm_var": 0.009795888264973959, + "learning_rate": 0.0001, + "loss": 4.2011, + "loss/crossentropy": 1.8447301387786865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2227916270494461, + "step": 9274 + }, + { + "epoch": 0.18552, + "grad_norm": 2.296875, + "grad_norm_var": 0.013651275634765625, + "learning_rate": 0.0001, + "loss": 4.3544, + "loss/crossentropy": 2.303207755088806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24895529448986053, + "step": 9276 + }, + { + "epoch": 0.18556, + "grad_norm": 2.171875, + "grad_norm_var": 0.014115142822265624, + "learning_rate": 0.0001, + "loss": 4.699, + "loss/crossentropy": 2.248077630996704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23077847063541412, + "step": 9278 + }, + { + "epoch": 0.1856, + "grad_norm": 2.34375, + "grad_norm_var": 0.017438761393229165, + "learning_rate": 0.0001, + "loss": 4.4943, + "loss/crossentropy": 2.261389970779419, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23669905960559845, + "step": 9280 + }, + { + "epoch": 0.18564, + "grad_norm": 2.234375, + "grad_norm_var": 0.016056060791015625, + "learning_rate": 0.0001, + "loss": 4.3805, + "loss/crossentropy": 2.402338147163391, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2453952580690384, + "step": 9282 + }, + { + "epoch": 0.18568, + "grad_norm": 2.265625, + "grad_norm_var": 0.01587702433268229, + "learning_rate": 0.0001, + "loss": 4.2253, + "loss/crossentropy": 2.011174201965332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22673364728689194, + "step": 9284 + }, + { + "epoch": 0.18572, + "grad_norm": 2.234375, + "grad_norm_var": 0.016810862223307292, + "learning_rate": 0.0001, + "loss": 4.3897, + "loss/crossentropy": 1.9700093269348145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22711393237113953, + "step": 9286 + }, + { + "epoch": 0.18576, + "grad_norm": 2.140625, + "grad_norm_var": 0.014922841389973959, + "learning_rate": 0.0001, + "loss": 3.9609, + "loss/crossentropy": 2.0253939032554626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21902555227279663, + "step": 9288 + }, + { + "epoch": 0.1858, + "grad_norm": 2.296875, + "grad_norm_var": 0.016961415608723957, + "learning_rate": 0.0001, + "loss": 4.1799, + "loss/crossentropy": 1.741984784603119, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1879509538412094, + "step": 9290 + }, + { + "epoch": 0.18584, + "grad_norm": 2.296875, + "grad_norm_var": 0.022564442952473958, + "learning_rate": 0.0001, + "loss": 4.7983, + "loss/crossentropy": 2.3156943321228027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2687100023031235, + "step": 9292 + }, + { + "epoch": 0.18588, + "grad_norm": 2.015625, + "grad_norm_var": 0.025833892822265624, + "learning_rate": 0.0001, + "loss": 4.2894, + "loss/crossentropy": 2.0826371908187866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21234872937202454, + "step": 9294 + }, + { + "epoch": 0.18592, + "grad_norm": 2.078125, + "grad_norm_var": 0.025921376546223958, + "learning_rate": 0.0001, + "loss": 4.6473, + "loss/crossentropy": 2.4080610275268555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2670409381389618, + "step": 9296 + }, + { + "epoch": 0.18596, + "grad_norm": 2.046875, + "grad_norm_var": 0.02958958943684896, + "learning_rate": 0.0001, + "loss": 4.1997, + "loss/crossentropy": 1.722270905971527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20094333589076996, + "step": 9298 + }, + { + "epoch": 0.186, + "grad_norm": 2.03125, + "grad_norm_var": 0.028527577718098957, + "learning_rate": 0.0001, + "loss": 4.3247, + "loss/crossentropy": 2.0514711141586304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.209539495408535, + "step": 9300 + }, + { + "epoch": 0.18604, + "grad_norm": 2.640625, + "grad_norm_var": 2.5507850646972656, + "learning_rate": 0.0001, + "loss": 4.9819, + "loss/crossentropy": 2.4808409214019775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2367517650127411, + "step": 9302 + }, + { + "epoch": 0.18608, + "grad_norm": 2.078125, + "grad_norm_var": 2.536018880208333, + "learning_rate": 0.0001, + "loss": 4.1105, + "loss/crossentropy": 2.2539944648742676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22834083437919617, + "step": 9304 + }, + { + "epoch": 0.18612, + "grad_norm": 2.078125, + "grad_norm_var": 2.5449544270833333, + "learning_rate": 0.0001, + "loss": 4.2383, + "loss/crossentropy": 1.9335210919380188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20264607667922974, + "step": 9306 + }, + { + "epoch": 0.18616, + "grad_norm": 2.109375, + "grad_norm_var": 2.5707194010416665, + "learning_rate": 0.0001, + "loss": 4.2285, + "loss/crossentropy": 1.90863037109375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20882528275251389, + "step": 9308 + }, + { + "epoch": 0.1862, + "grad_norm": 2.1875, + "grad_norm_var": 2.5589996337890626, + "learning_rate": 0.0001, + "loss": 4.2195, + "loss/crossentropy": 2.1449084281921387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2329169511795044, + "step": 9310 + }, + { + "epoch": 0.18624, + "grad_norm": 2.015625, + "grad_norm_var": 2.567341105143229, + "learning_rate": 0.0001, + "loss": 4.1806, + "loss/crossentropy": 1.9571366906166077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2148708775639534, + "step": 9312 + }, + { + "epoch": 0.18628, + "grad_norm": 2.25, + "grad_norm_var": 2.542252604166667, + "learning_rate": 0.0001, + "loss": 4.4691, + "loss/crossentropy": 2.174731135368347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2280728593468666, + "step": 9314 + }, + { + "epoch": 0.18632, + "grad_norm": 1.96875, + "grad_norm_var": 2.5416575113932294, + "learning_rate": 0.0001, + "loss": 4.4344, + "loss/crossentropy": 1.9569833874702454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20855706185102463, + "step": 9316 + }, + { + "epoch": 0.18636, + "grad_norm": 2.1875, + "grad_norm_var": 0.0090972900390625, + "learning_rate": 0.0001, + "loss": 4.3162, + "loss/crossentropy": 2.153563976287842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22807130962610245, + "step": 9318 + }, + { + "epoch": 0.1864, + "grad_norm": 2.203125, + "grad_norm_var": 0.010741933186848959, + "learning_rate": 0.0001, + "loss": 4.3028, + "loss/crossentropy": 1.9219058752059937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2295902967453003, + "step": 9320 + }, + { + "epoch": 0.18644, + "grad_norm": 1.96875, + "grad_norm_var": 0.011926015218098959, + "learning_rate": 0.0001, + "loss": 4.29, + "loss/crossentropy": 2.1993675231933594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.223759263753891, + "step": 9322 + }, + { + "epoch": 0.18648, + "grad_norm": 2.078125, + "grad_norm_var": 0.011730702718098958, + "learning_rate": 0.0001, + "loss": 4.2254, + "loss/crossentropy": 2.173740863800049, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23405525833368301, + "step": 9324 + }, + { + "epoch": 0.18652, + "grad_norm": 2.25, + "grad_norm_var": 0.012564849853515626, + "learning_rate": 0.0001, + "loss": 4.3717, + "loss/crossentropy": 2.026577115058899, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2265823632478714, + "step": 9326 + }, + { + "epoch": 0.18656, + "grad_norm": 2.671875, + "grad_norm_var": 0.030326080322265626, + "learning_rate": 0.0001, + "loss": 4.4664, + "loss/crossentropy": 2.3629637956619263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23659023642539978, + "step": 9328 + }, + { + "epoch": 0.1866, + "grad_norm": 2.234375, + "grad_norm_var": 0.03050715128580729, + "learning_rate": 0.0001, + "loss": 4.3245, + "loss/crossentropy": 2.100727915763855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21468425542116165, + "step": 9330 + }, + { + "epoch": 0.18664, + "grad_norm": 2.109375, + "grad_norm_var": 0.025923411051432293, + "learning_rate": 0.0001, + "loss": 4.5291, + "loss/crossentropy": 2.163568615913391, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23402437567710876, + "step": 9332 + }, + { + "epoch": 0.18668, + "grad_norm": 1.9375, + "grad_norm_var": 0.02934748331705729, + "learning_rate": 0.0001, + "loss": 3.9687, + "loss/crossentropy": 1.9579994082450867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19713342934846878, + "step": 9334 + }, + { + "epoch": 0.18672, + "grad_norm": 1.9609375, + "grad_norm_var": 0.029412587483723957, + "learning_rate": 0.0001, + "loss": 4.2586, + "loss/crossentropy": 1.805375874042511, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22647518664598465, + "step": 9336 + }, + { + "epoch": 0.18676, + "grad_norm": 2.171875, + "grad_norm_var": 0.02797215779622396, + "learning_rate": 0.0001, + "loss": 4.2025, + "loss/crossentropy": 2.0764458775520325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2270444855093956, + "step": 9338 + }, + { + "epoch": 0.1868, + "grad_norm": 2.03125, + "grad_norm_var": 0.030987294514973958, + "learning_rate": 0.0001, + "loss": 3.8731, + "loss/crossentropy": 1.5654467940330505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15944529324769974, + "step": 9340 + }, + { + "epoch": 0.18684, + "grad_norm": 2.15625, + "grad_norm_var": 0.02976048787434896, + "learning_rate": 0.0001, + "loss": 4.3831, + "loss/crossentropy": 2.007612407207489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2061111181974411, + "step": 9342 + }, + { + "epoch": 0.18688, + "grad_norm": 1.875, + "grad_norm_var": 0.010910797119140624, + "learning_rate": 0.0001, + "loss": 4.0835, + "loss/crossentropy": 1.981432855129242, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19339826703071594, + "step": 9344 + }, + { + "epoch": 0.18692, + "grad_norm": 1.9921875, + "grad_norm_var": 0.00965576171875, + "learning_rate": 0.0001, + "loss": 4.1291, + "loss/crossentropy": 1.804275631904602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1968545839190483, + "step": 9346 + }, + { + "epoch": 0.18696, + "grad_norm": 2.078125, + "grad_norm_var": 0.008763631184895834, + "learning_rate": 0.0001, + "loss": 4.2109, + "loss/crossentropy": 1.9718617796897888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21482623368501663, + "step": 9348 + }, + { + "epoch": 0.187, + "grad_norm": 2.15625, + "grad_norm_var": 0.008397420247395834, + "learning_rate": 0.0001, + "loss": 4.1672, + "loss/crossentropy": 1.8358338475227356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20042669028043747, + "step": 9350 + }, + { + "epoch": 0.18704, + "grad_norm": 2.15625, + "grad_norm_var": 0.009089914957682292, + "learning_rate": 0.0001, + "loss": 4.2045, + "loss/crossentropy": 1.9447709321975708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21207460761070251, + "step": 9352 + }, + { + "epoch": 0.18708, + "grad_norm": 2.03125, + "grad_norm_var": 0.011356353759765625, + "learning_rate": 0.0001, + "loss": 4.3134, + "loss/crossentropy": 1.8921163082122803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19945629686117172, + "step": 9354 + }, + { + "epoch": 0.18712, + "grad_norm": 2.171875, + "grad_norm_var": 0.009580230712890625, + "learning_rate": 0.0001, + "loss": 4.2018, + "loss/crossentropy": 2.1323947310447693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2258753925561905, + "step": 9356 + }, + { + "epoch": 0.18716, + "grad_norm": 2.109375, + "grad_norm_var": 0.009277089436848959, + "learning_rate": 0.0001, + "loss": 4.4278, + "loss/crossentropy": 2.064914345741272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23450962454080582, + "step": 9358 + }, + { + "epoch": 0.1872, + "grad_norm": 2.046875, + "grad_norm_var": 0.005783843994140625, + "learning_rate": 0.0001, + "loss": 4.0173, + "loss/crossentropy": 2.1264703273773193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2169734537601471, + "step": 9360 + }, + { + "epoch": 0.18724, + "grad_norm": 2.09375, + "grad_norm_var": 0.004784138997395834, + "learning_rate": 0.0001, + "loss": 4.2926, + "loss/crossentropy": 1.7044150233268738, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19169463962316513, + "step": 9362 + }, + { + "epoch": 0.18728, + "grad_norm": 2.0625, + "grad_norm_var": 0.004541015625, + "learning_rate": 0.0001, + "loss": 4.5367, + "loss/crossentropy": 1.9475398659706116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22891414165496826, + "step": 9364 + }, + { + "epoch": 0.18732, + "grad_norm": 1.9140625, + "grad_norm_var": 0.008017730712890626, + "learning_rate": 0.0001, + "loss": 4.1416, + "loss/crossentropy": 2.134114623069763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2241017445921898, + "step": 9366 + }, + { + "epoch": 0.18736, + "grad_norm": 2.1875, + "grad_norm_var": 0.007342274983723958, + "learning_rate": 0.0001, + "loss": 4.2185, + "loss/crossentropy": 1.8317620158195496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20189791917800903, + "step": 9368 + }, + { + "epoch": 0.1874, + "grad_norm": 2.140625, + "grad_norm_var": 0.005041249593098958, + "learning_rate": 0.0001, + "loss": 4.3518, + "loss/crossentropy": 2.257538855075836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2343401461839676, + "step": 9370 + }, + { + "epoch": 0.18744, + "grad_norm": 2.09375, + "grad_norm_var": 0.005228424072265625, + "learning_rate": 0.0001, + "loss": 4.0535, + "loss/crossentropy": 2.3722634315490723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23045828938484192, + "step": 9372 + }, + { + "epoch": 0.18748, + "grad_norm": 2.078125, + "grad_norm_var": 0.005222320556640625, + "learning_rate": 0.0001, + "loss": 4.2293, + "loss/crossentropy": 2.1880545020103455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2345728725194931, + "step": 9374 + }, + { + "epoch": 0.18752, + "grad_norm": 2.3125, + "grad_norm_var": 0.008255767822265624, + "learning_rate": 0.0001, + "loss": 4.5037, + "loss/crossentropy": 1.8883287906646729, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25232937932014465, + "step": 9376 + }, + { + "epoch": 0.18756, + "grad_norm": 2.21875, + "grad_norm_var": 0.009124501546223959, + "learning_rate": 0.0001, + "loss": 4.2963, + "loss/crossentropy": 1.9253730773925781, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20324261486530304, + "step": 9378 + }, + { + "epoch": 0.1876, + "grad_norm": 2.078125, + "grad_norm_var": 0.009211985270182292, + "learning_rate": 0.0001, + "loss": 4.2745, + "loss/crossentropy": 1.961540937423706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22623597085475922, + "step": 9380 + }, + { + "epoch": 0.18764, + "grad_norm": 2.203125, + "grad_norm_var": 0.007835896809895833, + "learning_rate": 0.0001, + "loss": 4.347, + "loss/crossentropy": 2.190120279788971, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24154195934534073, + "step": 9382 + }, + { + "epoch": 0.18768, + "grad_norm": 2.140625, + "grad_norm_var": 0.007445271809895833, + "learning_rate": 0.0001, + "loss": 4.6159, + "loss/crossentropy": 2.0888350009918213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22750811278820038, + "step": 9384 + }, + { + "epoch": 0.18772, + "grad_norm": 2.1875, + "grad_norm_var": 0.008128865559895834, + "learning_rate": 0.0001, + "loss": 4.3171, + "loss/crossentropy": 1.724816918373108, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21245518326759338, + "step": 9386 + }, + { + "epoch": 0.18776, + "grad_norm": 2.0, + "grad_norm_var": 0.009307607014973959, + "learning_rate": 0.0001, + "loss": 4.0868, + "loss/crossentropy": 1.6542762517929077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1961463838815689, + "step": 9388 + }, + { + "epoch": 0.1878, + "grad_norm": 2.078125, + "grad_norm_var": 0.009714508056640625, + "learning_rate": 0.0001, + "loss": 4.4748, + "loss/crossentropy": 2.3528761863708496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23583710938692093, + "step": 9390 + }, + { + "epoch": 0.18784, + "grad_norm": 2.03125, + "grad_norm_var": 0.009012603759765625, + "learning_rate": 0.0001, + "loss": 4.394, + "loss/crossentropy": 2.181188702583313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24201467633247375, + "step": 9392 + }, + { + "epoch": 0.18788, + "grad_norm": 2.0625, + "grad_norm_var": 0.008414459228515626, + "learning_rate": 0.0001, + "loss": 4.3476, + "loss/crossentropy": 2.280683398246765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22065220028162003, + "step": 9394 + }, + { + "epoch": 0.18792, + "grad_norm": 2.21875, + "grad_norm_var": 0.009275054931640625, + "learning_rate": 0.0001, + "loss": 4.1603, + "loss/crossentropy": 1.9839438199996948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22708184272050858, + "step": 9396 + }, + { + "epoch": 0.18796, + "grad_norm": 2.125, + "grad_norm_var": 0.0070879618326822914, + "learning_rate": 0.0001, + "loss": 4.4322, + "loss/crossentropy": 2.311874270439148, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22249652445316315, + "step": 9398 + }, + { + "epoch": 0.188, + "grad_norm": 2.140625, + "grad_norm_var": 0.008503214518229166, + "learning_rate": 0.0001, + "loss": 4.3808, + "loss/crossentropy": 2.430112838745117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23544297367334366, + "step": 9400 + }, + { + "epoch": 0.18804, + "grad_norm": 1.96875, + "grad_norm_var": 0.00892333984375, + "learning_rate": 0.0001, + "loss": 4.2393, + "loss/crossentropy": 2.146397888660431, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23211465775966644, + "step": 9402 + }, + { + "epoch": 0.18808, + "grad_norm": 2.203125, + "grad_norm_var": 0.008318837483723958, + "learning_rate": 0.0001, + "loss": 4.5071, + "loss/crossentropy": 2.4402170181274414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25594406574964523, + "step": 9404 + }, + { + "epoch": 0.18812, + "grad_norm": 2.046875, + "grad_norm_var": 0.008294423421223959, + "learning_rate": 0.0001, + "loss": 4.3848, + "loss/crossentropy": 2.141621232032776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2156461626291275, + "step": 9406 + }, + { + "epoch": 0.18816, + "grad_norm": 2.0625, + "grad_norm_var": 0.006290435791015625, + "learning_rate": 0.0001, + "loss": 4.2511, + "loss/crossentropy": 2.0266456604003906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20194754749536514, + "step": 9408 + }, + { + "epoch": 0.1882, + "grad_norm": 1.9765625, + "grad_norm_var": 0.007062784830729167, + "learning_rate": 0.0001, + "loss": 4.2831, + "loss/crossentropy": 2.304496645927429, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22345459461212158, + "step": 9410 + }, + { + "epoch": 0.18824, + "grad_norm": 2.25, + "grad_norm_var": 0.007523600260416667, + "learning_rate": 0.0001, + "loss": 4.4085, + "loss/crossentropy": 1.8486470580101013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20493299514055252, + "step": 9412 + }, + { + "epoch": 0.18828, + "grad_norm": 2.09375, + "grad_norm_var": 0.0075927734375, + "learning_rate": 0.0001, + "loss": 4.4562, + "loss/crossentropy": 2.0593990683555603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22942744940519333, + "step": 9414 + }, + { + "epoch": 0.18832, + "grad_norm": 2.15625, + "grad_norm_var": 0.006528472900390625, + "learning_rate": 0.0001, + "loss": 4.1551, + "loss/crossentropy": 1.7936111688613892, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1776513010263443, + "step": 9416 + }, + { + "epoch": 0.18836, + "grad_norm": 2.0, + "grad_norm_var": 0.005997467041015625, + "learning_rate": 0.0001, + "loss": 4.4484, + "loss/crossentropy": 2.0676616430282593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21748381853103638, + "step": 9418 + }, + { + "epoch": 0.1884, + "grad_norm": 2.1875, + "grad_norm_var": 0.005236562093098958, + "learning_rate": 0.0001, + "loss": 4.3415, + "loss/crossentropy": 2.0317665934562683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2215682566165924, + "step": 9420 + }, + { + "epoch": 0.18844, + "grad_norm": 2.234375, + "grad_norm_var": 0.006359608968098959, + "learning_rate": 0.0001, + "loss": 4.5623, + "loss/crossentropy": 2.3345483541488647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24850556254386902, + "step": 9422 + }, + { + "epoch": 0.18848, + "grad_norm": 2.03125, + "grad_norm_var": 0.007043202718098958, + "learning_rate": 0.0001, + "loss": 4.3526, + "loss/crossentropy": 2.0603779554367065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20997442305088043, + "step": 9424 + }, + { + "epoch": 0.18852, + "grad_norm": 2.15625, + "grad_norm_var": 0.006591796875, + "learning_rate": 0.0001, + "loss": 4.4944, + "loss/crossentropy": 1.9450209140777588, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21465806663036346, + "step": 9426 + }, + { + "epoch": 0.18856, + "grad_norm": 2.046875, + "grad_norm_var": 0.0065266927083333336, + "learning_rate": 0.0001, + "loss": 4.2452, + "loss/crossentropy": 2.3568087816238403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24619507789611816, + "step": 9428 + }, + { + "epoch": 0.1886, + "grad_norm": 2.25, + "grad_norm_var": 0.007828776041666667, + "learning_rate": 0.0001, + "loss": 4.1976, + "loss/crossentropy": 2.2047020196914673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2519628629088402, + "step": 9430 + }, + { + "epoch": 0.18864, + "grad_norm": 1.9609375, + "grad_norm_var": 0.010188547770182292, + "learning_rate": 0.0001, + "loss": 4.3168, + "loss/crossentropy": 2.1594117879867554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21274243295192719, + "step": 9432 + }, + { + "epoch": 0.18868, + "grad_norm": 2.25, + "grad_norm_var": 0.009936269124348958, + "learning_rate": 0.0001, + "loss": 4.3847, + "loss/crossentropy": 2.120336890220642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2092270776629448, + "step": 9434 + }, + { + "epoch": 0.18872, + "grad_norm": 2.015625, + "grad_norm_var": 0.011120351155598958, + "learning_rate": 0.0001, + "loss": 4.2725, + "loss/crossentropy": 2.13198459148407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.238224595785141, + "step": 9436 + }, + { + "epoch": 0.18876, + "grad_norm": 2.09375, + "grad_norm_var": 0.010009511311848959, + "learning_rate": 0.0001, + "loss": 4.3706, + "loss/crossentropy": 1.9252395629882812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22254322469234467, + "step": 9438 + }, + { + "epoch": 0.1888, + "grad_norm": 2.09375, + "grad_norm_var": 0.0123291015625, + "learning_rate": 0.0001, + "loss": 4.0574, + "loss/crossentropy": 2.123266577720642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22276867926120758, + "step": 9440 + }, + { + "epoch": 0.18884, + "grad_norm": 2.140625, + "grad_norm_var": 0.010741170247395833, + "learning_rate": 0.0001, + "loss": 4.2195, + "loss/crossentropy": 1.9219747185707092, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18884174525737762, + "step": 9442 + }, + { + "epoch": 0.18888, + "grad_norm": 2.125, + "grad_norm_var": 0.010587565104166667, + "learning_rate": 0.0001, + "loss": 4.2603, + "loss/crossentropy": 2.0122207403182983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20913395285606384, + "step": 9444 + }, + { + "epoch": 0.18892, + "grad_norm": 1.953125, + "grad_norm_var": 0.010632069905598958, + "learning_rate": 0.0001, + "loss": 4.0871, + "loss/crossentropy": 2.0255361199378967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1982431635260582, + "step": 9446 + }, + { + "epoch": 0.18896, + "grad_norm": 1.9921875, + "grad_norm_var": 0.008770497639973958, + "learning_rate": 0.0001, + "loss": 4.4085, + "loss/crossentropy": 1.8060500025749207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19787351042032242, + "step": 9448 + }, + { + "epoch": 0.189, + "grad_norm": 2.171875, + "grad_norm_var": 0.007389068603515625, + "learning_rate": 0.0001, + "loss": 4.3799, + "loss/crossentropy": 2.340656042098999, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22321298718452454, + "step": 9450 + }, + { + "epoch": 0.18904, + "grad_norm": 1.9296875, + "grad_norm_var": 0.00966796875, + "learning_rate": 0.0001, + "loss": 4.3973, + "loss/crossentropy": 2.35786235332489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2356308028101921, + "step": 9452 + }, + { + "epoch": 0.18908, + "grad_norm": 2.0625, + "grad_norm_var": 0.01002197265625, + "learning_rate": 0.0001, + "loss": 4.2622, + "loss/crossentropy": 2.344806671142578, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21194154769182205, + "step": 9454 + }, + { + "epoch": 0.18912, + "grad_norm": 2.046875, + "grad_norm_var": 0.008786773681640625, + "learning_rate": 0.0001, + "loss": 4.4861, + "loss/crossentropy": 2.2449493408203125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21038557589054108, + "step": 9456 + }, + { + "epoch": 0.18916, + "grad_norm": 1.8515625, + "grad_norm_var": 0.014518229166666667, + "learning_rate": 0.0001, + "loss": 4.0343, + "loss/crossentropy": 1.977162778377533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20586547255516052, + "step": 9458 + }, + { + "epoch": 0.1892, + "grad_norm": 2.203125, + "grad_norm_var": 0.015860748291015626, + "learning_rate": 0.0001, + "loss": 4.3453, + "loss/crossentropy": 2.0941065549850464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2197253629565239, + "step": 9460 + }, + { + "epoch": 0.18924, + "grad_norm": 2.03125, + "grad_norm_var": 0.0143463134765625, + "learning_rate": 0.0001, + "loss": 4.3479, + "loss/crossentropy": 2.5145565271377563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25138507783412933, + "step": 9462 + }, + { + "epoch": 0.18928, + "grad_norm": 2.0, + "grad_norm_var": 0.014289347330729167, + "learning_rate": 0.0001, + "loss": 4.4076, + "loss/crossentropy": 2.2870718240737915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2298167496919632, + "step": 9464 + }, + { + "epoch": 0.18932, + "grad_norm": 2.109375, + "grad_norm_var": 0.013678995768229167, + "learning_rate": 0.0001, + "loss": 4.3588, + "loss/crossentropy": 2.2095978260040283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.217079259455204, + "step": 9466 + }, + { + "epoch": 0.18936, + "grad_norm": 2.140625, + "grad_norm_var": 0.011726633707682291, + "learning_rate": 0.0001, + "loss": 4.3735, + "loss/crossentropy": 1.9591819047927856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21825183182954788, + "step": 9468 + }, + { + "epoch": 0.1894, + "grad_norm": 2.1875, + "grad_norm_var": 0.011352284749348959, + "learning_rate": 0.0001, + "loss": 4.5072, + "loss/crossentropy": 2.266845226287842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24172072112560272, + "step": 9470 + }, + { + "epoch": 0.18944, + "grad_norm": 2.203125, + "grad_norm_var": 0.011437733968098959, + "learning_rate": 0.0001, + "loss": 4.3448, + "loss/crossentropy": 2.0732688903808594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23074156790971756, + "step": 9472 + }, + { + "epoch": 0.18948, + "grad_norm": 2.34375, + "grad_norm_var": 0.0090728759765625, + "learning_rate": 0.0001, + "loss": 4.6697, + "loss/crossentropy": 1.935340702533722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2194545865058899, + "step": 9474 + }, + { + "epoch": 0.18952, + "grad_norm": 2.125, + "grad_norm_var": 0.007594553629557291, + "learning_rate": 0.0001, + "loss": 4.1518, + "loss/crossentropy": 1.795669674873352, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18794939666986465, + "step": 9476 + }, + { + "epoch": 0.18956, + "grad_norm": 2.109375, + "grad_norm_var": 0.008841705322265626, + "learning_rate": 0.0001, + "loss": 3.7745, + "loss/crossentropy": 1.9292446970939636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20147182047367096, + "step": 9478 + }, + { + "epoch": 0.1896, + "grad_norm": 2.09375, + "grad_norm_var": 0.0067779541015625, + "learning_rate": 0.0001, + "loss": 4.1955, + "loss/crossentropy": 2.0110061168670654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2096811756491661, + "step": 9480 + }, + { + "epoch": 0.18964, + "grad_norm": 1.9921875, + "grad_norm_var": 0.008034006754557291, + "learning_rate": 0.0001, + "loss": 4.2937, + "loss/crossentropy": 2.1543468236923218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2142205834388733, + "step": 9482 + }, + { + "epoch": 0.18968, + "grad_norm": 2.046875, + "grad_norm_var": 0.009723917643229166, + "learning_rate": 0.0001, + "loss": 4.0126, + "loss/crossentropy": 2.0860520601272583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21716032177209854, + "step": 9484 + }, + { + "epoch": 0.18972, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010241444905598958, + "learning_rate": 0.0001, + "loss": 3.9412, + "loss/crossentropy": 1.7190409302711487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19397014379501343, + "step": 9486 + }, + { + "epoch": 0.18976, + "grad_norm": 2.234375, + "grad_norm_var": 0.017116038004557292, + "learning_rate": 0.0001, + "loss": 4.4711, + "loss/crossentropy": 2.178081512451172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2280048429965973, + "step": 9488 + }, + { + "epoch": 0.1898, + "grad_norm": 2.109375, + "grad_norm_var": 0.013063303629557292, + "learning_rate": 0.0001, + "loss": 4.267, + "loss/crossentropy": 2.0749863982200623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22755203396081924, + "step": 9490 + }, + { + "epoch": 0.18984, + "grad_norm": 2.09375, + "grad_norm_var": 0.014098866780598959, + "learning_rate": 0.0001, + "loss": 4.3147, + "loss/crossentropy": 2.214204430580139, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23071825504302979, + "step": 9492 + }, + { + "epoch": 0.18988, + "grad_norm": 2.09375, + "grad_norm_var": 0.015794881184895835, + "learning_rate": 0.0001, + "loss": 4.0041, + "loss/crossentropy": 1.6625414490699768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1857389286160469, + "step": 9494 + }, + { + "epoch": 0.18992, + "grad_norm": 2.375, + "grad_norm_var": 0.020539347330729166, + "learning_rate": 0.0001, + "loss": 4.4474, + "loss/crossentropy": 1.8537201285362244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21718977391719818, + "step": 9496 + }, + { + "epoch": 0.18996, + "grad_norm": 1.84375, + "grad_norm_var": 0.024074045817057292, + "learning_rate": 0.0001, + "loss": 4.0488, + "loss/crossentropy": 1.716725468635559, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18544895946979523, + "step": 9498 + }, + { + "epoch": 0.19, + "grad_norm": 1.9453125, + "grad_norm_var": 0.024192047119140626, + "learning_rate": 0.0001, + "loss": 4.1035, + "loss/crossentropy": 1.94467431306839, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.210828959941864, + "step": 9500 + }, + { + "epoch": 0.19004, + "grad_norm": 2.015625, + "grad_norm_var": 0.023298136393229165, + "learning_rate": 0.0001, + "loss": 4.2383, + "loss/crossentropy": 1.9377062320709229, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22141354531049728, + "step": 9502 + }, + { + "epoch": 0.19008, + "grad_norm": 2.0, + "grad_norm_var": 0.016950480143229165, + "learning_rate": 0.0001, + "loss": 4.3717, + "loss/crossentropy": 2.2015358209609985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22363336384296417, + "step": 9504 + }, + { + "epoch": 0.19012, + "grad_norm": 2.109375, + "grad_norm_var": 0.017235310872395833, + "learning_rate": 0.0001, + "loss": 4.2332, + "loss/crossentropy": 2.373024582862854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2402234748005867, + "step": 9506 + }, + { + "epoch": 0.19016, + "grad_norm": 2.140625, + "grad_norm_var": 0.016649373372395835, + "learning_rate": 0.0001, + "loss": 4.0519, + "loss/crossentropy": 2.1573110222816467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24207139760255814, + "step": 9508 + }, + { + "epoch": 0.1902, + "grad_norm": 2.203125, + "grad_norm_var": 0.014788564046223958, + "learning_rate": 0.0001, + "loss": 4.4041, + "loss/crossentropy": 2.324455976486206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24618541449308395, + "step": 9510 + }, + { + "epoch": 0.19024, + "grad_norm": 2.078125, + "grad_norm_var": 0.010603586832682291, + "learning_rate": 0.0001, + "loss": 4.0065, + "loss/crossentropy": 1.7480111718177795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2049119919538498, + "step": 9512 + }, + { + "epoch": 0.19028, + "grad_norm": 2.125, + "grad_norm_var": 0.006300608317057292, + "learning_rate": 0.0001, + "loss": 4.3644, + "loss/crossentropy": 1.9925439953804016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2314591035246849, + "step": 9514 + }, + { + "epoch": 0.19032, + "grad_norm": 2.140625, + "grad_norm_var": 0.007094065348307292, + "learning_rate": 0.0001, + "loss": 4.0702, + "loss/crossentropy": 2.133601188659668, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2329796403646469, + "step": 9516 + }, + { + "epoch": 0.19036, + "grad_norm": 1.984375, + "grad_norm_var": 0.010198720296223958, + "learning_rate": 0.0001, + "loss": 4.246, + "loss/crossentropy": 2.093464970588684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22636859863996506, + "step": 9518 + }, + { + "epoch": 0.1904, + "grad_norm": 2.0625, + "grad_norm_var": 0.009757232666015626, + "learning_rate": 0.0001, + "loss": 4.4766, + "loss/crossentropy": 2.6137614250183105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27037859708070755, + "step": 9520 + }, + { + "epoch": 0.19044, + "grad_norm": 2.125, + "grad_norm_var": 0.012300364176432292, + "learning_rate": 0.0001, + "loss": 4.5499, + "loss/crossentropy": 2.008640229701996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21647297590970993, + "step": 9522 + }, + { + "epoch": 0.19048, + "grad_norm": 2.078125, + "grad_norm_var": 0.012286122639973958, + "learning_rate": 0.0001, + "loss": 4.2467, + "loss/crossentropy": 2.12644362449646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23370585590600967, + "step": 9524 + }, + { + "epoch": 0.19052, + "grad_norm": 2.109375, + "grad_norm_var": 0.011425526936848958, + "learning_rate": 0.0001, + "loss": 4.5409, + "loss/crossentropy": 2.2338638305664062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23637598007917404, + "step": 9526 + }, + { + "epoch": 0.19056, + "grad_norm": 2.0, + "grad_norm_var": 0.011785634358723958, + "learning_rate": 0.0001, + "loss": 4.1405, + "loss/crossentropy": 2.0632832646369934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20682457089424133, + "step": 9528 + }, + { + "epoch": 0.1906, + "grad_norm": 1.9921875, + "grad_norm_var": 0.012726847330729167, + "learning_rate": 0.0001, + "loss": 4.4058, + "loss/crossentropy": 2.219120740890503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22551076859235764, + "step": 9530 + }, + { + "epoch": 0.19064, + "grad_norm": 1.953125, + "grad_norm_var": 0.011860911051432292, + "learning_rate": 0.0001, + "loss": 4.2985, + "loss/crossentropy": 2.4633371829986572, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24491076171398163, + "step": 9532 + }, + { + "epoch": 0.19068, + "grad_norm": 2.03125, + "grad_norm_var": 0.008211008707682292, + "learning_rate": 0.0001, + "loss": 4.2152, + "loss/crossentropy": 2.2408339977264404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22265981882810593, + "step": 9534 + }, + { + "epoch": 0.19072, + "grad_norm": 2.234375, + "grad_norm_var": 0.009544881184895833, + "learning_rate": 0.0001, + "loss": 4.326, + "loss/crossentropy": 1.9779353141784668, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2159332111477852, + "step": 9536 + }, + { + "epoch": 0.19076, + "grad_norm": 2.265625, + "grad_norm_var": 0.0082275390625, + "learning_rate": 0.0001, + "loss": 4.3522, + "loss/crossentropy": 2.0315812826156616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20710154622793198, + "step": 9538 + }, + { + "epoch": 0.1908, + "grad_norm": 2.078125, + "grad_norm_var": 0.011787923177083333, + "learning_rate": 0.0001, + "loss": 4.18, + "loss/crossentropy": 2.085337817668915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22174393385648727, + "step": 9540 + }, + { + "epoch": 0.19084, + "grad_norm": 2.03125, + "grad_norm_var": 0.016507975260416665, + "learning_rate": 0.0001, + "loss": 4.4386, + "loss/crossentropy": 2.3449169397354126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3135230466723442, + "step": 9542 + }, + { + "epoch": 0.19088, + "grad_norm": 2.046875, + "grad_norm_var": 0.016169230143229168, + "learning_rate": 0.0001, + "loss": 4.3943, + "loss/crossentropy": 2.1083431243896484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21395261585712433, + "step": 9544 + }, + { + "epoch": 0.19092, + "grad_norm": 2.03125, + "grad_norm_var": 0.015730539957682293, + "learning_rate": 0.0001, + "loss": 4.2091, + "loss/crossentropy": 2.0386710166931152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22487390786409378, + "step": 9546 + }, + { + "epoch": 0.19096, + "grad_norm": 2.21875, + "grad_norm_var": 0.015240224202473958, + "learning_rate": 0.0001, + "loss": 4.2377, + "loss/crossentropy": 1.9533087611198425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2223353162407875, + "step": 9548 + }, + { + "epoch": 0.191, + "grad_norm": 2.21875, + "grad_norm_var": 0.015317535400390625, + "learning_rate": 0.0001, + "loss": 4.4183, + "loss/crossentropy": 2.35861599445343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26162558794021606, + "step": 9550 + }, + { + "epoch": 0.19104, + "grad_norm": 2.015625, + "grad_norm_var": 0.014388020833333333, + "learning_rate": 0.0001, + "loss": 4.2507, + "loss/crossentropy": 1.9529705047607422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2125585675239563, + "step": 9552 + }, + { + "epoch": 0.19108, + "grad_norm": 2.109375, + "grad_norm_var": 0.011844889322916666, + "learning_rate": 0.0001, + "loss": 4.2041, + "loss/crossentropy": 2.0364453196525574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21163037419319153, + "step": 9554 + }, + { + "epoch": 0.19112, + "grad_norm": 2.078125, + "grad_norm_var": 0.009749348958333333, + "learning_rate": 0.0001, + "loss": 4.2758, + "loss/crossentropy": 2.1321409940719604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24250106513500214, + "step": 9556 + }, + { + "epoch": 0.19116, + "grad_norm": 2.125, + "grad_norm_var": 0.0073931376139322914, + "learning_rate": 0.0001, + "loss": 4.1848, + "loss/crossentropy": 2.024011969566345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20132286846637726, + "step": 9558 + }, + { + "epoch": 0.1912, + "grad_norm": 2.109375, + "grad_norm_var": 0.006980133056640625, + "learning_rate": 0.0001, + "loss": 4.3726, + "loss/crossentropy": 2.106776535511017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23114337772130966, + "step": 9560 + }, + { + "epoch": 0.19124, + "grad_norm": 2.265625, + "grad_norm_var": 0.007645416259765625, + "learning_rate": 0.0001, + "loss": 4.4642, + "loss/crossentropy": 1.9228236079216003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22426098585128784, + "step": 9562 + }, + { + "epoch": 0.19128, + "grad_norm": 2.171875, + "grad_norm_var": 0.008090972900390625, + "learning_rate": 0.0001, + "loss": 4.2384, + "loss/crossentropy": 2.3033370971679688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20113499462604523, + "step": 9564 + }, + { + "epoch": 0.19132, + "grad_norm": 2.125, + "grad_norm_var": 0.008973948160807292, + "learning_rate": 0.0001, + "loss": 4.0736, + "loss/crossentropy": 1.6983963251113892, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19003060460090637, + "step": 9566 + }, + { + "epoch": 0.19136, + "grad_norm": 2.34375, + "grad_norm_var": 0.011295318603515625, + "learning_rate": 0.0001, + "loss": 4.5283, + "loss/crossentropy": 2.502004861831665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24949797987937927, + "step": 9568 + }, + { + "epoch": 0.1914, + "grad_norm": 2.03125, + "grad_norm_var": 0.011793772379557291, + "learning_rate": 0.0001, + "loss": 4.2004, + "loss/crossentropy": 1.9981504678726196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2152494639158249, + "step": 9570 + }, + { + "epoch": 0.19144, + "grad_norm": 1.96875, + "grad_norm_var": 0.012931060791015626, + "learning_rate": 0.0001, + "loss": 4.1211, + "loss/crossentropy": 2.1489784717559814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22105325013399124, + "step": 9572 + }, + { + "epoch": 0.19148, + "grad_norm": 1.9453125, + "grad_norm_var": 0.013016510009765624, + "learning_rate": 0.0001, + "loss": 4.2234, + "loss/crossentropy": 1.974421203136444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23960395902395248, + "step": 9574 + }, + { + "epoch": 0.19152, + "grad_norm": 2.09375, + "grad_norm_var": 0.012670644124348958, + "learning_rate": 0.0001, + "loss": 4.4119, + "loss/crossentropy": 2.379599928855896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26731471717357635, + "step": 9576 + }, + { + "epoch": 0.19156, + "grad_norm": 2.015625, + "grad_norm_var": 0.012444814046223959, + "learning_rate": 0.0001, + "loss": 4.3718, + "loss/crossentropy": 2.26211154460907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22957566380500793, + "step": 9578 + }, + { + "epoch": 0.1916, + "grad_norm": 2.125, + "grad_norm_var": 0.011871083577473959, + "learning_rate": 0.0001, + "loss": 4.4594, + "loss/crossentropy": 2.282869577407837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2404445931315422, + "step": 9580 + }, + { + "epoch": 0.19164, + "grad_norm": 2.03125, + "grad_norm_var": 0.010457102457682292, + "learning_rate": 0.0001, + "loss": 4.1592, + "loss/crossentropy": 1.9183810949325562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18672315031290054, + "step": 9582 + }, + { + "epoch": 0.19168, + "grad_norm": 2.046875, + "grad_norm_var": 0.0076812744140625, + "learning_rate": 0.0001, + "loss": 4.1801, + "loss/crossentropy": 2.2722173929214478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23262400180101395, + "step": 9584 + }, + { + "epoch": 0.19172, + "grad_norm": 2.0625, + "grad_norm_var": 0.0075266520182291664, + "learning_rate": 0.0001, + "loss": 4.4032, + "loss/crossentropy": 2.426178455352783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23461100459098816, + "step": 9586 + }, + { + "epoch": 0.19176, + "grad_norm": 2.09375, + "grad_norm_var": 0.00687255859375, + "learning_rate": 0.0001, + "loss": 4.344, + "loss/crossentropy": 2.2266165018081665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22530251741409302, + "step": 9588 + }, + { + "epoch": 0.1918, + "grad_norm": 2.078125, + "grad_norm_var": 0.005541737874348958, + "learning_rate": 0.0001, + "loss": 4.1633, + "loss/crossentropy": 1.786275327205658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19257958233356476, + "step": 9590 + }, + { + "epoch": 0.19184, + "grad_norm": 2.0, + "grad_norm_var": 0.008868153889973958, + "learning_rate": 0.0001, + "loss": 4.2367, + "loss/crossentropy": 1.8497431874275208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21965742111206055, + "step": 9592 + }, + { + "epoch": 0.19188, + "grad_norm": 2.078125, + "grad_norm_var": 0.008143870035807292, + "learning_rate": 0.0001, + "loss": 4.5612, + "loss/crossentropy": 2.492846131324768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21851783990859985, + "step": 9594 + }, + { + "epoch": 0.19192, + "grad_norm": 2.28125, + "grad_norm_var": 0.010027821858723958, + "learning_rate": 0.0001, + "loss": 4.6838, + "loss/crossentropy": 2.3447986841201782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26567359268665314, + "step": 9596 + }, + { + "epoch": 0.19196, + "grad_norm": 1.984375, + "grad_norm_var": 0.010654449462890625, + "learning_rate": 0.0001, + "loss": 4.1513, + "loss/crossentropy": 1.656063199043274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1903422325849533, + "step": 9598 + }, + { + "epoch": 0.192, + "grad_norm": 2.125, + "grad_norm_var": 0.008329264322916667, + "learning_rate": 0.0001, + "loss": 4.3252, + "loss/crossentropy": 2.1816134452819824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23314762860536575, + "step": 9600 + }, + { + "epoch": 0.19204, + "grad_norm": 2.015625, + "grad_norm_var": 0.010007476806640625, + "learning_rate": 0.0001, + "loss": 3.8537, + "loss/crossentropy": 1.787261426448822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20938758552074432, + "step": 9602 + }, + { + "epoch": 0.19208, + "grad_norm": 2.015625, + "grad_norm_var": 0.010526275634765625, + "learning_rate": 0.0001, + "loss": 4.2374, + "loss/crossentropy": 1.9722678065299988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23085469752550125, + "step": 9604 + }, + { + "epoch": 0.19212, + "grad_norm": 1.9453125, + "grad_norm_var": 0.012113444010416667, + "learning_rate": 0.0001, + "loss": 4.0954, + "loss/crossentropy": 2.04559987783432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21647220849990845, + "step": 9606 + }, + { + "epoch": 0.19216, + "grad_norm": 2.140625, + "grad_norm_var": 0.008934529622395833, + "learning_rate": 0.0001, + "loss": 4.5397, + "loss/crossentropy": 2.5426105260849, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23680832237005234, + "step": 9608 + }, + { + "epoch": 0.1922, + "grad_norm": 2.0, + "grad_norm_var": 0.008890787760416666, + "learning_rate": 0.0001, + "loss": 4.245, + "loss/crossentropy": 2.1339075565338135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22501112520694733, + "step": 9610 + }, + { + "epoch": 0.19224, + "grad_norm": 2.03125, + "grad_norm_var": 0.006550852457682292, + "learning_rate": 0.0001, + "loss": 4.0924, + "loss/crossentropy": 1.9554831981658936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20187357813119888, + "step": 9612 + }, + { + "epoch": 0.19228, + "grad_norm": 2.203125, + "grad_norm_var": 0.009065500895182292, + "learning_rate": 0.0001, + "loss": 4.5578, + "loss/crossentropy": 2.2996249198913574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24954287707805634, + "step": 9614 + }, + { + "epoch": 0.19232, + "grad_norm": 2.125, + "grad_norm_var": 0.009065500895182292, + "learning_rate": 0.0001, + "loss": 4.3753, + "loss/crossentropy": 2.2439414262771606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22833245247602463, + "step": 9616 + }, + { + "epoch": 0.19236, + "grad_norm": 2.046875, + "grad_norm_var": 0.007933553059895833, + "learning_rate": 0.0001, + "loss": 4.2297, + "loss/crossentropy": 1.8900890946388245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21077623218297958, + "step": 9618 + }, + { + "epoch": 0.1924, + "grad_norm": 2.03125, + "grad_norm_var": 0.007804361979166666, + "learning_rate": 0.0001, + "loss": 4.287, + "loss/crossentropy": 2.110726058483124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21897974610328674, + "step": 9620 + }, + { + "epoch": 0.19244, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0080474853515625, + "learning_rate": 0.0001, + "loss": 4.0135, + "loss/crossentropy": 1.7363090515136719, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19939633458852768, + "step": 9622 + }, + { + "epoch": 0.19248, + "grad_norm": 2.109375, + "grad_norm_var": 0.01014404296875, + "learning_rate": 0.0001, + "loss": 4.3643, + "loss/crossentropy": 1.8148014545440674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21637701243162155, + "step": 9624 + }, + { + "epoch": 0.19252, + "grad_norm": 2.03125, + "grad_norm_var": 0.0092041015625, + "learning_rate": 0.0001, + "loss": 4.2451, + "loss/crossentropy": 2.2895134687423706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2297188639640808, + "step": 9626 + }, + { + "epoch": 0.19256, + "grad_norm": 2.15625, + "grad_norm_var": 0.006296539306640625, + "learning_rate": 0.0001, + "loss": 4.4778, + "loss/crossentropy": 2.117924213409424, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24057473242282867, + "step": 9628 + }, + { + "epoch": 0.1926, + "grad_norm": 2.140625, + "grad_norm_var": 0.005421702067057292, + "learning_rate": 0.0001, + "loss": 4.2791, + "loss/crossentropy": 1.8709319829940796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2106107696890831, + "step": 9630 + }, + { + "epoch": 0.19264, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0079742431640625, + "learning_rate": 0.0001, + "loss": 4.0174, + "loss/crossentropy": 1.5156871676445007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17487400770187378, + "step": 9632 + }, + { + "epoch": 0.19268, + "grad_norm": 2.1875, + "grad_norm_var": 0.0183837890625, + "learning_rate": 0.0001, + "loss": 4.6281, + "loss/crossentropy": 2.153126537799835, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22009101510047913, + "step": 9634 + }, + { + "epoch": 0.19272, + "grad_norm": 2.09375, + "grad_norm_var": 0.017853800455729166, + "learning_rate": 0.0001, + "loss": 4.443, + "loss/crossentropy": 2.0890414714813232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2233077436685562, + "step": 9636 + }, + { + "epoch": 0.19276, + "grad_norm": 2.015625, + "grad_norm_var": 0.01858495076497396, + "learning_rate": 0.0001, + "loss": 4.2572, + "loss/crossentropy": 2.1218496561050415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22576671838760376, + "step": 9638 + }, + { + "epoch": 0.1928, + "grad_norm": 1.921875, + "grad_norm_var": 0.020918528238932293, + "learning_rate": 0.0001, + "loss": 4.2522, + "loss/crossentropy": 2.1131649017333984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21846124529838562, + "step": 9640 + }, + { + "epoch": 0.19284, + "grad_norm": 2.1875, + "grad_norm_var": 0.02067845662434896, + "learning_rate": 0.0001, + "loss": 4.6643, + "loss/crossentropy": 2.3941714763641357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.246211439371109, + "step": 9642 + }, + { + "epoch": 0.19288, + "grad_norm": 1.890625, + "grad_norm_var": 0.02687352498372396, + "learning_rate": 0.0001, + "loss": 4.0641, + "loss/crossentropy": 1.9579638838768005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.198220357298851, + "step": 9644 + }, + { + "epoch": 0.19292, + "grad_norm": 2.140625, + "grad_norm_var": 0.02754491170247396, + "learning_rate": 0.0001, + "loss": 4.564, + "loss/crossentropy": 2.388027787208557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23349716514348984, + "step": 9646 + }, + { + "epoch": 0.19296, + "grad_norm": 2.15625, + "grad_norm_var": 0.02520726521809896, + "learning_rate": 0.0001, + "loss": 4.3678, + "loss/crossentropy": 1.8203087449073792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18923642486333847, + "step": 9648 + }, + { + "epoch": 0.193, + "grad_norm": 2.03125, + "grad_norm_var": 0.015476226806640625, + "learning_rate": 0.0001, + "loss": 4.2119, + "loss/crossentropy": 1.996269702911377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2096453383564949, + "step": 9650 + }, + { + "epoch": 0.19304, + "grad_norm": 2.09375, + "grad_norm_var": 0.01572240193684896, + "learning_rate": 0.0001, + "loss": 4.2921, + "loss/crossentropy": 1.806606113910675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19824761897325516, + "step": 9652 + }, + { + "epoch": 0.19308, + "grad_norm": 2.28125, + "grad_norm_var": 0.01651178995768229, + "learning_rate": 0.0001, + "loss": 4.1681, + "loss/crossentropy": 2.190830111503601, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22319861501455307, + "step": 9654 + }, + { + "epoch": 0.19312, + "grad_norm": 2.15625, + "grad_norm_var": 0.014048004150390625, + "learning_rate": 0.0001, + "loss": 4.4579, + "loss/crossentropy": 1.9721493124961853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2011292800307274, + "step": 9656 + }, + { + "epoch": 0.19316, + "grad_norm": 2.234375, + "grad_norm_var": 0.014277903238932292, + "learning_rate": 0.0001, + "loss": 4.5673, + "loss/crossentropy": 2.1256929636001587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2728194147348404, + "step": 9658 + }, + { + "epoch": 0.1932, + "grad_norm": 2.125, + "grad_norm_var": 0.008345286051432291, + "learning_rate": 0.0001, + "loss": 4.3206, + "loss/crossentropy": 2.090156316757202, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2240355908870697, + "step": 9660 + }, + { + "epoch": 0.19324, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0108795166015625, + "learning_rate": 0.0001, + "loss": 4.0231, + "loss/crossentropy": 2.2930272817611694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23188824206590652, + "step": 9662 + }, + { + "epoch": 0.19328, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0122222900390625, + "learning_rate": 0.0001, + "loss": 4.0075, + "loss/crossentropy": 1.9754068851470947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18698863685131073, + "step": 9664 + }, + { + "epoch": 0.19332, + "grad_norm": 2.09375, + "grad_norm_var": 0.0118560791015625, + "learning_rate": 0.0001, + "loss": 4.1668, + "loss/crossentropy": 1.8987788558006287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22531752288341522, + "step": 9666 + }, + { + "epoch": 0.19336, + "grad_norm": 2.0625, + "grad_norm_var": 0.011678059895833334, + "learning_rate": 0.0001, + "loss": 4.3185, + "loss/crossentropy": 2.2699583768844604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2572309076786041, + "step": 9668 + }, + { + "epoch": 0.1934, + "grad_norm": 2.0625, + "grad_norm_var": 0.0072509765625, + "learning_rate": 0.0001, + "loss": 4.3267, + "loss/crossentropy": 1.6649349927902222, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23388104140758514, + "step": 9670 + }, + { + "epoch": 0.19344, + "grad_norm": 2.171875, + "grad_norm_var": 0.007258097330729167, + "learning_rate": 0.0001, + "loss": 4.3846, + "loss/crossentropy": 2.173617362976074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20848755538463593, + "step": 9672 + }, + { + "epoch": 0.19348, + "grad_norm": 2.03125, + "grad_norm_var": 0.005890909830729167, + "learning_rate": 0.0001, + "loss": 4.3252, + "loss/crossentropy": 2.2690787315368652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23493453860282898, + "step": 9674 + }, + { + "epoch": 0.19352, + "grad_norm": 1.9375, + "grad_norm_var": 0.007079060872395833, + "learning_rate": 0.0001, + "loss": 4.3621, + "loss/crossentropy": 2.00560861825943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22538188099861145, + "step": 9676 + }, + { + "epoch": 0.19356, + "grad_norm": 2.0625, + "grad_norm_var": 0.004369862874348958, + "learning_rate": 0.0001, + "loss": 4.1264, + "loss/crossentropy": 1.960120975971222, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21056914329528809, + "step": 9678 + }, + { + "epoch": 0.1936, + "grad_norm": 1.9921875, + "grad_norm_var": 0.004689280192057292, + "learning_rate": 0.0001, + "loss": 3.9868, + "loss/crossentropy": 1.8921862840652466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22250327467918396, + "step": 9680 + }, + { + "epoch": 0.19364, + "grad_norm": 2.0625, + "grad_norm_var": 0.004839833577473958, + "learning_rate": 0.0001, + "loss": 4.296, + "loss/crossentropy": 1.9474233984947205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19719959795475006, + "step": 9682 + }, + { + "epoch": 0.19368, + "grad_norm": 2.21875, + "grad_norm_var": 0.0072100321451822914, + "learning_rate": 0.0001, + "loss": 4.2985, + "loss/crossentropy": 2.3391844034194946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22263485193252563, + "step": 9684 + }, + { + "epoch": 0.19372, + "grad_norm": 2.21875, + "grad_norm_var": 0.008715565999348958, + "learning_rate": 0.0001, + "loss": 4.3641, + "loss/crossentropy": 2.190012037754059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22531607002019882, + "step": 9686 + }, + { + "epoch": 0.19376, + "grad_norm": 2.15625, + "grad_norm_var": 0.008283487955729167, + "learning_rate": 0.0001, + "loss": 4.2601, + "loss/crossentropy": 1.9935640096664429, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21438511461019516, + "step": 9688 + }, + { + "epoch": 0.1938, + "grad_norm": 2.15625, + "grad_norm_var": 0.009186808268229167, + "learning_rate": 0.0001, + "loss": 4.1756, + "loss/crossentropy": 1.7482191324234009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20972990244627, + "step": 9690 + }, + { + "epoch": 0.19384, + "grad_norm": 2.125, + "grad_norm_var": 0.008353678385416667, + "learning_rate": 0.0001, + "loss": 4.475, + "loss/crossentropy": 2.2413275241851807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24103231728076935, + "step": 9692 + }, + { + "epoch": 0.19388, + "grad_norm": 2.140625, + "grad_norm_var": 0.008519490559895834, + "learning_rate": 0.0001, + "loss": 4.2886, + "loss/crossentropy": 2.2846572399139404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2420315518975258, + "step": 9694 + }, + { + "epoch": 0.19392, + "grad_norm": 2.265625, + "grad_norm_var": 0.007968902587890625, + "learning_rate": 0.0001, + "loss": 4.1665, + "loss/crossentropy": 1.7977504134178162, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20145538449287415, + "step": 9696 + }, + { + "epoch": 0.19396, + "grad_norm": 2.0625, + "grad_norm_var": 0.007535552978515625, + "learning_rate": 0.0001, + "loss": 4.205, + "loss/crossentropy": 2.0343902111053467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21595563739538193, + "step": 9698 + }, + { + "epoch": 0.194, + "grad_norm": 2.125, + "grad_norm_var": 0.0052487691243489586, + "learning_rate": 0.0001, + "loss": 3.9524, + "loss/crossentropy": 1.8828233480453491, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19624938070774078, + "step": 9700 + }, + { + "epoch": 0.19404, + "grad_norm": 1.984375, + "grad_norm_var": 0.005602773030598958, + "learning_rate": 0.0001, + "loss": 4.1569, + "loss/crossentropy": 1.9177573323249817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19467243552207947, + "step": 9702 + }, + { + "epoch": 0.19408, + "grad_norm": 1.9296875, + "grad_norm_var": 0.006613922119140625, + "learning_rate": 0.0001, + "loss": 3.8881, + "loss/crossentropy": 1.8025588393211365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18757501989603043, + "step": 9704 + }, + { + "epoch": 0.19412, + "grad_norm": 2.140625, + "grad_norm_var": 0.006723785400390625, + "learning_rate": 0.0001, + "loss": 4.3195, + "loss/crossentropy": 2.025633454322815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21623078733682632, + "step": 9706 + }, + { + "epoch": 0.19416, + "grad_norm": 2.109375, + "grad_norm_var": 0.009129842122395834, + "learning_rate": 0.0001, + "loss": 3.8244, + "loss/crossentropy": 1.9421055316925049, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20563311874866486, + "step": 9708 + }, + { + "epoch": 0.1942, + "grad_norm": 2.09375, + "grad_norm_var": 0.009191640218098958, + "learning_rate": 0.0001, + "loss": 4.0977, + "loss/crossentropy": 1.9820671081542969, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22029083967208862, + "step": 9710 + }, + { + "epoch": 0.19424, + "grad_norm": 2.328125, + "grad_norm_var": 0.010341135660807292, + "learning_rate": 0.0001, + "loss": 4.2037, + "loss/crossentropy": 1.9491158723831177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19559209793806076, + "step": 9712 + }, + { + "epoch": 0.19428, + "grad_norm": 2.0625, + "grad_norm_var": 0.010416412353515625, + "learning_rate": 0.0001, + "loss": 4.1592, + "loss/crossentropy": 1.8653306365013123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18629660457372665, + "step": 9714 + }, + { + "epoch": 0.19432, + "grad_norm": 1.953125, + "grad_norm_var": 0.01950251261393229, + "learning_rate": 0.0001, + "loss": 4.2567, + "loss/crossentropy": 1.6897491812705994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1948539912700653, + "step": 9716 + }, + { + "epoch": 0.19436, + "grad_norm": 2.203125, + "grad_norm_var": 0.020776112874348957, + "learning_rate": 0.0001, + "loss": 4.4387, + "loss/crossentropy": 2.07690966129303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2147115021944046, + "step": 9718 + }, + { + "epoch": 0.1944, + "grad_norm": 2.03125, + "grad_norm_var": 0.020189412434895835, + "learning_rate": 0.0001, + "loss": 4.269, + "loss/crossentropy": 2.0356597304344177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20626354217529297, + "step": 9720 + }, + { + "epoch": 0.19444, + "grad_norm": 2.0625, + "grad_norm_var": 0.0204254150390625, + "learning_rate": 0.0001, + "loss": 4.0548, + "loss/crossentropy": 1.7709991931915283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18573015183210373, + "step": 9722 + }, + { + "epoch": 0.19448, + "grad_norm": 1.984375, + "grad_norm_var": 0.01830012003580729, + "learning_rate": 0.0001, + "loss": 4.117, + "loss/crossentropy": 2.0255925059318542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22114800661802292, + "step": 9724 + }, + { + "epoch": 0.19452, + "grad_norm": 2.25, + "grad_norm_var": 0.01871337890625, + "learning_rate": 0.0001, + "loss": 4.4655, + "loss/crossentropy": 2.0046772956848145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22649522870779037, + "step": 9726 + }, + { + "epoch": 0.19456, + "grad_norm": 2.15625, + "grad_norm_var": 0.015827433268229166, + "learning_rate": 0.0001, + "loss": 4.179, + "loss/crossentropy": 2.2746634483337402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23017344623804092, + "step": 9728 + }, + { + "epoch": 0.1946, + "grad_norm": 2.078125, + "grad_norm_var": 0.015184529622395833, + "learning_rate": 0.0001, + "loss": 4.203, + "loss/crossentropy": 2.0191025137901306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20670472085475922, + "step": 9730 + }, + { + "epoch": 0.19464, + "grad_norm": 2.015625, + "grad_norm_var": 0.008177693684895833, + "learning_rate": 0.0001, + "loss": 4.1657, + "loss/crossentropy": 2.3198455572128296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24985665082931519, + "step": 9732 + }, + { + "epoch": 0.19468, + "grad_norm": 2.09375, + "grad_norm_var": 0.0073150634765625, + "learning_rate": 0.0001, + "loss": 4.3886, + "loss/crossentropy": 2.5229711532592773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22920701652765274, + "step": 9734 + }, + { + "epoch": 0.19472, + "grad_norm": 2.078125, + "grad_norm_var": 0.0072662353515625, + "learning_rate": 0.0001, + "loss": 3.9991, + "loss/crossentropy": 2.081319808959961, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21542686223983765, + "step": 9736 + }, + { + "epoch": 0.19476, + "grad_norm": 2.046875, + "grad_norm_var": 0.008373006184895834, + "learning_rate": 0.0001, + "loss": 4.0698, + "loss/crossentropy": 2.2170007824897766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21852095425128937, + "step": 9738 + }, + { + "epoch": 0.1948, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008265940348307292, + "learning_rate": 0.0001, + "loss": 4.1528, + "loss/crossentropy": 1.830683708190918, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1910252571105957, + "step": 9740 + }, + { + "epoch": 0.19484, + "grad_norm": 2.25, + "grad_norm_var": 0.008420562744140625, + "learning_rate": 0.0001, + "loss": 4.3543, + "loss/crossentropy": 2.1303864121437073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.228670135140419, + "step": 9742 + }, + { + "epoch": 0.19488, + "grad_norm": 2.015625, + "grad_norm_var": 0.007486724853515625, + "learning_rate": 0.0001, + "loss": 4.1504, + "loss/crossentropy": 2.2621915340423584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22027206420898438, + "step": 9744 + }, + { + "epoch": 0.19492, + "grad_norm": 2.0, + "grad_norm_var": 0.0110260009765625, + "learning_rate": 0.0001, + "loss": 3.9564, + "loss/crossentropy": 2.044555902481079, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20469766855239868, + "step": 9746 + }, + { + "epoch": 0.19496, + "grad_norm": 2.109375, + "grad_norm_var": 0.0115142822265625, + "learning_rate": 0.0001, + "loss": 4.1677, + "loss/crossentropy": 1.911176860332489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22036674618721008, + "step": 9748 + }, + { + "epoch": 0.195, + "grad_norm": 1.96875, + "grad_norm_var": 0.010545857747395833, + "learning_rate": 0.0001, + "loss": 4.0904, + "loss/crossentropy": 1.8650219440460205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19878911972045898, + "step": 9750 + }, + { + "epoch": 0.19504, + "grad_norm": 2.125, + "grad_norm_var": 0.02088623046875, + "learning_rate": 0.0001, + "loss": 4.3648, + "loss/crossentropy": 2.196391463279724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20845064520835876, + "step": 9752 + }, + { + "epoch": 0.19508, + "grad_norm": 1.9921875, + "grad_norm_var": 0.021201324462890626, + "learning_rate": 0.0001, + "loss": 4.0132, + "loss/crossentropy": 2.222484588623047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2144618257880211, + "step": 9754 + }, + { + "epoch": 0.19512, + "grad_norm": 2.15625, + "grad_norm_var": 0.020685831705729168, + "learning_rate": 0.0001, + "loss": 4.4078, + "loss/crossentropy": 2.416514754295349, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2370949387550354, + "step": 9756 + }, + { + "epoch": 0.19516, + "grad_norm": 1.9765625, + "grad_norm_var": 0.020566558837890624, + "learning_rate": 0.0001, + "loss": 4.0119, + "loss/crossentropy": 2.22190260887146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23128122836351395, + "step": 9758 + }, + { + "epoch": 0.1952, + "grad_norm": 2.15625, + "grad_norm_var": 0.024008941650390626, + "learning_rate": 0.0001, + "loss": 4.4253, + "loss/crossentropy": 2.446492910385132, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23962965607643127, + "step": 9760 + }, + { + "epoch": 0.19524, + "grad_norm": 2.015625, + "grad_norm_var": 0.019245402018229166, + "learning_rate": 0.0001, + "loss": 4.1676, + "loss/crossentropy": 2.1458136439323425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19657295942306519, + "step": 9762 + }, + { + "epoch": 0.19528, + "grad_norm": 2.203125, + "grad_norm_var": 0.0197418212890625, + "learning_rate": 0.0001, + "loss": 4.6029, + "loss/crossentropy": 1.9773340225219727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22887050360441208, + "step": 9764 + }, + { + "epoch": 0.19532, + "grad_norm": 1.9453125, + "grad_norm_var": 0.020334625244140626, + "learning_rate": 0.0001, + "loss": 4.1583, + "loss/crossentropy": 2.0938061475753784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19823112338781357, + "step": 9766 + }, + { + "epoch": 0.19536, + "grad_norm": 2.3125, + "grad_norm_var": 0.015636952718098958, + "learning_rate": 0.0001, + "loss": 4.2628, + "loss/crossentropy": 1.9068174958229065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19924984872341156, + "step": 9768 + }, + { + "epoch": 0.1954, + "grad_norm": 2.109375, + "grad_norm_var": 0.012422688802083333, + "learning_rate": 0.0001, + "loss": 4.3567, + "loss/crossentropy": 2.045413613319397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19175175577402115, + "step": 9770 + }, + { + "epoch": 0.19544, + "grad_norm": 2.09375, + "grad_norm_var": 0.013106282552083333, + "learning_rate": 0.0001, + "loss": 4.1972, + "loss/crossentropy": 2.0262961983680725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22137057781219482, + "step": 9772 + }, + { + "epoch": 0.19548, + "grad_norm": 2.0625, + "grad_norm_var": 0.012992350260416667, + "learning_rate": 0.0001, + "loss": 4.0624, + "loss/crossentropy": 2.146475672721863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2284562587738037, + "step": 9774 + }, + { + "epoch": 0.19552, + "grad_norm": 2.015625, + "grad_norm_var": 0.011885579427083333, + "learning_rate": 0.0001, + "loss": 4.154, + "loss/crossentropy": 2.0316100120544434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26275022327899933, + "step": 9776 + }, + { + "epoch": 0.19556, + "grad_norm": 2.140625, + "grad_norm_var": 0.011359659830729167, + "learning_rate": 0.0001, + "loss": 4.5424, + "loss/crossentropy": 2.1386696100234985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22695952653884888, + "step": 9778 + }, + { + "epoch": 0.1956, + "grad_norm": 2.578125, + "grad_norm_var": 0.024405924479166667, + "learning_rate": 0.0001, + "loss": 4.2391, + "loss/crossentropy": 2.3000820875167847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24045731872320175, + "step": 9780 + }, + { + "epoch": 0.19564, + "grad_norm": 2.3125, + "grad_norm_var": 0.02415949503580729, + "learning_rate": 0.0001, + "loss": 4.7778, + "loss/crossentropy": 2.1272310614585876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2318389192223549, + "step": 9782 + }, + { + "epoch": 0.19568, + "grad_norm": 1.9453125, + "grad_norm_var": 0.025655110677083332, + "learning_rate": 0.0001, + "loss": 4.1232, + "loss/crossentropy": 1.8953965306282043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.213637076318264, + "step": 9784 + }, + { + "epoch": 0.19572, + "grad_norm": 2.0625, + "grad_norm_var": 0.0261871337890625, + "learning_rate": 0.0001, + "loss": 4.3459, + "loss/crossentropy": 2.0738734006881714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2100028172135353, + "step": 9786 + }, + { + "epoch": 0.19576, + "grad_norm": 2.0, + "grad_norm_var": 0.027378082275390625, + "learning_rate": 0.0001, + "loss": 4.2599, + "loss/crossentropy": 1.9454593658447266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18557409197092056, + "step": 9788 + }, + { + "epoch": 0.1958, + "grad_norm": 2.015625, + "grad_norm_var": 0.026569620768229166, + "learning_rate": 0.0001, + "loss": 4.1655, + "loss/crossentropy": 2.101949095726013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22479471564292908, + "step": 9790 + }, + { + "epoch": 0.19584, + "grad_norm": 2.25, + "grad_norm_var": 0.025935872395833334, + "learning_rate": 0.0001, + "loss": 4.3841, + "loss/crossentropy": 1.9524416327476501, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20574256777763367, + "step": 9792 + }, + { + "epoch": 0.19588, + "grad_norm": 2.296875, + "grad_norm_var": 0.06084391276041667, + "learning_rate": 0.0001, + "loss": 4.1596, + "loss/crossentropy": 1.9490719437599182, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19411193579435349, + "step": 9794 + }, + { + "epoch": 0.19592, + "grad_norm": 2.046875, + "grad_norm_var": 0.050675455729166666, + "learning_rate": 0.0001, + "loss": 4.3438, + "loss/crossentropy": 2.095793664455414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21078093349933624, + "step": 9796 + }, + { + "epoch": 0.19596, + "grad_norm": 2.015625, + "grad_norm_var": 0.0508056640625, + "learning_rate": 0.0001, + "loss": 4.0828, + "loss/crossentropy": 2.032066822052002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21181444078683853, + "step": 9798 + }, + { + "epoch": 0.196, + "grad_norm": 2.140625, + "grad_norm_var": 0.048378245035807295, + "learning_rate": 0.0001, + "loss": 4.0019, + "loss/crossentropy": 2.1378380060195923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.224056214094162, + "step": 9800 + }, + { + "epoch": 0.19604, + "grad_norm": 2.1875, + "grad_norm_var": 0.04784520467122396, + "learning_rate": 0.0001, + "loss": 4.2932, + "loss/crossentropy": 2.120614767074585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2104548141360283, + "step": 9802 + }, + { + "epoch": 0.19608, + "grad_norm": 1.8984375, + "grad_norm_var": 0.048954010009765625, + "learning_rate": 0.0001, + "loss": 4.1975, + "loss/crossentropy": 1.8837141394615173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19755827635526657, + "step": 9804 + }, + { + "epoch": 0.19612, + "grad_norm": 2.03125, + "grad_norm_var": 0.04793675740559896, + "learning_rate": 0.0001, + "loss": 4.1645, + "loss/crossentropy": 1.9325945973396301, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2005266472697258, + "step": 9806 + }, + { + "epoch": 0.19616, + "grad_norm": 2.046875, + "grad_norm_var": 0.04839045206705729, + "learning_rate": 0.0001, + "loss": 4.2211, + "loss/crossentropy": 1.789370834827423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20060084015130997, + "step": 9808 + }, + { + "epoch": 0.1962, + "grad_norm": 1.96875, + "grad_norm_var": 0.01709162394205729, + "learning_rate": 0.0001, + "loss": 4.2463, + "loss/crossentropy": 1.9807876348495483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21805762499570847, + "step": 9810 + }, + { + "epoch": 0.19624, + "grad_norm": 1.9375, + "grad_norm_var": 0.018888092041015624, + "learning_rate": 0.0001, + "loss": 4.1339, + "loss/crossentropy": 2.210070848464966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23082506656646729, + "step": 9812 + }, + { + "epoch": 0.19628, + "grad_norm": 2.0625, + "grad_norm_var": 0.01693903605143229, + "learning_rate": 0.0001, + "loss": 4.1633, + "loss/crossentropy": 1.8644117712974548, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20300551503896713, + "step": 9814 + }, + { + "epoch": 0.19632, + "grad_norm": 2.125, + "grad_norm_var": 0.01634496053059896, + "learning_rate": 0.0001, + "loss": 4.2064, + "loss/crossentropy": 1.6804233193397522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20438820868730545, + "step": 9816 + }, + { + "epoch": 0.19636, + "grad_norm": 2.09375, + "grad_norm_var": 0.01757990519205729, + "learning_rate": 0.0001, + "loss": 4.4339, + "loss/crossentropy": 2.2513452768325806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24517202377319336, + "step": 9818 + }, + { + "epoch": 0.1964, + "grad_norm": 2.203125, + "grad_norm_var": 0.01683527628580729, + "learning_rate": 0.0001, + "loss": 4.2238, + "loss/crossentropy": 2.3133161067962646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22146832942962646, + "step": 9820 + }, + { + "epoch": 0.19644, + "grad_norm": 2.1875, + "grad_norm_var": 0.016949208577473958, + "learning_rate": 0.0001, + "loss": 4.5003, + "loss/crossentropy": 2.3226611614227295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2327018678188324, + "step": 9822 + }, + { + "epoch": 0.19648, + "grad_norm": 2.078125, + "grad_norm_var": 0.016228993733723957, + "learning_rate": 0.0001, + "loss": 4.1832, + "loss/crossentropy": 2.234626054763794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23657850921154022, + "step": 9824 + }, + { + "epoch": 0.19652, + "grad_norm": 2.25, + "grad_norm_var": 0.008941396077473959, + "learning_rate": 0.0001, + "loss": 4.4576, + "loss/crossentropy": 2.3478844165802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24796272069215775, + "step": 9826 + }, + { + "epoch": 0.19656, + "grad_norm": 2.171875, + "grad_norm_var": 0.006982167561848958, + "learning_rate": 0.0001, + "loss": 4.3678, + "loss/crossentropy": 2.094748795032501, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21118396520614624, + "step": 9828 + }, + { + "epoch": 0.1966, + "grad_norm": 2.140625, + "grad_norm_var": 0.007289377848307291, + "learning_rate": 0.0001, + "loss": 4.2916, + "loss/crossentropy": 2.166300058364868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21773284673690796, + "step": 9830 + }, + { + "epoch": 0.19664, + "grad_norm": 2.09375, + "grad_norm_var": 0.007212066650390625, + "learning_rate": 0.0001, + "loss": 4.5253, + "loss/crossentropy": 2.198317289352417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22636590898036957, + "step": 9832 + }, + { + "epoch": 0.19668, + "grad_norm": 2.1875, + "grad_norm_var": 0.006278228759765625, + "learning_rate": 0.0001, + "loss": 4.3405, + "loss/crossentropy": 2.3081597089767456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.231675922870636, + "step": 9834 + }, + { + "epoch": 0.19672, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005206044514973958, + "learning_rate": 0.0001, + "loss": 4.1181, + "loss/crossentropy": 2.048017203807831, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20089948922395706, + "step": 9836 + }, + { + "epoch": 0.19676, + "grad_norm": 2.125, + "grad_norm_var": 0.01768773396809896, + "learning_rate": 0.0001, + "loss": 4.0515, + "loss/crossentropy": 1.886117160320282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21702590584754944, + "step": 9838 + }, + { + "epoch": 0.1968, + "grad_norm": 2.0625, + "grad_norm_var": 0.017895253499348958, + "learning_rate": 0.0001, + "loss": 3.9827, + "loss/crossentropy": 1.902605414390564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19976041465997696, + "step": 9840 + }, + { + "epoch": 0.19684, + "grad_norm": 4.28125, + "grad_norm_var": 0.309179433186849, + "learning_rate": 0.0001, + "loss": 4.0874, + "loss/crossentropy": 1.7959995865821838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2023170217871666, + "step": 9842 + }, + { + "epoch": 0.19688, + "grad_norm": 2.15625, + "grad_norm_var": 0.3066993713378906, + "learning_rate": 0.0001, + "loss": 4.5535, + "loss/crossentropy": 2.1481738090515137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22133169323205948, + "step": 9844 + }, + { + "epoch": 0.19692, + "grad_norm": 2.03125, + "grad_norm_var": 0.30752741495768227, + "learning_rate": 0.0001, + "loss": 4.0269, + "loss/crossentropy": 1.9328197240829468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20346572250127792, + "step": 9846 + }, + { + "epoch": 0.19696, + "grad_norm": 2.015625, + "grad_norm_var": 0.31202367146809895, + "learning_rate": 0.0001, + "loss": 4.0299, + "loss/crossentropy": 2.096457004547119, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21341010928153992, + "step": 9848 + }, + { + "epoch": 0.197, + "grad_norm": 2.03125, + "grad_norm_var": 0.3146522521972656, + "learning_rate": 0.0001, + "loss": 4.3105, + "loss/crossentropy": 1.9698969721794128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1855439990758896, + "step": 9850 + }, + { + "epoch": 0.19704, + "grad_norm": 2.234375, + "grad_norm_var": 0.31038004557291665, + "learning_rate": 0.0001, + "loss": 4.2165, + "loss/crossentropy": 1.987346351146698, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23657751083374023, + "step": 9852 + }, + { + "epoch": 0.19708, + "grad_norm": 1.8984375, + "grad_norm_var": 0.3109169006347656, + "learning_rate": 0.0001, + "loss": 4.0346, + "loss/crossentropy": 1.9535572528839111, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21131790429353714, + "step": 9854 + }, + { + "epoch": 0.19712, + "grad_norm": 2.09375, + "grad_norm_var": 0.3090349833170573, + "learning_rate": 0.0001, + "loss": 4.5828, + "loss/crossentropy": 2.08541601896286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2290833741426468, + "step": 9856 + }, + { + "epoch": 0.19716, + "grad_norm": 2.28125, + "grad_norm_var": 0.016155751546223958, + "learning_rate": 0.0001, + "loss": 4.2774, + "loss/crossentropy": 2.1930192708969116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23072391748428345, + "step": 9858 + }, + { + "epoch": 0.1972, + "grad_norm": 2.359375, + "grad_norm_var": 0.016658274332682292, + "learning_rate": 0.0001, + "loss": 4.3923, + "loss/crossentropy": 1.8369358777999878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20687467604875565, + "step": 9860 + }, + { + "epoch": 0.19724, + "grad_norm": 2.0, + "grad_norm_var": 0.017380523681640624, + "learning_rate": 0.0001, + "loss": 4.0287, + "loss/crossentropy": 1.6707186102867126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1808091625571251, + "step": 9862 + }, + { + "epoch": 0.19728, + "grad_norm": 1.984375, + "grad_norm_var": 0.017651112874348958, + "learning_rate": 0.0001, + "loss": 4.1184, + "loss/crossentropy": 1.8352991342544556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19169726222753525, + "step": 9864 + }, + { + "epoch": 0.19732, + "grad_norm": 2.734375, + "grad_norm_var": 0.042909495035807294, + "learning_rate": 0.0001, + "loss": 4.2857, + "loss/crossentropy": 1.9427489638328552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20076656341552734, + "step": 9866 + }, + { + "epoch": 0.19736, + "grad_norm": 2.078125, + "grad_norm_var": 0.04237035115559896, + "learning_rate": 0.0001, + "loss": 4.3725, + "loss/crossentropy": 2.2190250158309937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24385111033916473, + "step": 9868 + }, + { + "epoch": 0.1974, + "grad_norm": 1.9453125, + "grad_norm_var": 0.04182510375976563, + "learning_rate": 0.0001, + "loss": 4.2192, + "loss/crossentropy": 2.0958147644996643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23264098167419434, + "step": 9870 + }, + { + "epoch": 0.19744, + "grad_norm": 2.25, + "grad_norm_var": 0.04228897094726562, + "learning_rate": 0.0001, + "loss": 4.3571, + "loss/crossentropy": 2.542472720146179, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23458892852067947, + "step": 9872 + }, + { + "epoch": 0.19748, + "grad_norm": 2.21875, + "grad_norm_var": 0.03728408813476562, + "learning_rate": 0.0001, + "loss": 4.2767, + "loss/crossentropy": 2.1562893390655518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2243279591202736, + "step": 9874 + }, + { + "epoch": 0.19752, + "grad_norm": 2.21875, + "grad_norm_var": 0.034395090738932294, + "learning_rate": 0.0001, + "loss": 4.0114, + "loss/crossentropy": 1.9851951599121094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2369420975446701, + "step": 9876 + }, + { + "epoch": 0.19756, + "grad_norm": 2.03125, + "grad_norm_var": 0.032714589436848955, + "learning_rate": 0.0001, + "loss": 4.3017, + "loss/crossentropy": 1.8751549124717712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20777788758277893, + "step": 9878 + }, + { + "epoch": 0.1976, + "grad_norm": 2.125, + "grad_norm_var": 0.0318267822265625, + "learning_rate": 0.0001, + "loss": 4.1328, + "loss/crossentropy": 2.010055720806122, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21204733848571777, + "step": 9880 + }, + { + "epoch": 0.19764, + "grad_norm": 2.0625, + "grad_norm_var": 0.008942667643229167, + "learning_rate": 0.0001, + "loss": 4.1122, + "loss/crossentropy": 2.04589307308197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21330490708351135, + "step": 9882 + }, + { + "epoch": 0.19768, + "grad_norm": 2.125, + "grad_norm_var": 0.010138956705729167, + "learning_rate": 0.0001, + "loss": 4.1663, + "loss/crossentropy": 1.9441133737564087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22589778900146484, + "step": 9884 + }, + { + "epoch": 0.19772, + "grad_norm": 2.25, + "grad_norm_var": 0.008296457926432292, + "learning_rate": 0.0001, + "loss": 4.4239, + "loss/crossentropy": 2.305395483970642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24267761409282684, + "step": 9886 + }, + { + "epoch": 0.19776, + "grad_norm": 2.03125, + "grad_norm_var": 0.007269032796223958, + "learning_rate": 0.0001, + "loss": 4.386, + "loss/crossentropy": 2.096014082431793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21358592063188553, + "step": 9888 + }, + { + "epoch": 0.1978, + "grad_norm": 2.25, + "grad_norm_var": 0.008149973551432292, + "learning_rate": 0.0001, + "loss": 4.2315, + "loss/crossentropy": 2.1115033626556396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22994756698608398, + "step": 9890 + }, + { + "epoch": 0.19784, + "grad_norm": 1.9375, + "grad_norm_var": 0.008654530843098958, + "learning_rate": 0.0001, + "loss": 4.1419, + "loss/crossentropy": 1.6122692227363586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1897209882736206, + "step": 9892 + }, + { + "epoch": 0.19788, + "grad_norm": 2.0625, + "grad_norm_var": 0.012933095296223959, + "learning_rate": 0.0001, + "loss": 4.1679, + "loss/crossentropy": 1.4790136218070984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17421242594718933, + "step": 9894 + }, + { + "epoch": 0.19792, + "grad_norm": 2.09375, + "grad_norm_var": 0.013700358072916667, + "learning_rate": 0.0001, + "loss": 4.2921, + "loss/crossentropy": 1.8394885063171387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20840360969305038, + "step": 9896 + }, + { + "epoch": 0.19796, + "grad_norm": 2.03125, + "grad_norm_var": 0.0135894775390625, + "learning_rate": 0.0001, + "loss": 4.3993, + "loss/crossentropy": 2.152444541454315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23834071308374405, + "step": 9898 + }, + { + "epoch": 0.198, + "grad_norm": 2.1875, + "grad_norm_var": 0.012189737955729167, + "learning_rate": 0.0001, + "loss": 4.5268, + "loss/crossentropy": 2.367082357406616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23409543931484222, + "step": 9900 + }, + { + "epoch": 0.19804, + "grad_norm": 2.125, + "grad_norm_var": 0.011799112955729166, + "learning_rate": 0.0001, + "loss": 4.4599, + "loss/crossentropy": 2.136048436164856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21797242760658264, + "step": 9902 + }, + { + "epoch": 0.19808, + "grad_norm": 2.03125, + "grad_norm_var": 0.011945597330729167, + "learning_rate": 0.0001, + "loss": 4.2262, + "loss/crossentropy": 2.035883128643036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.215388223528862, + "step": 9904 + }, + { + "epoch": 0.19812, + "grad_norm": 2.203125, + "grad_norm_var": 0.011970774332682291, + "learning_rate": 0.0001, + "loss": 4.2024, + "loss/crossentropy": 2.0339369773864746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2085876688361168, + "step": 9906 + }, + { + "epoch": 0.19816, + "grad_norm": 2.0625, + "grad_norm_var": 0.009655507405598958, + "learning_rate": 0.0001, + "loss": 4.4441, + "loss/crossentropy": 1.829429566860199, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20085029304027557, + "step": 9908 + }, + { + "epoch": 0.1982, + "grad_norm": 1.984375, + "grad_norm_var": 0.008861287434895834, + "learning_rate": 0.0001, + "loss": 4.1108, + "loss/crossentropy": 2.1721774339675903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2118668630719185, + "step": 9910 + }, + { + "epoch": 0.19824, + "grad_norm": 2.265625, + "grad_norm_var": 0.008861287434895834, + "learning_rate": 0.0001, + "loss": 4.3511, + "loss/crossentropy": 1.958261251449585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22304313629865646, + "step": 9912 + }, + { + "epoch": 0.19828, + "grad_norm": 2.21875, + "grad_norm_var": 0.009870402018229167, + "learning_rate": 0.0001, + "loss": 4.5819, + "loss/crossentropy": 2.4651763439178467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2435118407011032, + "step": 9914 + }, + { + "epoch": 0.19832, + "grad_norm": 1.9140625, + "grad_norm_var": 0.012532297770182292, + "learning_rate": 0.0001, + "loss": 4.0973, + "loss/crossentropy": 2.173910617828369, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21142201125621796, + "step": 9916 + }, + { + "epoch": 0.19836, + "grad_norm": 2.078125, + "grad_norm_var": 0.013239542643229166, + "learning_rate": 0.0001, + "loss": 3.9283, + "loss/crossentropy": 1.9559763073921204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19051063805818558, + "step": 9918 + }, + { + "epoch": 0.1984, + "grad_norm": 2.265625, + "grad_norm_var": 0.015895334879557292, + "learning_rate": 0.0001, + "loss": 4.2516, + "loss/crossentropy": 2.238003969192505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2338731735944748, + "step": 9920 + }, + { + "epoch": 0.19844, + "grad_norm": 2.140625, + "grad_norm_var": 0.015233357747395834, + "learning_rate": 0.0001, + "loss": 4.4624, + "loss/crossentropy": 2.269726276397705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19430368393659592, + "step": 9922 + }, + { + "epoch": 0.19848, + "grad_norm": 2.109375, + "grad_norm_var": 0.015697224934895834, + "learning_rate": 0.0001, + "loss": 4.436, + "loss/crossentropy": 2.351833701133728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24469508230686188, + "step": 9924 + }, + { + "epoch": 0.19852, + "grad_norm": 2.15625, + "grad_norm_var": 0.013398996988932292, + "learning_rate": 0.0001, + "loss": 4.3833, + "loss/crossentropy": 2.0600146055221558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21375566720962524, + "step": 9926 + }, + { + "epoch": 0.19856, + "grad_norm": 2.015625, + "grad_norm_var": 0.012562815348307292, + "learning_rate": 0.0001, + "loss": 4.2957, + "loss/crossentropy": 2.1543694734573364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20916736871004105, + "step": 9928 + }, + { + "epoch": 0.1986, + "grad_norm": 2.015625, + "grad_norm_var": 0.010782623291015625, + "learning_rate": 0.0001, + "loss": 4.3111, + "loss/crossentropy": 1.8952317833900452, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2216978445649147, + "step": 9930 + }, + { + "epoch": 0.19864, + "grad_norm": 2.03125, + "grad_norm_var": 0.008695475260416667, + "learning_rate": 0.0001, + "loss": 4.0542, + "loss/crossentropy": 2.102243661880493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22314947098493576, + "step": 9932 + }, + { + "epoch": 0.19868, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0072021484375, + "learning_rate": 0.0001, + "loss": 4.1653, + "loss/crossentropy": 1.9036884307861328, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19729873538017273, + "step": 9934 + }, + { + "epoch": 0.19872, + "grad_norm": 2.0, + "grad_norm_var": 0.006459299723307292, + "learning_rate": 0.0001, + "loss": 4.17, + "loss/crossentropy": 1.823796033859253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18089265376329422, + "step": 9936 + }, + { + "epoch": 0.19876, + "grad_norm": 2.0, + "grad_norm_var": 0.006302642822265625, + "learning_rate": 0.0001, + "loss": 3.8742, + "loss/crossentropy": 2.055707633495331, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20620816200971603, + "step": 9938 + }, + { + "epoch": 0.1988, + "grad_norm": 2.109375, + "grad_norm_var": 0.0055010477701822914, + "learning_rate": 0.0001, + "loss": 4.256, + "loss/crossentropy": 2.0490049719810486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21709006279706955, + "step": 9940 + }, + { + "epoch": 0.19884, + "grad_norm": 2.0625, + "grad_norm_var": 0.005891672770182292, + "learning_rate": 0.0001, + "loss": 4.2733, + "loss/crossentropy": 2.164198637008667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2335330694913864, + "step": 9942 + }, + { + "epoch": 0.19888, + "grad_norm": 1.9609375, + "grad_norm_var": 0.006941731770833333, + "learning_rate": 0.0001, + "loss": 4.1463, + "loss/crossentropy": 1.9218478202819824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1945241093635559, + "step": 9944 + }, + { + "epoch": 0.19892, + "grad_norm": 2.109375, + "grad_norm_var": 0.007328033447265625, + "learning_rate": 0.0001, + "loss": 3.8591, + "loss/crossentropy": 1.9456552267074585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20599794387817383, + "step": 9946 + }, + { + "epoch": 0.19896, + "grad_norm": 1.96875, + "grad_norm_var": 0.007749176025390625, + "learning_rate": 0.0001, + "loss": 4.0385, + "loss/crossentropy": 2.0472273230552673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20920252054929733, + "step": 9948 + }, + { + "epoch": 0.199, + "grad_norm": 1.96875, + "grad_norm_var": 0.007860310872395833, + "learning_rate": 0.0001, + "loss": 4.308, + "loss/crossentropy": 2.2252047061920166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2360471710562706, + "step": 9950 + }, + { + "epoch": 0.19904, + "grad_norm": 2.09375, + "grad_norm_var": 0.005537923177083333, + "learning_rate": 0.0001, + "loss": 4.0828, + "loss/crossentropy": 1.646431565284729, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1692601889371872, + "step": 9952 + }, + { + "epoch": 0.19908, + "grad_norm": 2.046875, + "grad_norm_var": 0.0054443359375, + "learning_rate": 0.0001, + "loss": 3.8818, + "loss/crossentropy": 2.0114784836769104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20070793479681015, + "step": 9954 + }, + { + "epoch": 0.19912, + "grad_norm": 1.953125, + "grad_norm_var": 0.00677490234375, + "learning_rate": 0.0001, + "loss": 4.324, + "loss/crossentropy": 2.001778781414032, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21009314060211182, + "step": 9956 + }, + { + "epoch": 0.19916, + "grad_norm": 2.0, + "grad_norm_var": 0.006037394205729167, + "learning_rate": 0.0001, + "loss": 4.3493, + "loss/crossentropy": 2.1330565214157104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22442802786827087, + "step": 9958 + }, + { + "epoch": 0.1992, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005635579427083333, + "learning_rate": 0.0001, + "loss": 4.193, + "loss/crossentropy": 1.9146793484687805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21145610511302948, + "step": 9960 + }, + { + "epoch": 0.19924, + "grad_norm": 2.015625, + "grad_norm_var": 0.004705556233723958, + "learning_rate": 0.0001, + "loss": 4.3262, + "loss/crossentropy": 2.5224483013153076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23354032635688782, + "step": 9962 + }, + { + "epoch": 0.19928, + "grad_norm": 2.109375, + "grad_norm_var": 0.004552968343098958, + "learning_rate": 0.0001, + "loss": 4.3663, + "loss/crossentropy": 2.245160937309265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21848157793283463, + "step": 9964 + }, + { + "epoch": 0.19932, + "grad_norm": 2.15625, + "grad_norm_var": 0.004622141520182292, + "learning_rate": 0.0001, + "loss": 4.3506, + "loss/crossentropy": 2.122299015522003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21853189170360565, + "step": 9966 + }, + { + "epoch": 0.19936, + "grad_norm": 2.171875, + "grad_norm_var": 0.005716705322265625, + "learning_rate": 0.0001, + "loss": 4.4524, + "loss/crossentropy": 2.31084668636322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23691194504499435, + "step": 9968 + }, + { + "epoch": 0.1994, + "grad_norm": 2.25, + "grad_norm_var": 0.007834625244140626, + "learning_rate": 0.0001, + "loss": 4.274, + "loss/crossentropy": 2.2242307662963867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21957046538591385, + "step": 9970 + }, + { + "epoch": 0.19944, + "grad_norm": 2.078125, + "grad_norm_var": 0.007063547770182292, + "learning_rate": 0.0001, + "loss": 4.2264, + "loss/crossentropy": 1.792852759361267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19996580481529236, + "step": 9972 + }, + { + "epoch": 0.19948, + "grad_norm": 2.09375, + "grad_norm_var": 0.0063250223795572914, + "learning_rate": 0.0001, + "loss": 4.3029, + "loss/crossentropy": 1.9593411087989807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21858739852905273, + "step": 9974 + }, + { + "epoch": 0.19952, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0067827860514322914, + "learning_rate": 0.0001, + "loss": 4.3067, + "loss/crossentropy": 2.259618401527405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22094043344259262, + "step": 9976 + }, + { + "epoch": 0.19956, + "grad_norm": 2.140625, + "grad_norm_var": 0.0063168843587239586, + "learning_rate": 0.0001, + "loss": 4.373, + "loss/crossentropy": 2.31876802444458, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22025877982378006, + "step": 9978 + }, + { + "epoch": 0.1996, + "grad_norm": 2.0625, + "grad_norm_var": 0.006109364827473958, + "learning_rate": 0.0001, + "loss": 4.15, + "loss/crossentropy": 1.7625555396080017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21313250809907913, + "step": 9980 + }, + { + "epoch": 0.19964, + "grad_norm": 2.265625, + "grad_norm_var": 0.007342274983723958, + "learning_rate": 0.0001, + "loss": 4.4628, + "loss/crossentropy": 2.19295072555542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23694515973329544, + "step": 9982 + }, + { + "epoch": 0.19968, + "grad_norm": 2.0625, + "grad_norm_var": 0.008739217122395834, + "learning_rate": 0.0001, + "loss": 4.187, + "loss/crossentropy": 2.4073877334594727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22179137915372849, + "step": 9984 + }, + { + "epoch": 0.19972, + "grad_norm": 2.015625, + "grad_norm_var": 0.008125813802083333, + "learning_rate": 0.0001, + "loss": 3.9804, + "loss/crossentropy": 1.8942558765411377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20588286221027374, + "step": 9986 + }, + { + "epoch": 0.19976, + "grad_norm": 1.9375, + "grad_norm_var": 0.012188466389973958, + "learning_rate": 0.0001, + "loss": 4.109, + "loss/crossentropy": 2.220746397972107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2144280970096588, + "step": 9988 + }, + { + "epoch": 0.1998, + "grad_norm": 2.125, + "grad_norm_var": 0.01260986328125, + "learning_rate": 0.0001, + "loss": 4.2279, + "loss/crossentropy": 1.9511706233024597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1998429372906685, + "step": 9990 + }, + { + "epoch": 0.19984, + "grad_norm": 2.125, + "grad_norm_var": 0.012247467041015625, + "learning_rate": 0.0001, + "loss": 4.143, + "loss/crossentropy": 2.249726891517639, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24011528491973877, + "step": 9992 + }, + { + "epoch": 0.19988, + "grad_norm": 2.6875, + "grad_norm_var": 0.03792292277018229, + "learning_rate": 0.0001, + "loss": 4.3104, + "loss/crossentropy": 1.957375943660736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24464774131774902, + "step": 9994 + }, + { + "epoch": 0.19992, + "grad_norm": 2.21875, + "grad_norm_var": 0.039033762613932294, + "learning_rate": 0.0001, + "loss": 4.5492, + "loss/crossentropy": 2.265984058380127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24291887879371643, + "step": 9996 + }, + { + "epoch": 0.19996, + "grad_norm": 2.046875, + "grad_norm_var": 0.03920873006184896, + "learning_rate": 0.0001, + "loss": 4.1432, + "loss/crossentropy": 1.8064668774604797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21571090072393417, + "step": 9998 + }, + { + "epoch": 0.2, + "grad_norm": 2.234375, + "grad_norm_var": 0.03875732421875, + "learning_rate": 0.0001, + "loss": 4.2295, + "loss/crossentropy": 1.9072380661964417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19980185478925705, + "step": 10000 + }, + { + "epoch": 0.20004, + "grad_norm": 2.15625, + "grad_norm_var": 0.036641438802083336, + "learning_rate": 0.0001, + "loss": 4.3919, + "loss/crossentropy": 2.074933707714081, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22161328792572021, + "step": 10002 + }, + { + "epoch": 0.20008, + "grad_norm": 2.015625, + "grad_norm_var": 0.027581532796223957, + "learning_rate": 0.0001, + "loss": 4.0447, + "loss/crossentropy": 1.9344687461853027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1825423538684845, + "step": 10004 + }, + { + "epoch": 0.20012, + "grad_norm": 3.640625, + "grad_norm_var": 0.162158203125, + "learning_rate": 0.0001, + "loss": 4.1116, + "loss/crossentropy": 1.866003930568695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22012518346309662, + "step": 10006 + }, + { + "epoch": 0.20016, + "grad_norm": 2.0, + "grad_norm_var": 0.162158203125, + "learning_rate": 0.0001, + "loss": 4.3517, + "loss/crossentropy": 2.145058751106262, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22737383097410202, + "step": 10008 + }, + { + "epoch": 0.2002, + "grad_norm": 2.09375, + "grad_norm_var": 0.15038960774739582, + "learning_rate": 0.0001, + "loss": 3.9755, + "loss/crossentropy": 1.7828176617622375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22082456946372986, + "step": 10010 + }, + { + "epoch": 0.20024, + "grad_norm": 2.109375, + "grad_norm_var": 0.1537994384765625, + "learning_rate": 0.0001, + "loss": 4.3143, + "loss/crossentropy": 2.1222537755966187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2176828756928444, + "step": 10012 + }, + { + "epoch": 0.20028, + "grad_norm": 2.203125, + "grad_norm_var": 0.15230712890625, + "learning_rate": 0.0001, + "loss": 4.53, + "loss/crossentropy": 2.119267463684082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24002012610435486, + "step": 10014 + }, + { + "epoch": 0.20032, + "grad_norm": 2.078125, + "grad_norm_var": 0.15458577473958332, + "learning_rate": 0.0001, + "loss": 4.3528, + "loss/crossentropy": 2.198129415512085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23767977952957153, + "step": 10016 + }, + { + "epoch": 0.20036, + "grad_norm": 2.125, + "grad_norm_var": 0.15608317057291668, + "learning_rate": 0.0001, + "loss": 4.0228, + "loss/crossentropy": 1.7466872334480286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19916100800037384, + "step": 10018 + }, + { + "epoch": 0.2004, + "grad_norm": 1.9296875, + "grad_norm_var": 0.15812352498372395, + "learning_rate": 0.0001, + "loss": 4.1989, + "loss/crossentropy": 1.9834936261177063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1891954466700554, + "step": 10020 + }, + { + "epoch": 0.20044, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005840810139973959, + "learning_rate": 0.0001, + "loss": 4.0987, + "loss/crossentropy": 1.8997412323951721, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19624030590057373, + "step": 10022 + }, + { + "epoch": 0.20048, + "grad_norm": 2.1875, + "grad_norm_var": 0.0065915425618489586, + "learning_rate": 0.0001, + "loss": 4.2531, + "loss/crossentropy": 2.0051563382148743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2069488987326622, + "step": 10024 + }, + { + "epoch": 0.20052, + "grad_norm": 2.265625, + "grad_norm_var": 0.008107248942057292, + "learning_rate": 0.0001, + "loss": 4.4215, + "loss/crossentropy": 1.8662462830543518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2053019255399704, + "step": 10026 + }, + { + "epoch": 0.20056, + "grad_norm": 2.171875, + "grad_norm_var": 0.02585627237955729, + "learning_rate": 0.0001, + "loss": 4.1444, + "loss/crossentropy": 2.1043936014175415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22167576104402542, + "step": 10028 + }, + { + "epoch": 0.2006, + "grad_norm": 5.78125, + "grad_norm_var": 0.859185536702474, + "learning_rate": 0.0001, + "loss": 4.3685, + "loss/crossentropy": 2.1248152256011963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22400003671646118, + "step": 10030 + }, + { + "epoch": 0.20064, + "grad_norm": 2.078125, + "grad_norm_var": 0.852441151936849, + "learning_rate": 0.0001, + "loss": 4.1971, + "loss/crossentropy": 2.326894521713257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24597454071044922, + "step": 10032 + }, + { + "epoch": 0.20068, + "grad_norm": 2.203125, + "grad_norm_var": 0.8413164774576823, + "learning_rate": 0.0001, + "loss": 4.1801, + "loss/crossentropy": 1.8590435981750488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20270948112010956, + "step": 10034 + }, + { + "epoch": 0.20072, + "grad_norm": 2.0, + "grad_norm_var": 0.82972412109375, + "learning_rate": 0.0001, + "loss": 4.5098, + "loss/crossentropy": 2.0383604168891907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21504472196102142, + "step": 10036 + }, + { + "epoch": 0.20076, + "grad_norm": 2.078125, + "grad_norm_var": 0.81461181640625, + "learning_rate": 0.0001, + "loss": 4.599, + "loss/crossentropy": 2.42622447013855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2143569439649582, + "step": 10038 + }, + { + "epoch": 0.2008, + "grad_norm": 2.109375, + "grad_norm_var": 0.825640614827474, + "learning_rate": 0.0001, + "loss": 4.3216, + "loss/crossentropy": 2.0825703144073486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21215181052684784, + "step": 10040 + }, + { + "epoch": 0.20084, + "grad_norm": 2.140625, + "grad_norm_var": 0.8326515197753906, + "learning_rate": 0.0001, + "loss": 3.9615, + "loss/crossentropy": 2.1682112216949463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23538567870855331, + "step": 10042 + }, + { + "epoch": 0.20088, + "grad_norm": 1.984375, + "grad_norm_var": 0.8466957092285157, + "learning_rate": 0.0001, + "loss": 4.2244, + "loss/crossentropy": 2.0161439180374146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21078374981880188, + "step": 10044 + }, + { + "epoch": 0.20092, + "grad_norm": 2.078125, + "grad_norm_var": 0.018143463134765624, + "learning_rate": 0.0001, + "loss": 4.5491, + "loss/crossentropy": 2.2262184619903564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2855340391397476, + "step": 10046 + }, + { + "epoch": 0.20096, + "grad_norm": 2.203125, + "grad_norm_var": 0.011321767171223959, + "learning_rate": 0.0001, + "loss": 4.5078, + "loss/crossentropy": 2.454360246658325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2287754938006401, + "step": 10048 + }, + { + "epoch": 0.201, + "grad_norm": 2.015625, + "grad_norm_var": 0.007940419514973958, + "learning_rate": 0.0001, + "loss": 4.1146, + "loss/crossentropy": 1.970844566822052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20197579264640808, + "step": 10050 + }, + { + "epoch": 0.20104, + "grad_norm": 2.03125, + "grad_norm_var": 0.006211090087890625, + "learning_rate": 0.0001, + "loss": 4.3573, + "loss/crossentropy": 2.0942559242248535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21927295625209808, + "step": 10052 + }, + { + "epoch": 0.20108, + "grad_norm": 2.0, + "grad_norm_var": 0.005771636962890625, + "learning_rate": 0.0001, + "loss": 4.036, + "loss/crossentropy": 1.868508517742157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1838754191994667, + "step": 10054 + }, + { + "epoch": 0.20112, + "grad_norm": 2.15625, + "grad_norm_var": 0.0058095296223958336, + "learning_rate": 0.0001, + "loss": 4.2101, + "loss/crossentropy": 2.028561532497406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21768346428871155, + "step": 10056 + }, + { + "epoch": 0.20116, + "grad_norm": 1.9375, + "grad_norm_var": 0.006669108072916667, + "learning_rate": 0.0001, + "loss": 3.9803, + "loss/crossentropy": 2.3005030155181885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23332027345895767, + "step": 10058 + }, + { + "epoch": 0.2012, + "grad_norm": 2.03125, + "grad_norm_var": 0.0065826416015625, + "learning_rate": 0.0001, + "loss": 4.089, + "loss/crossentropy": 1.7793474793434143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18733646720647812, + "step": 10060 + }, + { + "epoch": 0.20124, + "grad_norm": 2.015625, + "grad_norm_var": 0.005125935872395833, + "learning_rate": 0.0001, + "loss": 4.0197, + "loss/crossentropy": 2.0612844228744507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21674886345863342, + "step": 10062 + }, + { + "epoch": 0.20128, + "grad_norm": 2.078125, + "grad_norm_var": 0.004500071207682292, + "learning_rate": 0.0001, + "loss": 4.237, + "loss/crossentropy": 2.3219568729400635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20989079773426056, + "step": 10064 + }, + { + "epoch": 0.20132, + "grad_norm": 2.078125, + "grad_norm_var": 0.004659016927083333, + "learning_rate": 0.0001, + "loss": 4.0959, + "loss/crossentropy": 2.0941001176834106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19992788136005402, + "step": 10066 + }, + { + "epoch": 0.20136, + "grad_norm": 2.09375, + "grad_norm_var": 0.0038330078125, + "learning_rate": 0.0001, + "loss": 4.053, + "loss/crossentropy": 1.7448341250419617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2108384072780609, + "step": 10068 + }, + { + "epoch": 0.2014, + "grad_norm": 2.0625, + "grad_norm_var": 0.0059397379557291664, + "learning_rate": 0.0001, + "loss": 4.263, + "loss/crossentropy": 2.0475903749465942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19875742495059967, + "step": 10070 + }, + { + "epoch": 0.20144, + "grad_norm": 2.140625, + "grad_norm_var": 0.0499755859375, + "learning_rate": 0.0001, + "loss": 4.1014, + "loss/crossentropy": 2.0624433755874634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20577051490545273, + "step": 10072 + }, + { + "epoch": 0.20148, + "grad_norm": 2.03125, + "grad_norm_var": 0.04807840983072917, + "learning_rate": 0.0001, + "loss": 4.3612, + "loss/crossentropy": 2.194978952407837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19964048266410828, + "step": 10074 + }, + { + "epoch": 0.20152, + "grad_norm": 2.0625, + "grad_norm_var": 0.04691162109375, + "learning_rate": 0.0001, + "loss": 4.4025, + "loss/crossentropy": 2.011984169483185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22259068489074707, + "step": 10076 + }, + { + "epoch": 0.20156, + "grad_norm": 2.203125, + "grad_norm_var": 0.0458740234375, + "learning_rate": 0.0001, + "loss": 4.3539, + "loss/crossentropy": 2.098285675048828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2232050970196724, + "step": 10078 + }, + { + "epoch": 0.2016, + "grad_norm": 1.9609375, + "grad_norm_var": 0.04423726399739583, + "learning_rate": 0.0001, + "loss": 4.2772, + "loss/crossentropy": 2.313677191734314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2224883735179901, + "step": 10080 + }, + { + "epoch": 0.20164, + "grad_norm": 1.953125, + "grad_norm_var": 0.04454523722330729, + "learning_rate": 0.0001, + "loss": 4.1523, + "loss/crossentropy": 2.3712470531463623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23282987624406815, + "step": 10082 + }, + { + "epoch": 0.20168, + "grad_norm": 2.125, + "grad_norm_var": 0.043342844645182295, + "learning_rate": 0.0001, + "loss": 4.3173, + "loss/crossentropy": 2.204525947570801, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2384341061115265, + "step": 10084 + }, + { + "epoch": 0.20172, + "grad_norm": 1.9453125, + "grad_norm_var": 0.04528401692708333, + "learning_rate": 0.0001, + "loss": 3.9497, + "loss/crossentropy": 1.8102558851242065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17472535371780396, + "step": 10086 + }, + { + "epoch": 0.20176, + "grad_norm": 2.046875, + "grad_norm_var": 0.005711873372395833, + "learning_rate": 0.0001, + "loss": 4.2119, + "loss/crossentropy": 2.041890263557434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21201669424772263, + "step": 10088 + }, + { + "epoch": 0.2018, + "grad_norm": 2.125, + "grad_norm_var": 0.006026204427083333, + "learning_rate": 0.0001, + "loss": 4.3869, + "loss/crossentropy": 1.9904854893684387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23453570157289505, + "step": 10090 + }, + { + "epoch": 0.20184, + "grad_norm": 2.0625, + "grad_norm_var": 0.00716552734375, + "learning_rate": 0.0001, + "loss": 4.4324, + "loss/crossentropy": 2.199320912361145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22087062150239944, + "step": 10092 + }, + { + "epoch": 0.20188, + "grad_norm": 2.203125, + "grad_norm_var": 0.0073150634765625, + "learning_rate": 0.0001, + "loss": 4.392, + "loss/crossentropy": 2.321953535079956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2463374137878418, + "step": 10094 + }, + { + "epoch": 0.20192, + "grad_norm": 2.046875, + "grad_norm_var": 0.006076812744140625, + "learning_rate": 0.0001, + "loss": 4.3276, + "loss/crossentropy": 2.1109840869903564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23713566362857819, + "step": 10096 + }, + { + "epoch": 0.20196, + "grad_norm": 1.9375, + "grad_norm_var": 0.006322987874348958, + "learning_rate": 0.0001, + "loss": 3.9339, + "loss/crossentropy": 1.9126858711242676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20392102003097534, + "step": 10098 + }, + { + "epoch": 0.202, + "grad_norm": 2.09375, + "grad_norm_var": 0.006268056233723959, + "learning_rate": 0.0001, + "loss": 4.2902, + "loss/crossentropy": 2.073318660259247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20465004444122314, + "step": 10100 + }, + { + "epoch": 0.20204, + "grad_norm": 2.390625, + "grad_norm_var": 0.009837849934895834, + "learning_rate": 0.0001, + "loss": 4.1883, + "loss/crossentropy": 1.7532709836959839, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22591036558151245, + "step": 10102 + }, + { + "epoch": 0.20208, + "grad_norm": 2.21875, + "grad_norm_var": 0.010660807291666666, + "learning_rate": 0.0001, + "loss": 4.3088, + "loss/crossentropy": 2.1874141693115234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.227812297642231, + "step": 10104 + }, + { + "epoch": 0.20212, + "grad_norm": 2.015625, + "grad_norm_var": 0.014095052083333334, + "learning_rate": 0.0001, + "loss": 4.4738, + "loss/crossentropy": 2.522923469543457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24104881286621094, + "step": 10106 + }, + { + "epoch": 0.20216, + "grad_norm": 2.21875, + "grad_norm_var": 0.013963826497395833, + "learning_rate": 0.0001, + "loss": 4.2823, + "loss/crossentropy": 1.9359918236732483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19563085585832596, + "step": 10108 + }, + { + "epoch": 0.2022, + "grad_norm": 2.015625, + "grad_norm_var": 0.015315755208333334, + "learning_rate": 0.0001, + "loss": 4.3582, + "loss/crossentropy": 2.390757203102112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24748887866735458, + "step": 10110 + }, + { + "epoch": 0.20224, + "grad_norm": 2.0625, + "grad_norm_var": 0.015013631184895833, + "learning_rate": 0.0001, + "loss": 4.5576, + "loss/crossentropy": 2.419153571128845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2228071466088295, + "step": 10112 + }, + { + "epoch": 0.20228, + "grad_norm": 2.03125, + "grad_norm_var": 0.015404256184895833, + "learning_rate": 0.0001, + "loss": 4.0235, + "loss/crossentropy": 1.7460771799087524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19431117177009583, + "step": 10114 + }, + { + "epoch": 0.20232, + "grad_norm": 2.140625, + "grad_norm_var": 0.01754150390625, + "learning_rate": 0.0001, + "loss": 4.601, + "loss/crossentropy": 2.308094024658203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24104833602905273, + "step": 10116 + }, + { + "epoch": 0.20236, + "grad_norm": 2.015625, + "grad_norm_var": 0.013374837239583333, + "learning_rate": 0.0001, + "loss": 4.1696, + "loss/crossentropy": 2.0273314118385315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2325449138879776, + "step": 10118 + }, + { + "epoch": 0.2024, + "grad_norm": 1.9453125, + "grad_norm_var": 0.013792928059895833, + "learning_rate": 0.0001, + "loss": 4.016, + "loss/crossentropy": 2.1261476278305054, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21481933444738388, + "step": 10120 + }, + { + "epoch": 0.20244, + "grad_norm": 2.03125, + "grad_norm_var": 0.0107818603515625, + "learning_rate": 0.0001, + "loss": 4.478, + "loss/crossentropy": 2.5006214380264282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25569023191928864, + "step": 10122 + }, + { + "epoch": 0.20248, + "grad_norm": 1.8671875, + "grad_norm_var": 0.011775461832682292, + "learning_rate": 0.0001, + "loss": 4.3208, + "loss/crossentropy": 2.2033116817474365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2601129561662674, + "step": 10124 + }, + { + "epoch": 0.20252, + "grad_norm": 2.09375, + "grad_norm_var": 0.011572011311848958, + "learning_rate": 0.0001, + "loss": 4.1098, + "loss/crossentropy": 2.2208765745162964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20958464592695236, + "step": 10126 + }, + { + "epoch": 0.20256, + "grad_norm": 1.9765625, + "grad_norm_var": 0.011848958333333333, + "learning_rate": 0.0001, + "loss": 3.943, + "loss/crossentropy": 1.7752392888069153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22697371244430542, + "step": 10128 + }, + { + "epoch": 0.2026, + "grad_norm": 2.0625, + "grad_norm_var": 0.011937459309895834, + "learning_rate": 0.0001, + "loss": 4.0529, + "loss/crossentropy": 1.6444379687309265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18721824884414673, + "step": 10130 + }, + { + "epoch": 0.20264, + "grad_norm": 2.046875, + "grad_norm_var": 0.007271321614583334, + "learning_rate": 0.0001, + "loss": 4.1796, + "loss/crossentropy": 1.7681297659873962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24296899884939194, + "step": 10132 + }, + { + "epoch": 0.20268, + "grad_norm": 2.015625, + "grad_norm_var": 0.007614898681640625, + "learning_rate": 0.0001, + "loss": 4.1593, + "loss/crossentropy": 1.8710024952888489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1891346201300621, + "step": 10134 + }, + { + "epoch": 0.20272, + "grad_norm": 2.125, + "grad_norm_var": 0.007515462239583334, + "learning_rate": 0.0001, + "loss": 4.0822, + "loss/crossentropy": 2.0258530974388123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22128118574619293, + "step": 10136 + }, + { + "epoch": 0.20276, + "grad_norm": 2.015625, + "grad_norm_var": 1.7479237874348958, + "learning_rate": 0.0001, + "loss": 4.3091, + "loss/crossentropy": 2.3366141319274902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2612725794315338, + "step": 10138 + }, + { + "epoch": 0.2028, + "grad_norm": 2.0625, + "grad_norm_var": 1.739208730061849, + "learning_rate": 0.0001, + "loss": 4.357, + "loss/crossentropy": 2.152353823184967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2239176332950592, + "step": 10140 + }, + { + "epoch": 0.20284, + "grad_norm": 2.15625, + "grad_norm_var": 1.7346433003743489, + "learning_rate": 0.0001, + "loss": 4.2552, + "loss/crossentropy": 1.8189843893051147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21529949456453323, + "step": 10142 + }, + { + "epoch": 0.20288, + "grad_norm": 2.03125, + "grad_norm_var": 1.7219970703125, + "learning_rate": 0.0001, + "loss": 4.3179, + "loss/crossentropy": 2.308253049850464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21748851984739304, + "step": 10144 + }, + { + "epoch": 0.20292, + "grad_norm": 2.015625, + "grad_norm_var": 1.7289377848307292, + "learning_rate": 0.0001, + "loss": 4.1762, + "loss/crossentropy": 1.953293800354004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1948333978652954, + "step": 10146 + }, + { + "epoch": 0.20296, + "grad_norm": 1.953125, + "grad_norm_var": 1.726512654622396, + "learning_rate": 0.0001, + "loss": 3.9573, + "loss/crossentropy": 2.1122325658798218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2085581198334694, + "step": 10148 + }, + { + "epoch": 0.203, + "grad_norm": 2.0625, + "grad_norm_var": 1.716387685139974, + "learning_rate": 0.0001, + "loss": 4.4209, + "loss/crossentropy": 2.450512409210205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23670841753482819, + "step": 10150 + }, + { + "epoch": 0.20304, + "grad_norm": 1.984375, + "grad_norm_var": 1.7187327067057292, + "learning_rate": 0.0001, + "loss": 4.1131, + "loss/crossentropy": 2.3858957290649414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23381955921649933, + "step": 10152 + }, + { + "epoch": 0.20308, + "grad_norm": 2.0625, + "grad_norm_var": 0.0060618082682291664, + "learning_rate": 0.0001, + "loss": 4.2154, + "loss/crossentropy": 2.1214572191238403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22399096935987473, + "step": 10154 + }, + { + "epoch": 0.20312, + "grad_norm": 1.90625, + "grad_norm_var": 0.0082916259765625, + "learning_rate": 0.0001, + "loss": 4.0487, + "loss/crossentropy": 1.9299064874649048, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19886164367198944, + "step": 10156 + }, + { + "epoch": 0.20316, + "grad_norm": 2.234375, + "grad_norm_var": 0.009877268473307292, + "learning_rate": 0.0001, + "loss": 4.108, + "loss/crossentropy": 1.9582993388175964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1859385445713997, + "step": 10158 + }, + { + "epoch": 0.2032, + "grad_norm": 2.15625, + "grad_norm_var": 0.008790842692057292, + "learning_rate": 0.0001, + "loss": 4.408, + "loss/crossentropy": 2.0555814504623413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25231435894966125, + "step": 10160 + }, + { + "epoch": 0.20324, + "grad_norm": 2.109375, + "grad_norm_var": 0.009090169270833334, + "learning_rate": 0.0001, + "loss": 4.2159, + "loss/crossentropy": 1.9849395155906677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20529592037200928, + "step": 10162 + }, + { + "epoch": 0.20328, + "grad_norm": 2.125, + "grad_norm_var": 0.0071441650390625, + "learning_rate": 0.0001, + "loss": 4.5619, + "loss/crossentropy": 1.9529971480369568, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1997845619916916, + "step": 10164 + }, + { + "epoch": 0.20332, + "grad_norm": 1.953125, + "grad_norm_var": 0.009330240885416667, + "learning_rate": 0.0001, + "loss": 4.3117, + "loss/crossentropy": 1.9885541200637817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19847019761800766, + "step": 10166 + }, + { + "epoch": 0.20336, + "grad_norm": 2.21875, + "grad_norm_var": 0.010013834635416666, + "learning_rate": 0.0001, + "loss": 4.3431, + "loss/crossentropy": 1.8039852380752563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19332807511091232, + "step": 10168 + }, + { + "epoch": 0.2034, + "grad_norm": 2.125, + "grad_norm_var": 0.01004638671875, + "learning_rate": 0.0001, + "loss": 4.4287, + "loss/crossentropy": 2.1633352041244507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22904948145151138, + "step": 10170 + }, + { + "epoch": 0.20344, + "grad_norm": 2.046875, + "grad_norm_var": 0.0070953369140625, + "learning_rate": 0.0001, + "loss": 4.1148, + "loss/crossentropy": 2.2646039724349976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22390951961278915, + "step": 10172 + }, + { + "epoch": 0.20348, + "grad_norm": 2.015625, + "grad_norm_var": 0.005741119384765625, + "learning_rate": 0.0001, + "loss": 4.4114, + "loss/crossentropy": 2.604608416557312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25521157681941986, + "step": 10174 + }, + { + "epoch": 0.20352, + "grad_norm": 2.125, + "grad_norm_var": 0.016257476806640626, + "learning_rate": 0.0001, + "loss": 4.3319, + "loss/crossentropy": 2.286113977432251, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23809745162725449, + "step": 10176 + }, + { + "epoch": 0.20356, + "grad_norm": 1.9453125, + "grad_norm_var": 0.01777928670247396, + "learning_rate": 0.0001, + "loss": 3.9951, + "loss/crossentropy": 2.0746694207191467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21103999018669128, + "step": 10178 + }, + { + "epoch": 0.2036, + "grad_norm": 2.0, + "grad_norm_var": 0.018381500244140626, + "learning_rate": 0.0001, + "loss": 3.9892, + "loss/crossentropy": 1.9076440930366516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2048335075378418, + "step": 10180 + }, + { + "epoch": 0.20364, + "grad_norm": 2.0, + "grad_norm_var": 0.0178863525390625, + "learning_rate": 0.0001, + "loss": 4.1768, + "loss/crossentropy": 1.8896904587745667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20643991231918335, + "step": 10182 + }, + { + "epoch": 0.20368, + "grad_norm": 2.0625, + "grad_norm_var": 0.016747029622395833, + "learning_rate": 0.0001, + "loss": 4.215, + "loss/crossentropy": 2.156657338142395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22853681445121765, + "step": 10184 + }, + { + "epoch": 0.20372, + "grad_norm": 2.203125, + "grad_norm_var": 0.017772420247395834, + "learning_rate": 0.0001, + "loss": 4.6307, + "loss/crossentropy": 2.3767203092575073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2422076091170311, + "step": 10186 + }, + { + "epoch": 0.20376, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01834894816080729, + "learning_rate": 0.0001, + "loss": 4.3402, + "loss/crossentropy": 2.5728834867477417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22912005335092545, + "step": 10188 + }, + { + "epoch": 0.2038, + "grad_norm": 1.9453125, + "grad_norm_var": 0.02005182902018229, + "learning_rate": 0.0001, + "loss": 4.11, + "loss/crossentropy": 2.1542125940322876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19575025141239166, + "step": 10190 + }, + { + "epoch": 0.20384, + "grad_norm": 1.9765625, + "grad_norm_var": 0.004610188802083333, + "learning_rate": 0.0001, + "loss": 3.9114, + "loss/crossentropy": 1.5845852494239807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1845107451081276, + "step": 10192 + }, + { + "epoch": 0.20388, + "grad_norm": 2.34375, + "grad_norm_var": 0.010957590738932292, + "learning_rate": 0.0001, + "loss": 4.2193, + "loss/crossentropy": 2.1166247129440308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2352868989109993, + "step": 10194 + }, + { + "epoch": 0.20392, + "grad_norm": 2.0, + "grad_norm_var": 0.010941314697265624, + "learning_rate": 0.0001, + "loss": 4.3374, + "loss/crossentropy": 2.0129401683807373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22231722623109818, + "step": 10196 + }, + { + "epoch": 0.20396, + "grad_norm": 2.125, + "grad_norm_var": 0.010595703125, + "learning_rate": 0.0001, + "loss": 3.9709, + "loss/crossentropy": 1.7132073044776917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18660317361354828, + "step": 10198 + }, + { + "epoch": 0.204, + "grad_norm": 2.21875, + "grad_norm_var": 0.016743977864583332, + "learning_rate": 0.0001, + "loss": 4.3959, + "loss/crossentropy": 2.0064845085144043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20879874378442764, + "step": 10200 + }, + { + "epoch": 0.20404, + "grad_norm": 1.9140625, + "grad_norm_var": 0.01803766886393229, + "learning_rate": 0.0001, + "loss": 4.3558, + "loss/crossentropy": 2.4205944538116455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24201631546020508, + "step": 10202 + }, + { + "epoch": 0.20408, + "grad_norm": 1.9375, + "grad_norm_var": 0.019694010416666668, + "learning_rate": 0.0001, + "loss": 3.8977, + "loss/crossentropy": 2.0392255187034607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21578273177146912, + "step": 10204 + }, + { + "epoch": 0.20412, + "grad_norm": 2.25, + "grad_norm_var": 0.019230143229166666, + "learning_rate": 0.0001, + "loss": 4.1414, + "loss/crossentropy": 1.8652849197387695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1896674558520317, + "step": 10206 + }, + { + "epoch": 0.20416, + "grad_norm": 2.40625, + "grad_norm_var": 0.02332331339518229, + "learning_rate": 0.0001, + "loss": 4.7698, + "loss/crossentropy": 2.016683042049408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23144973814487457, + "step": 10208 + }, + { + "epoch": 0.2042, + "grad_norm": 2.046875, + "grad_norm_var": 0.020401763916015624, + "learning_rate": 0.0001, + "loss": 4.3691, + "loss/crossentropy": 1.9395010471343994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19717292487621307, + "step": 10210 + }, + { + "epoch": 0.20424, + "grad_norm": 2.234375, + "grad_norm_var": 0.0210845947265625, + "learning_rate": 0.0001, + "loss": 4.0907, + "loss/crossentropy": 1.7626919150352478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.200415201485157, + "step": 10212 + }, + { + "epoch": 0.20428, + "grad_norm": 2.171875, + "grad_norm_var": 0.021955362955729165, + "learning_rate": 0.0001, + "loss": 4.3411, + "loss/crossentropy": 2.3014339208602905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22324562072753906, + "step": 10214 + }, + { + "epoch": 0.20432, + "grad_norm": 1.921875, + "grad_norm_var": 0.019636027018229165, + "learning_rate": 0.0001, + "loss": 4.1237, + "loss/crossentropy": 1.906779408454895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2223847657442093, + "step": 10216 + }, + { + "epoch": 0.20436, + "grad_norm": 2.046875, + "grad_norm_var": 0.01789118448893229, + "learning_rate": 0.0001, + "loss": 4.4555, + "loss/crossentropy": 2.085246205329895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21039249747991562, + "step": 10218 + }, + { + "epoch": 0.2044, + "grad_norm": 2.171875, + "grad_norm_var": 0.014806874593098958, + "learning_rate": 0.0001, + "loss": 4.4477, + "loss/crossentropy": 2.213107645511627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22880114614963531, + "step": 10220 + }, + { + "epoch": 0.20444, + "grad_norm": 2.078125, + "grad_norm_var": 0.013392893473307292, + "learning_rate": 0.0001, + "loss": 3.9015, + "loss/crossentropy": 1.9510034322738647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20935232937335968, + "step": 10222 + }, + { + "epoch": 0.20448, + "grad_norm": 1.921875, + "grad_norm_var": 0.008906809488932292, + "learning_rate": 0.0001, + "loss": 4.1237, + "loss/crossentropy": 1.8595823645591736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20829308032989502, + "step": 10224 + }, + { + "epoch": 0.20452, + "grad_norm": 2.015625, + "grad_norm_var": 0.008990224202473958, + "learning_rate": 0.0001, + "loss": 4.1287, + "loss/crossentropy": 1.8250519037246704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18982955813407898, + "step": 10226 + }, + { + "epoch": 0.20456, + "grad_norm": 2.15625, + "grad_norm_var": 0.0074859619140625, + "learning_rate": 0.0001, + "loss": 4.3656, + "loss/crossentropy": 2.410372495651245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22655458748340607, + "step": 10228 + }, + { + "epoch": 0.2046, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007533518473307291, + "learning_rate": 0.0001, + "loss": 4.2581, + "loss/crossentropy": 2.321051836013794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2280098795890808, + "step": 10230 + }, + { + "epoch": 0.20464, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007380167643229167, + "learning_rate": 0.0001, + "loss": 3.9927, + "loss/crossentropy": 2.266388177871704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2341095432639122, + "step": 10232 + }, + { + "epoch": 0.20468, + "grad_norm": 2.15625, + "grad_norm_var": 0.007478841145833333, + "learning_rate": 0.0001, + "loss": 4.5263, + "loss/crossentropy": 2.390430450439453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23684432357549667, + "step": 10234 + }, + { + "epoch": 0.20472, + "grad_norm": 3.390625, + "grad_norm_var": 0.11719563802083334, + "learning_rate": 0.0001, + "loss": 4.4269, + "loss/crossentropy": 2.07179594039917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21902770549058914, + "step": 10236 + }, + { + "epoch": 0.20476, + "grad_norm": 2.1875, + "grad_norm_var": 0.12704264322916667, + "learning_rate": 0.0001, + "loss": 4.3473, + "loss/crossentropy": 1.686942458152771, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20007195323705673, + "step": 10238 + }, + { + "epoch": 0.2048, + "grad_norm": 2.0, + "grad_norm_var": 0.26913248697916664, + "learning_rate": 0.0001, + "loss": 4.2375, + "loss/crossentropy": 2.0421791076660156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2187313288450241, + "step": 10240 + }, + { + "epoch": 0.20484, + "grad_norm": 2.28125, + "grad_norm_var": 0.26201985677083334, + "learning_rate": 0.0001, + "loss": 4.1641, + "loss/crossentropy": 1.9503712058067322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19879397749900818, + "step": 10242 + }, + { + "epoch": 0.20488, + "grad_norm": 3.21875, + "grad_norm_var": 0.82620849609375, + "learning_rate": 0.0001, + "loss": 4.413, + "loss/crossentropy": 2.1453936100006104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22767861932516098, + "step": 10244 + }, + { + "epoch": 0.20492, + "grad_norm": 2.171875, + "grad_norm_var": 0.808221181233724, + "learning_rate": 0.0001, + "loss": 4.1924, + "loss/crossentropy": 1.9007731080055237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20069654285907745, + "step": 10246 + }, + { + "epoch": 0.20496, + "grad_norm": 1.875, + "grad_norm_var": 0.8184832255045573, + "learning_rate": 0.0001, + "loss": 4.0685, + "loss/crossentropy": 2.0545393228530884, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1983654722571373, + "step": 10248 + }, + { + "epoch": 0.205, + "grad_norm": 2.1875, + "grad_norm_var": 0.804272206624349, + "learning_rate": 0.0001, + "loss": 4.3391, + "loss/crossentropy": 2.158636450767517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24239712953567505, + "step": 10250 + }, + { + "epoch": 0.20504, + "grad_norm": 2.15625, + "grad_norm_var": 0.7762794494628906, + "learning_rate": 0.0001, + "loss": 4.1608, + "loss/crossentropy": 2.1119120121002197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2136205956339836, + "step": 10252 + }, + { + "epoch": 0.20508, + "grad_norm": 2.21875, + "grad_norm_var": 0.7838417053222656, + "learning_rate": 0.0001, + "loss": 4.3184, + "loss/crossentropy": 2.0690027475357056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20616184175014496, + "step": 10254 + }, + { + "epoch": 0.20512, + "grad_norm": 2.0625, + "grad_norm_var": 0.6926798502604167, + "learning_rate": 0.0001, + "loss": 4.0635, + "loss/crossentropy": 2.178507924079895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21580064296722412, + "step": 10256 + }, + { + "epoch": 0.20516, + "grad_norm": 2.140625, + "grad_norm_var": 0.6889719645182292, + "learning_rate": 0.0001, + "loss": 4.1433, + "loss/crossentropy": 2.292190670967102, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22163032740354538, + "step": 10258 + }, + { + "epoch": 0.2052, + "grad_norm": 2.015625, + "grad_norm_var": 0.03568115234375, + "learning_rate": 0.0001, + "loss": 4.0138, + "loss/crossentropy": 2.066649317741394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2081037238240242, + "step": 10260 + }, + { + "epoch": 0.20524, + "grad_norm": 2.296875, + "grad_norm_var": 0.05111490885416667, + "learning_rate": 0.0001, + "loss": 4.4315, + "loss/crossentropy": 1.9017595052719116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19284649938344955, + "step": 10262 + }, + { + "epoch": 0.20528, + "grad_norm": 2.125, + "grad_norm_var": 0.04326960245768229, + "learning_rate": 0.0001, + "loss": 4.0974, + "loss/crossentropy": 2.1215697526931763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2244422286748886, + "step": 10264 + }, + { + "epoch": 0.20532, + "grad_norm": 2.03125, + "grad_norm_var": 0.02539647420247396, + "learning_rate": 0.0001, + "loss": 4.3231, + "loss/crossentropy": 2.170191764831543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2308938354253769, + "step": 10266 + }, + { + "epoch": 0.20536, + "grad_norm": 2.140625, + "grad_norm_var": 0.02535985310872396, + "learning_rate": 0.0001, + "loss": 4.3472, + "loss/crossentropy": 2.0430655479431152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2435612976551056, + "step": 10268 + }, + { + "epoch": 0.2054, + "grad_norm": 2.203125, + "grad_norm_var": 0.025394439697265625, + "learning_rate": 0.0001, + "loss": 4.536, + "loss/crossentropy": 2.3141634464263916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2234780564904213, + "step": 10270 + }, + { + "epoch": 0.20544, + "grad_norm": 2.09375, + "grad_norm_var": 0.023371378580729168, + "learning_rate": 0.0001, + "loss": 4.1944, + "loss/crossentropy": 2.310709834098816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2505229711532593, + "step": 10272 + }, + { + "epoch": 0.20548, + "grad_norm": 2.078125, + "grad_norm_var": 0.02434056599934896, + "learning_rate": 0.0001, + "loss": 4.0664, + "loss/crossentropy": 1.9158611297607422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2062816470861435, + "step": 10274 + }, + { + "epoch": 0.20552, + "grad_norm": 2.015625, + "grad_norm_var": 0.02697728474934896, + "learning_rate": 0.0001, + "loss": 4.0545, + "loss/crossentropy": 2.0835859179496765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21051569283008575, + "step": 10276 + }, + { + "epoch": 0.20556, + "grad_norm": 2.125, + "grad_norm_var": 0.006951649983723958, + "learning_rate": 0.0001, + "loss": 4.4047, + "loss/crossentropy": 1.9533037543296814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20007294416427612, + "step": 10278 + }, + { + "epoch": 0.2056, + "grad_norm": 1.9453125, + "grad_norm_var": 0.010701497395833334, + "learning_rate": 0.0001, + "loss": 4.4499, + "loss/crossentropy": 2.3090076446533203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2295464426279068, + "step": 10280 + }, + { + "epoch": 0.20564, + "grad_norm": 2.171875, + "grad_norm_var": 0.014989217122395834, + "learning_rate": 0.0001, + "loss": 4.2708, + "loss/crossentropy": 2.2951393127441406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22297964990139008, + "step": 10282 + }, + { + "epoch": 0.20568, + "grad_norm": 2.09375, + "grad_norm_var": 0.014574178059895833, + "learning_rate": 0.0001, + "loss": 4.1153, + "loss/crossentropy": 2.311514675617218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2287898138165474, + "step": 10284 + }, + { + "epoch": 0.20572, + "grad_norm": 1.921875, + "grad_norm_var": 0.014989217122395834, + "learning_rate": 0.0001, + "loss": 4.2287, + "loss/crossentropy": 2.277890205383301, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21169421076774597, + "step": 10286 + }, + { + "epoch": 0.20576, + "grad_norm": 2.109375, + "grad_norm_var": 0.015143839518229167, + "learning_rate": 0.0001, + "loss": 4.4448, + "loss/crossentropy": 2.070693612098694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21682647615671158, + "step": 10288 + }, + { + "epoch": 0.2058, + "grad_norm": 1.984375, + "grad_norm_var": 0.014788564046223958, + "learning_rate": 0.0001, + "loss": 4.205, + "loss/crossentropy": 2.223360061645508, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24135399609804153, + "step": 10290 + }, + { + "epoch": 0.20584, + "grad_norm": 2.40625, + "grad_norm_var": 0.018155670166015624, + "learning_rate": 0.0001, + "loss": 4.3304, + "loss/crossentropy": 2.430101752281189, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21579021215438843, + "step": 10292 + }, + { + "epoch": 0.20588, + "grad_norm": 2.03125, + "grad_norm_var": 0.018173980712890624, + "learning_rate": 0.0001, + "loss": 4.1615, + "loss/crossentropy": 1.960309624671936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22619223594665527, + "step": 10294 + }, + { + "epoch": 0.20592, + "grad_norm": 2.515625, + "grad_norm_var": 0.024217732747395835, + "learning_rate": 0.0001, + "loss": 4.8862, + "loss/crossentropy": 2.0035970211029053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20148956030607224, + "step": 10296 + }, + { + "epoch": 0.20596, + "grad_norm": 2.09375, + "grad_norm_var": 0.021968587239583334, + "learning_rate": 0.0001, + "loss": 4.175, + "loss/crossentropy": 1.976987361907959, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21035870164632797, + "step": 10298 + }, + { + "epoch": 0.206, + "grad_norm": 2.140625, + "grad_norm_var": 0.021675618489583333, + "learning_rate": 0.0001, + "loss": 4.1596, + "loss/crossentropy": 2.0631470680236816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21725613623857498, + "step": 10300 + }, + { + "epoch": 0.20604, + "grad_norm": 2.421875, + "grad_norm_var": 0.02693456013997396, + "learning_rate": 0.0001, + "loss": 4.4734, + "loss/crossentropy": 2.2747987508773804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2295055389404297, + "step": 10302 + }, + { + "epoch": 0.20608, + "grad_norm": 1.9609375, + "grad_norm_var": 0.029255167643229166, + "learning_rate": 0.0001, + "loss": 4.1255, + "loss/crossentropy": 2.0811264514923096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21890189498662949, + "step": 10304 + }, + { + "epoch": 0.20612, + "grad_norm": 2.203125, + "grad_norm_var": 0.028319295247395834, + "learning_rate": 0.0001, + "loss": 4.4391, + "loss/crossentropy": 2.2474766969680786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2430667206645012, + "step": 10306 + }, + { + "epoch": 0.20616, + "grad_norm": 2.03125, + "grad_norm_var": 0.024933878580729166, + "learning_rate": 0.0001, + "loss": 4.3016, + "loss/crossentropy": 1.899698257446289, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21101202815771103, + "step": 10308 + }, + { + "epoch": 0.2062, + "grad_norm": 2.078125, + "grad_norm_var": 0.023851521809895835, + "learning_rate": 0.0001, + "loss": 3.9769, + "loss/crossentropy": 1.6432967782020569, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19004638493061066, + "step": 10310 + }, + { + "epoch": 0.20624, + "grad_norm": 2.046875, + "grad_norm_var": 0.0149658203125, + "learning_rate": 0.0001, + "loss": 4.3901, + "loss/crossentropy": 1.9606398940086365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24546240270137787, + "step": 10312 + }, + { + "epoch": 0.20628, + "grad_norm": 2.15625, + "grad_norm_var": 0.015021769205729167, + "learning_rate": 0.0001, + "loss": 4.0988, + "loss/crossentropy": 2.140414595603943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2428836077451706, + "step": 10314 + }, + { + "epoch": 0.20632, + "grad_norm": 2.3125, + "grad_norm_var": 0.018393707275390626, + "learning_rate": 0.0001, + "loss": 4.3768, + "loss/crossentropy": 2.0886260271072388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2201090231537819, + "step": 10316 + }, + { + "epoch": 0.20636, + "grad_norm": 2.234375, + "grad_norm_var": 0.1039947509765625, + "learning_rate": 0.0001, + "loss": 4.5696, + "loss/crossentropy": 2.3409098386764526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24670831114053726, + "step": 10318 + }, + { + "epoch": 0.2064, + "grad_norm": 2.09375, + "grad_norm_var": 0.10114313761393229, + "learning_rate": 0.0001, + "loss": 4.4153, + "loss/crossentropy": 2.233125150203705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21548831462860107, + "step": 10320 + }, + { + "epoch": 0.20644, + "grad_norm": 2.0625, + "grad_norm_var": 0.1031206766764323, + "learning_rate": 0.0001, + "loss": 4.0617, + "loss/crossentropy": 2.131038188934326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20244313776493073, + "step": 10322 + }, + { + "epoch": 0.20648, + "grad_norm": 2.03125, + "grad_norm_var": 0.10423965454101562, + "learning_rate": 0.0001, + "loss": 4.3428, + "loss/crossentropy": 2.1683152318000793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21852879226207733, + "step": 10324 + }, + { + "epoch": 0.20652, + "grad_norm": 2.09375, + "grad_norm_var": 0.10465672810872396, + "learning_rate": 0.0001, + "loss": 4.4117, + "loss/crossentropy": 2.2986634969711304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24344030022621155, + "step": 10326 + }, + { + "epoch": 0.20656, + "grad_norm": 1.9140625, + "grad_norm_var": 0.10876057942708334, + "learning_rate": 0.0001, + "loss": 4.063, + "loss/crossentropy": 2.2356297969818115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21090354025363922, + "step": 10328 + }, + { + "epoch": 0.2066, + "grad_norm": 2.28125, + "grad_norm_var": 0.10851949055989583, + "learning_rate": 0.0001, + "loss": 4.4235, + "loss/crossentropy": 2.6431000232696533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26503315567970276, + "step": 10330 + }, + { + "epoch": 0.20664, + "grad_norm": 2.125, + "grad_norm_var": 0.10668919881184896, + "learning_rate": 0.0001, + "loss": 4.2277, + "loss/crossentropy": 2.0261669754981995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21528497338294983, + "step": 10332 + }, + { + "epoch": 0.20668, + "grad_norm": 2.15625, + "grad_norm_var": 0.007755279541015625, + "learning_rate": 0.0001, + "loss": 4.1092, + "loss/crossentropy": 1.5593605041503906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18069667369127274, + "step": 10334 + }, + { + "epoch": 0.20672, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0087646484375, + "learning_rate": 0.0001, + "loss": 4.1836, + "loss/crossentropy": 1.953243374824524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19982123374938965, + "step": 10336 + }, + { + "epoch": 0.20676, + "grad_norm": 2.09375, + "grad_norm_var": 0.0088531494140625, + "learning_rate": 0.0001, + "loss": 4.4369, + "loss/crossentropy": 2.067806303501129, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2209654077887535, + "step": 10338 + }, + { + "epoch": 0.2068, + "grad_norm": 2.0625, + "grad_norm_var": 0.0088043212890625, + "learning_rate": 0.0001, + "loss": 4.2458, + "loss/crossentropy": 1.9948397874832153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21432363241910934, + "step": 10340 + }, + { + "epoch": 0.20684, + "grad_norm": 2.046875, + "grad_norm_var": 0.010231272379557291, + "learning_rate": 0.0001, + "loss": 3.7643, + "loss/crossentropy": 1.7932568788528442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19161275029182434, + "step": 10342 + }, + { + "epoch": 0.20688, + "grad_norm": 2.3125, + "grad_norm_var": 0.0162017822265625, + "learning_rate": 0.0001, + "loss": 4.5325, + "loss/crossentropy": 2.021036922931671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21150880306959152, + "step": 10344 + }, + { + "epoch": 0.20692, + "grad_norm": 1.9453125, + "grad_norm_var": 0.014623769124348958, + "learning_rate": 0.0001, + "loss": 3.9351, + "loss/crossentropy": 2.0004186630249023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18971875309944153, + "step": 10346 + }, + { + "epoch": 0.20696, + "grad_norm": 1.984375, + "grad_norm_var": 0.014898427327473958, + "learning_rate": 0.0001, + "loss": 4.1234, + "loss/crossentropy": 2.2949434518814087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2356676608324051, + "step": 10348 + }, + { + "epoch": 0.207, + "grad_norm": 2.15625, + "grad_norm_var": 0.013108062744140624, + "learning_rate": 0.0001, + "loss": 4.1755, + "loss/crossentropy": 2.161319613456726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24161628633737564, + "step": 10350 + }, + { + "epoch": 0.20704, + "grad_norm": 1.921875, + "grad_norm_var": 0.013285319010416666, + "learning_rate": 0.0001, + "loss": 4.1432, + "loss/crossentropy": 2.027602195739746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19196761399507523, + "step": 10352 + }, + { + "epoch": 0.20708, + "grad_norm": 2.109375, + "grad_norm_var": 0.013703409830729167, + "learning_rate": 0.0001, + "loss": 4.3658, + "loss/crossentropy": 2.163583278656006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21044812351465225, + "step": 10354 + }, + { + "epoch": 0.20712, + "grad_norm": 1.9921875, + "grad_norm_var": 0.014045969645182291, + "learning_rate": 0.0001, + "loss": 4.0419, + "loss/crossentropy": 2.055150866508484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21587203443050385, + "step": 10356 + }, + { + "epoch": 0.20716, + "grad_norm": 2.4375, + "grad_norm_var": 0.0192047119140625, + "learning_rate": 0.0001, + "loss": 4.2947, + "loss/crossentropy": 2.368631362915039, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2406565323472023, + "step": 10358 + }, + { + "epoch": 0.2072, + "grad_norm": 2.234375, + "grad_norm_var": 0.016486612955729167, + "learning_rate": 0.0001, + "loss": 4.6473, + "loss/crossentropy": 2.5399086475372314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2554238885641098, + "step": 10360 + }, + { + "epoch": 0.20724, + "grad_norm": 2.046875, + "grad_norm_var": 0.015366363525390624, + "learning_rate": 0.0001, + "loss": 4.2346, + "loss/crossentropy": 2.0829185843467712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2278829663991928, + "step": 10362 + }, + { + "epoch": 0.20728, + "grad_norm": 2.078125, + "grad_norm_var": 0.014371490478515625, + "learning_rate": 0.0001, + "loss": 4.2583, + "loss/crossentropy": 1.9829052090644836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19677383452653885, + "step": 10364 + }, + { + "epoch": 0.20732, + "grad_norm": 2.0625, + "grad_norm_var": 0.014385732014973958, + "learning_rate": 0.0001, + "loss": 4.3801, + "loss/crossentropy": 2.2335458993911743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23407185822725296, + "step": 10366 + }, + { + "epoch": 0.20736, + "grad_norm": 2.046875, + "grad_norm_var": 0.012149810791015625, + "learning_rate": 0.0001, + "loss": 4.2074, + "loss/crossentropy": 1.824280858039856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.203867107629776, + "step": 10368 + }, + { + "epoch": 0.2074, + "grad_norm": 1.9765625, + "grad_norm_var": 0.013898722330729167, + "learning_rate": 0.0001, + "loss": 4.0666, + "loss/crossentropy": 2.1007773876190186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20312541723251343, + "step": 10370 + }, + { + "epoch": 0.20744, + "grad_norm": 2.109375, + "grad_norm_var": 0.013350168863932291, + "learning_rate": 0.0001, + "loss": 4.3692, + "loss/crossentropy": 1.953888475894928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20697196573019028, + "step": 10372 + }, + { + "epoch": 0.20748, + "grad_norm": 2.28125, + "grad_norm_var": 0.03509089152018229, + "learning_rate": 0.0001, + "loss": 4.2786, + "loss/crossentropy": 2.186478853225708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24410755187273026, + "step": 10374 + }, + { + "epoch": 0.20752, + "grad_norm": 2.34375, + "grad_norm_var": 0.03706232706705729, + "learning_rate": 0.0001, + "loss": 3.9974, + "loss/crossentropy": 1.9011998772621155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1983819529414177, + "step": 10376 + }, + { + "epoch": 0.20756, + "grad_norm": 2.03125, + "grad_norm_var": 0.03695246378580729, + "learning_rate": 0.0001, + "loss": 4.5884, + "loss/crossentropy": 2.599787950515747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25155550241470337, + "step": 10378 + }, + { + "epoch": 0.2076, + "grad_norm": 2.078125, + "grad_norm_var": 0.037021636962890625, + "learning_rate": 0.0001, + "loss": 4.2702, + "loss/crossentropy": 2.0120421648025513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21281170845031738, + "step": 10380 + }, + { + "epoch": 0.20764, + "grad_norm": 2.09375, + "grad_norm_var": 0.036710357666015624, + "learning_rate": 0.0001, + "loss": 4.4915, + "loss/crossentropy": 2.0685967803001404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23026303946971893, + "step": 10382 + }, + { + "epoch": 0.20768, + "grad_norm": 2.125, + "grad_norm_var": 0.0445068359375, + "learning_rate": 0.0001, + "loss": 4.1171, + "loss/crossentropy": 1.8392394185066223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19240978360176086, + "step": 10384 + }, + { + "epoch": 0.20772, + "grad_norm": 1.9453125, + "grad_norm_var": 0.04523518880208333, + "learning_rate": 0.0001, + "loss": 4.2428, + "loss/crossentropy": 1.786954402923584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20547360181808472, + "step": 10386 + }, + { + "epoch": 0.20776, + "grad_norm": 2.234375, + "grad_norm_var": 0.0452301025390625, + "learning_rate": 0.0001, + "loss": 4.2084, + "loss/crossentropy": 1.6960806250572205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1748766005039215, + "step": 10388 + }, + { + "epoch": 0.2078, + "grad_norm": 2.3125, + "grad_norm_var": 0.018693033854166666, + "learning_rate": 0.0001, + "loss": 4.2491, + "loss/crossentropy": 2.271879196166992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25173740088939667, + "step": 10390 + }, + { + "epoch": 0.20784, + "grad_norm": 2.109375, + "grad_norm_var": 0.015778605143229166, + "learning_rate": 0.0001, + "loss": 4.5199, + "loss/crossentropy": 2.2860567569732666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23291928321123123, + "step": 10392 + }, + { + "epoch": 0.20788, + "grad_norm": 2.046875, + "grad_norm_var": 0.0148193359375, + "learning_rate": 0.0001, + "loss": 4.1871, + "loss/crossentropy": 1.9925037026405334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.212161086499691, + "step": 10394 + }, + { + "epoch": 0.20792, + "grad_norm": 2.140625, + "grad_norm_var": 0.016306304931640626, + "learning_rate": 0.0001, + "loss": 4.3728, + "loss/crossentropy": 2.1189831495285034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19486143440008163, + "step": 10396 + }, + { + "epoch": 0.20796, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0168121337890625, + "learning_rate": 0.0001, + "loss": 4.2325, + "loss/crossentropy": 2.170132279396057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23083829134702682, + "step": 10398 + }, + { + "epoch": 0.208, + "grad_norm": 2.109375, + "grad_norm_var": 0.012129465738932291, + "learning_rate": 0.0001, + "loss": 4.0773, + "loss/crossentropy": 1.8486035466194153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18999971449375153, + "step": 10400 + }, + { + "epoch": 0.20804, + "grad_norm": 1.9765625, + "grad_norm_var": 0.011240386962890625, + "learning_rate": 0.0001, + "loss": 4.138, + "loss/crossentropy": 1.727788269519806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19234652817249298, + "step": 10402 + }, + { + "epoch": 0.20808, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01046142578125, + "learning_rate": 0.0001, + "loss": 4.4102, + "loss/crossentropy": 2.196265935897827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21071433275938034, + "step": 10404 + }, + { + "epoch": 0.20812, + "grad_norm": 2.15625, + "grad_norm_var": 0.007111612955729167, + "learning_rate": 0.0001, + "loss": 4.3378, + "loss/crossentropy": 1.7230273485183716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19570616632699966, + "step": 10406 + }, + { + "epoch": 0.20816, + "grad_norm": 2.109375, + "grad_norm_var": 0.00567626953125, + "learning_rate": 0.0001, + "loss": 3.9693, + "loss/crossentropy": 1.8815646767616272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2055530995130539, + "step": 10408 + }, + { + "epoch": 0.2082, + "grad_norm": 2.015625, + "grad_norm_var": 0.006400299072265625, + "learning_rate": 0.0001, + "loss": 4.1974, + "loss/crossentropy": 2.2131329774856567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22710958123207092, + "step": 10410 + }, + { + "epoch": 0.20824, + "grad_norm": 2.046875, + "grad_norm_var": 0.006103515625, + "learning_rate": 0.0001, + "loss": 4.1904, + "loss/crossentropy": 1.5199981927871704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16881641000509262, + "step": 10412 + }, + { + "epoch": 0.20828, + "grad_norm": 2.0625, + "grad_norm_var": 0.005356597900390625, + "learning_rate": 0.0001, + "loss": 4.2456, + "loss/crossentropy": 2.305867075920105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2523300349712372, + "step": 10414 + }, + { + "epoch": 0.20832, + "grad_norm": 2.0625, + "grad_norm_var": 0.004571278889973958, + "learning_rate": 0.0001, + "loss": 4.2727, + "loss/crossentropy": 1.989980161190033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19124356657266617, + "step": 10416 + }, + { + "epoch": 0.20836, + "grad_norm": 2.046875, + "grad_norm_var": 0.004255167643229167, + "learning_rate": 0.0001, + "loss": 4.4745, + "loss/crossentropy": 2.165328025817871, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23673278093338013, + "step": 10418 + }, + { + "epoch": 0.2084, + "grad_norm": 2.328125, + "grad_norm_var": 0.008074696858723958, + "learning_rate": 0.0001, + "loss": 4.5429, + "loss/crossentropy": 2.2451056241989136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23130114376544952, + "step": 10420 + }, + { + "epoch": 0.20844, + "grad_norm": 1.953125, + "grad_norm_var": 0.0106689453125, + "learning_rate": 0.0001, + "loss": 3.9259, + "loss/crossentropy": 2.1694064140319824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21424317359924316, + "step": 10422 + }, + { + "epoch": 0.20848, + "grad_norm": 2.1875, + "grad_norm_var": 0.012059529622395834, + "learning_rate": 0.0001, + "loss": 4.3558, + "loss/crossentropy": 2.140601873397827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22452331334352493, + "step": 10424 + }, + { + "epoch": 0.20852, + "grad_norm": 2.125, + "grad_norm_var": 0.011476389567057292, + "learning_rate": 0.0001, + "loss": 4.3058, + "loss/crossentropy": 2.2076770067214966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21608063578605652, + "step": 10426 + }, + { + "epoch": 0.20856, + "grad_norm": 2.125, + "grad_norm_var": 0.011579386393229167, + "learning_rate": 0.0001, + "loss": 4.151, + "loss/crossentropy": 2.1738568544387817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2288571298122406, + "step": 10428 + }, + { + "epoch": 0.2086, + "grad_norm": 2.015625, + "grad_norm_var": 0.012165323893229166, + "learning_rate": 0.0001, + "loss": 4.2576, + "loss/crossentropy": 2.2586612701416016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22107571363449097, + "step": 10430 + }, + { + "epoch": 0.20864, + "grad_norm": 1.9609375, + "grad_norm_var": 0.014511871337890624, + "learning_rate": 0.0001, + "loss": 4.5441, + "loss/crossentropy": 2.336306095123291, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23462744057178497, + "step": 10432 + }, + { + "epoch": 0.20868, + "grad_norm": 2.046875, + "grad_norm_var": 0.0148590087890625, + "learning_rate": 0.0001, + "loss": 4.1625, + "loss/crossentropy": 2.3164994716644287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21541497856378555, + "step": 10434 + }, + { + "epoch": 0.20872, + "grad_norm": 2.046875, + "grad_norm_var": 0.0121734619140625, + "learning_rate": 0.0001, + "loss": 4.2497, + "loss/crossentropy": 1.848636507987976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20993927866220474, + "step": 10436 + }, + { + "epoch": 0.20876, + "grad_norm": 2.25, + "grad_norm_var": 0.010509999593098958, + "learning_rate": 0.0001, + "loss": 4.4985, + "loss/crossentropy": 2.234964370727539, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21675845235586166, + "step": 10438 + }, + { + "epoch": 0.2088, + "grad_norm": 2.125, + "grad_norm_var": 0.009159088134765625, + "learning_rate": 0.0001, + "loss": 4.2389, + "loss/crossentropy": 1.9301238656044006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21708600223064423, + "step": 10440 + }, + { + "epoch": 0.20884, + "grad_norm": 2.171875, + "grad_norm_var": 0.010721842447916666, + "learning_rate": 0.0001, + "loss": 4.1706, + "loss/crossentropy": 2.231620192527771, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23156532645225525, + "step": 10442 + }, + { + "epoch": 0.20888, + "grad_norm": 2.015625, + "grad_norm_var": 0.010237375895182291, + "learning_rate": 0.0001, + "loss": 4.18, + "loss/crossentropy": 1.8612747192382812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1914522647857666, + "step": 10444 + }, + { + "epoch": 0.20892, + "grad_norm": 2.109375, + "grad_norm_var": 0.009907786051432292, + "learning_rate": 0.0001, + "loss": 4.1713, + "loss/crossentropy": 2.2229456305503845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23199696838855743, + "step": 10446 + }, + { + "epoch": 0.20896, + "grad_norm": 2.046875, + "grad_norm_var": 0.009708658854166666, + "learning_rate": 0.0001, + "loss": 4.0556, + "loss/crossentropy": 1.9055940508842468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2158679962158203, + "step": 10448 + }, + { + "epoch": 0.209, + "grad_norm": 1.8984375, + "grad_norm_var": 0.011774698893229166, + "learning_rate": 0.0001, + "loss": 3.8876, + "loss/crossentropy": 1.5537404417991638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17176809161901474, + "step": 10450 + }, + { + "epoch": 0.20904, + "grad_norm": 1.890625, + "grad_norm_var": 0.0123931884765625, + "learning_rate": 0.0001, + "loss": 3.9222, + "loss/crossentropy": 2.319318413734436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22001250088214874, + "step": 10452 + }, + { + "epoch": 0.20908, + "grad_norm": 2.140625, + "grad_norm_var": 0.010383097330729167, + "learning_rate": 0.0001, + "loss": 4.421, + "loss/crossentropy": 2.2334396839141846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.222968190908432, + "step": 10454 + }, + { + "epoch": 0.20912, + "grad_norm": 2.109375, + "grad_norm_var": 0.0110260009765625, + "learning_rate": 0.0001, + "loss": 4.5172, + "loss/crossentropy": 2.5762773752212524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26605312526226044, + "step": 10456 + }, + { + "epoch": 0.20916, + "grad_norm": 2.0, + "grad_norm_var": 0.010503896077473958, + "learning_rate": 0.0001, + "loss": 4.1109, + "loss/crossentropy": 1.7634761333465576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18641388416290283, + "step": 10458 + }, + { + "epoch": 0.2092, + "grad_norm": 1.875, + "grad_norm_var": 0.013224029541015625, + "learning_rate": 0.0001, + "loss": 4.126, + "loss/crossentropy": 2.28191876411438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22916647791862488, + "step": 10460 + }, + { + "epoch": 0.20924, + "grad_norm": 2.125, + "grad_norm_var": 0.013903554280598958, + "learning_rate": 0.0001, + "loss": 4.1984, + "loss/crossentropy": 1.9712265729904175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2293689101934433, + "step": 10462 + }, + { + "epoch": 0.20928, + "grad_norm": 2.109375, + "grad_norm_var": 0.012292226155598959, + "learning_rate": 0.0001, + "loss": 4.3826, + "loss/crossentropy": 2.425857424736023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.255269430577755, + "step": 10464 + }, + { + "epoch": 0.20932, + "grad_norm": 2.109375, + "grad_norm_var": 0.011031087239583333, + "learning_rate": 0.0001, + "loss": 4.1919, + "loss/crossentropy": 2.0697131752967834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22233501076698303, + "step": 10466 + }, + { + "epoch": 0.20936, + "grad_norm": 2.140625, + "grad_norm_var": 0.008854166666666666, + "learning_rate": 0.0001, + "loss": 4.4017, + "loss/crossentropy": 1.8367178440093994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2063748985528946, + "step": 10468 + }, + { + "epoch": 0.2094, + "grad_norm": 2.046875, + "grad_norm_var": 0.018192545572916666, + "learning_rate": 0.0001, + "loss": 4.1864, + "loss/crossentropy": 1.977232813835144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20568673312664032, + "step": 10470 + }, + { + "epoch": 0.20944, + "grad_norm": 2.0, + "grad_norm_var": 0.018195597330729167, + "learning_rate": 0.0001, + "loss": 4.1622, + "loss/crossentropy": 2.1934465169906616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2175101339817047, + "step": 10472 + }, + { + "epoch": 0.20948, + "grad_norm": 2.09375, + "grad_norm_var": 0.016047159830729168, + "learning_rate": 0.0001, + "loss": 4.3005, + "loss/crossentropy": 1.7997339367866516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20166774094104767, + "step": 10474 + }, + { + "epoch": 0.20952, + "grad_norm": 2.140625, + "grad_norm_var": 0.014671834309895833, + "learning_rate": 0.0001, + "loss": 4.1167, + "loss/crossentropy": 1.9334582090377808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19190441817045212, + "step": 10476 + }, + { + "epoch": 0.20956, + "grad_norm": 2.109375, + "grad_norm_var": 0.015148671468098958, + "learning_rate": 0.0001, + "loss": 3.9434, + "loss/crossentropy": 1.7263885140419006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17968116700649261, + "step": 10478 + }, + { + "epoch": 0.2096, + "grad_norm": 1.9375, + "grad_norm_var": 0.01661961873372396, + "learning_rate": 0.0001, + "loss": 4.1153, + "loss/crossentropy": 2.1710296869277954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19717055559158325, + "step": 10480 + }, + { + "epoch": 0.20964, + "grad_norm": 2.171875, + "grad_norm_var": 0.016947174072265626, + "learning_rate": 0.0001, + "loss": 4.2089, + "loss/crossentropy": 2.043896973133087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21403591334819794, + "step": 10482 + }, + { + "epoch": 0.20968, + "grad_norm": 2.203125, + "grad_norm_var": 0.02459691365559896, + "learning_rate": 0.0001, + "loss": 3.9237, + "loss/crossentropy": 1.7813313603401184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20273292809724808, + "step": 10484 + }, + { + "epoch": 0.20972, + "grad_norm": 2.0625, + "grad_norm_var": 0.013155110677083333, + "learning_rate": 0.0001, + "loss": 4.0985, + "loss/crossentropy": 1.8628470301628113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1945454627275467, + "step": 10486 + }, + { + "epoch": 0.20976, + "grad_norm": 2.03125, + "grad_norm_var": 0.013044230143229167, + "learning_rate": 0.0001, + "loss": 4.2279, + "loss/crossentropy": 2.2047882080078125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23798923939466476, + "step": 10488 + }, + { + "epoch": 0.2098, + "grad_norm": 1.9609375, + "grad_norm_var": 0.012697092692057292, + "learning_rate": 0.0001, + "loss": 4.1327, + "loss/crossentropy": 1.8838441967964172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2224583625793457, + "step": 10490 + }, + { + "epoch": 0.20984, + "grad_norm": 2.015625, + "grad_norm_var": 0.011201731363932292, + "learning_rate": 0.0001, + "loss": 4.1985, + "loss/crossentropy": 1.9326539039611816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2116800993680954, + "step": 10492 + }, + { + "epoch": 0.20988, + "grad_norm": 2.375, + "grad_norm_var": 0.017829386393229167, + "learning_rate": 0.0001, + "loss": 4.2023, + "loss/crossentropy": 1.7767577171325684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20496132969856262, + "step": 10494 + }, + { + "epoch": 0.20992, + "grad_norm": 2.015625, + "grad_norm_var": 0.016996256510416665, + "learning_rate": 0.0001, + "loss": 4.1665, + "loss/crossentropy": 1.996176838874817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1979241669178009, + "step": 10496 + }, + { + "epoch": 0.20996, + "grad_norm": 2.015625, + "grad_norm_var": 0.016136678059895833, + "learning_rate": 0.0001, + "loss": 4.1467, + "loss/crossentropy": 2.5282262563705444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23003733158111572, + "step": 10498 + }, + { + "epoch": 0.21, + "grad_norm": 2.109375, + "grad_norm_var": 0.008760579427083333, + "learning_rate": 0.0001, + "loss": 4.1981, + "loss/crossentropy": 2.3120675086975098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22543538361787796, + "step": 10500 + }, + { + "epoch": 0.21004, + "grad_norm": 2.15625, + "grad_norm_var": 0.008766428629557291, + "learning_rate": 0.0001, + "loss": 4.1216, + "loss/crossentropy": 2.2110280990600586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23066550493240356, + "step": 10502 + }, + { + "epoch": 0.21008, + "grad_norm": 2.015625, + "grad_norm_var": 0.009242502848307292, + "learning_rate": 0.0001, + "loss": 4.0804, + "loss/crossentropy": 1.6792908906936646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20689093321561813, + "step": 10504 + }, + { + "epoch": 0.21012, + "grad_norm": 2.296875, + "grad_norm_var": 0.011116536458333333, + "learning_rate": 0.0001, + "loss": 4.137, + "loss/crossentropy": 1.9797767400741577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2185518741607666, + "step": 10506 + }, + { + "epoch": 0.21016, + "grad_norm": 2.046875, + "grad_norm_var": 0.011533355712890625, + "learning_rate": 0.0001, + "loss": 4.0243, + "loss/crossentropy": 2.0001984238624573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22058366239070892, + "step": 10508 + }, + { + "epoch": 0.2102, + "grad_norm": 2.125, + "grad_norm_var": 0.006414540608723958, + "learning_rate": 0.0001, + "loss": 4.3707, + "loss/crossentropy": 2.1622806787490845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21733585745096207, + "step": 10510 + }, + { + "epoch": 0.21024, + "grad_norm": 2.03125, + "grad_norm_var": 0.00740966796875, + "learning_rate": 0.0001, + "loss": 4.0392, + "loss/crossentropy": 2.0224735736846924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20596059411764145, + "step": 10512 + }, + { + "epoch": 0.21028, + "grad_norm": 1.8359375, + "grad_norm_var": 0.011244455973307291, + "learning_rate": 0.0001, + "loss": 4.0712, + "loss/crossentropy": 2.099945902824402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19777391105890274, + "step": 10514 + }, + { + "epoch": 0.21032, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011823527018229167, + "learning_rate": 0.0001, + "loss": 4.1246, + "loss/crossentropy": 1.9806578159332275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19959519058465958, + "step": 10516 + }, + { + "epoch": 0.21036, + "grad_norm": 2.140625, + "grad_norm_var": 0.012239329020182292, + "learning_rate": 0.0001, + "loss": 4.1182, + "loss/crossentropy": 2.0056468844413757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21810457110404968, + "step": 10518 + }, + { + "epoch": 0.2104, + "grad_norm": 2.171875, + "grad_norm_var": 0.01785456339518229, + "learning_rate": 0.0001, + "loss": 4.1085, + "loss/crossentropy": 1.9525137543678284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19195258617401123, + "step": 10520 + }, + { + "epoch": 0.21044, + "grad_norm": 2.15625, + "grad_norm_var": 0.01580988566080729, + "learning_rate": 0.0001, + "loss": 4.4648, + "loss/crossentropy": 2.2391252517700195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23115740716457367, + "step": 10522 + }, + { + "epoch": 0.21048, + "grad_norm": 2.203125, + "grad_norm_var": 0.016727701822916666, + "learning_rate": 0.0001, + "loss": 4.6467, + "loss/crossentropy": 2.550819158554077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2846238762140274, + "step": 10524 + }, + { + "epoch": 0.21052, + "grad_norm": 2.234375, + "grad_norm_var": 0.0176177978515625, + "learning_rate": 0.0001, + "loss": 4.2787, + "loss/crossentropy": 2.181081712245941, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23952369391918182, + "step": 10526 + }, + { + "epoch": 0.21056, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01784032185872396, + "learning_rate": 0.0001, + "loss": 4.1627, + "loss/crossentropy": 2.3070446848869324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21255015581846237, + "step": 10528 + }, + { + "epoch": 0.2106, + "grad_norm": 2.140625, + "grad_norm_var": 0.014235178629557291, + "learning_rate": 0.0001, + "loss": 4.3184, + "loss/crossentropy": 1.9560331106185913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19652864336967468, + "step": 10530 + }, + { + "epoch": 0.21064, + "grad_norm": 2.203125, + "grad_norm_var": 0.0127593994140625, + "learning_rate": 0.0001, + "loss": 4.5728, + "loss/crossentropy": 2.2470709085464478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25076867640018463, + "step": 10532 + }, + { + "epoch": 0.21068, + "grad_norm": 2.03125, + "grad_norm_var": 0.011767323811848958, + "learning_rate": 0.0001, + "loss": 4.2652, + "loss/crossentropy": 2.0689820051193237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21815705299377441, + "step": 10534 + }, + { + "epoch": 0.21072, + "grad_norm": 1.8671875, + "grad_norm_var": 0.013499959309895834, + "learning_rate": 0.0001, + "loss": 3.9524, + "loss/crossentropy": 2.0633797645568848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20865381509065628, + "step": 10536 + }, + { + "epoch": 0.21076, + "grad_norm": 1.96875, + "grad_norm_var": 0.014029947916666667, + "learning_rate": 0.0001, + "loss": 4.2644, + "loss/crossentropy": 2.3332748413085938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21530094742774963, + "step": 10538 + }, + { + "epoch": 0.2108, + "grad_norm": 2.078125, + "grad_norm_var": 0.011942545572916666, + "learning_rate": 0.0001, + "loss": 4.1452, + "loss/crossentropy": 2.093048572540283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23568043112754822, + "step": 10540 + }, + { + "epoch": 0.21084, + "grad_norm": 2.125, + "grad_norm_var": 0.009992472330729167, + "learning_rate": 0.0001, + "loss": 4.5464, + "loss/crossentropy": 2.0059397220611572, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.253838449716568, + "step": 10542 + }, + { + "epoch": 0.21088, + "grad_norm": 2.0625, + "grad_norm_var": 0.009284464518229167, + "learning_rate": 0.0001, + "loss": 4.1417, + "loss/crossentropy": 2.278248429298401, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23320979624986649, + "step": 10544 + }, + { + "epoch": 0.21092, + "grad_norm": 2.0625, + "grad_norm_var": 0.008540598551432292, + "learning_rate": 0.0001, + "loss": 4.0951, + "loss/crossentropy": 2.063527822494507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21290308982133865, + "step": 10546 + }, + { + "epoch": 0.21096, + "grad_norm": 1.953125, + "grad_norm_var": 0.007511138916015625, + "learning_rate": 0.0001, + "loss": 4.1456, + "loss/crossentropy": 2.092045545578003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21721098572015762, + "step": 10548 + }, + { + "epoch": 0.211, + "grad_norm": 2.078125, + "grad_norm_var": 0.06824111938476562, + "learning_rate": 0.0001, + "loss": 4.0586, + "loss/crossentropy": 1.9876770973205566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21775592118501663, + "step": 10550 + }, + { + "epoch": 0.21104, + "grad_norm": 2.0625, + "grad_norm_var": 0.06347249348958334, + "learning_rate": 0.0001, + "loss": 4.2372, + "loss/crossentropy": 2.292428970336914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24829304218292236, + "step": 10552 + }, + { + "epoch": 0.21108, + "grad_norm": 2.015625, + "grad_norm_var": 0.06552632649739583, + "learning_rate": 0.0001, + "loss": 4.3412, + "loss/crossentropy": 2.257239043712616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2195405438542366, + "step": 10554 + }, + { + "epoch": 0.21112, + "grad_norm": 2.171875, + "grad_norm_var": 0.06616923014322916, + "learning_rate": 0.0001, + "loss": 4.404, + "loss/crossentropy": 2.1424754858016968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21044345945119858, + "step": 10556 + }, + { + "epoch": 0.21116, + "grad_norm": 2.15625, + "grad_norm_var": 0.06642964680989584, + "learning_rate": 0.0001, + "loss": 4.2812, + "loss/crossentropy": 1.777747094631195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19684316962957382, + "step": 10558 + }, + { + "epoch": 0.2112, + "grad_norm": 2.015625, + "grad_norm_var": 0.06655171712239584, + "learning_rate": 0.0001, + "loss": 4.1239, + "loss/crossentropy": 2.0860772728919983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20560266077518463, + "step": 10560 + }, + { + "epoch": 0.21124, + "grad_norm": 1.96875, + "grad_norm_var": 0.06787007649739583, + "learning_rate": 0.0001, + "loss": 4.2409, + "loss/crossentropy": 1.9594369530677795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2015877440571785, + "step": 10562 + }, + { + "epoch": 0.21128, + "grad_norm": 2.078125, + "grad_norm_var": 0.06467692057291667, + "learning_rate": 0.0001, + "loss": 4.2539, + "loss/crossentropy": 2.069046676158905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21409741044044495, + "step": 10564 + }, + { + "epoch": 0.21132, + "grad_norm": 1.921875, + "grad_norm_var": 0.009464263916015625, + "learning_rate": 0.0001, + "loss": 3.822, + "loss/crossentropy": 1.7300589084625244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18024658411741257, + "step": 10566 + }, + { + "epoch": 0.21136, + "grad_norm": 2.125, + "grad_norm_var": 0.0077288309733072914, + "learning_rate": 0.0001, + "loss": 4.2608, + "loss/crossentropy": 2.048615336418152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.228649340569973, + "step": 10568 + }, + { + "epoch": 0.2114, + "grad_norm": 2.171875, + "grad_norm_var": 0.007085927327473958, + "learning_rate": 0.0001, + "loss": 4.4622, + "loss/crossentropy": 1.9678268432617188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20117972791194916, + "step": 10570 + }, + { + "epoch": 0.21144, + "grad_norm": 2.1875, + "grad_norm_var": 0.006534576416015625, + "learning_rate": 0.0001, + "loss": 4.3919, + "loss/crossentropy": 2.595113754272461, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24565115571022034, + "step": 10572 + }, + { + "epoch": 0.21148, + "grad_norm": 2.25, + "grad_norm_var": 0.008658599853515626, + "learning_rate": 0.0001, + "loss": 4.0873, + "loss/crossentropy": 2.096100628376007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20913005620241165, + "step": 10574 + }, + { + "epoch": 0.21152, + "grad_norm": 2.25, + "grad_norm_var": 0.012223052978515624, + "learning_rate": 0.0001, + "loss": 4.181, + "loss/crossentropy": 2.0096259713172913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21125900745391846, + "step": 10576 + }, + { + "epoch": 0.21156, + "grad_norm": 2.046875, + "grad_norm_var": 0.011572011311848958, + "learning_rate": 0.0001, + "loss": 3.9738, + "loss/crossentropy": 1.9990533590316772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20894166082143784, + "step": 10578 + }, + { + "epoch": 0.2116, + "grad_norm": 2.0625, + "grad_norm_var": 0.013606516520182292, + "learning_rate": 0.0001, + "loss": 4.0404, + "loss/crossentropy": 2.1385116577148438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19753456860780716, + "step": 10580 + }, + { + "epoch": 0.21164, + "grad_norm": 2.328125, + "grad_norm_var": 0.012482706705729167, + "learning_rate": 0.0001, + "loss": 4.5056, + "loss/crossentropy": 2.2081239819526672, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23155531287193298, + "step": 10582 + }, + { + "epoch": 0.21168, + "grad_norm": 2.125, + "grad_norm_var": 0.014339192708333334, + "learning_rate": 0.0001, + "loss": 4.161, + "loss/crossentropy": 1.6915860772132874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18060292303562164, + "step": 10584 + }, + { + "epoch": 0.21172, + "grad_norm": 1.875, + "grad_norm_var": 0.019245402018229166, + "learning_rate": 0.0001, + "loss": 4.083, + "loss/crossentropy": 1.5825872421264648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19387374818325043, + "step": 10586 + }, + { + "epoch": 0.21176, + "grad_norm": 2.109375, + "grad_norm_var": 0.0253082275390625, + "learning_rate": 0.0001, + "loss": 4.6102, + "loss/crossentropy": 2.386876940727234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23594971746206284, + "step": 10588 + }, + { + "epoch": 0.2118, + "grad_norm": 1.8828125, + "grad_norm_var": 0.02759577433268229, + "learning_rate": 0.0001, + "loss": 4.106, + "loss/crossentropy": 2.202653169631958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22666631639003754, + "step": 10590 + }, + { + "epoch": 0.21184, + "grad_norm": 2.046875, + "grad_norm_var": 0.026041412353515626, + "learning_rate": 0.0001, + "loss": 4.2285, + "loss/crossentropy": 2.013135075569153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22155165672302246, + "step": 10592 + }, + { + "epoch": 0.21188, + "grad_norm": 2.25, + "grad_norm_var": 2.762861887613932, + "learning_rate": 0.0001, + "loss": 4.1353, + "loss/crossentropy": 1.6701499223709106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21808167546987534, + "step": 10594 + }, + { + "epoch": 0.21192, + "grad_norm": 2.015625, + "grad_norm_var": 2.7677996317545572, + "learning_rate": 0.0001, + "loss": 4.1438, + "loss/crossentropy": 2.074462592601776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22254129499197006, + "step": 10596 + }, + { + "epoch": 0.21196, + "grad_norm": 2.203125, + "grad_norm_var": 2.787275950113932, + "learning_rate": 0.0001, + "loss": 4.2329, + "loss/crossentropy": 2.182308316230774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21201200038194656, + "step": 10598 + }, + { + "epoch": 0.212, + "grad_norm": 2.0625, + "grad_norm_var": 2.784148915608724, + "learning_rate": 0.0001, + "loss": 4.0688, + "loss/crossentropy": 1.9777602553367615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21454624831676483, + "step": 10600 + }, + { + "epoch": 0.21204, + "grad_norm": 2.03125, + "grad_norm_var": 2.778930409749349, + "learning_rate": 0.0001, + "loss": 4.2707, + "loss/crossentropy": 2.272592306137085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22290532290935516, + "step": 10602 + }, + { + "epoch": 0.21208, + "grad_norm": 2.078125, + "grad_norm_var": 2.784010569254557, + "learning_rate": 0.0001, + "loss": 4.3406, + "loss/crossentropy": 2.110643744468689, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22552277147769928, + "step": 10604 + }, + { + "epoch": 0.21212, + "grad_norm": 2.125, + "grad_norm_var": 2.7753326416015627, + "learning_rate": 0.0001, + "loss": 4.3485, + "loss/crossentropy": 2.279823422431946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22356732934713364, + "step": 10606 + }, + { + "epoch": 0.21216, + "grad_norm": 2.109375, + "grad_norm_var": 2.768701171875, + "learning_rate": 0.0001, + "loss": 4.3203, + "loss/crossentropy": 2.0985517501831055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22894078493118286, + "step": 10608 + }, + { + "epoch": 0.2122, + "grad_norm": 2.046875, + "grad_norm_var": 0.006884765625, + "learning_rate": 0.0001, + "loss": 4.2295, + "loss/crossentropy": 2.251029133796692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24473578482866287, + "step": 10610 + }, + { + "epoch": 0.21224, + "grad_norm": 2.140625, + "grad_norm_var": 0.005785115559895833, + "learning_rate": 0.0001, + "loss": 4.2451, + "loss/crossentropy": 2.1706892251968384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21725767105817795, + "step": 10612 + }, + { + "epoch": 0.21228, + "grad_norm": 2.046875, + "grad_norm_var": 0.004378255208333333, + "learning_rate": 0.0001, + "loss": 4.3319, + "loss/crossentropy": 2.0709590315818787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22771050035953522, + "step": 10614 + }, + { + "epoch": 0.21232, + "grad_norm": 1.9609375, + "grad_norm_var": 0.005163319905598958, + "learning_rate": 0.0001, + "loss": 4.2507, + "loss/crossentropy": 2.0437510013580322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2055979147553444, + "step": 10616 + }, + { + "epoch": 0.21236, + "grad_norm": 2.140625, + "grad_norm_var": 0.005204010009765625, + "learning_rate": 0.0001, + "loss": 4.2492, + "loss/crossentropy": 2.023369252681732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23950359225273132, + "step": 10618 + }, + { + "epoch": 0.2124, + "grad_norm": 2.140625, + "grad_norm_var": 0.004325103759765625, + "learning_rate": 0.0001, + "loss": 4.3522, + "loss/crossentropy": 2.051850199699402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24145089089870453, + "step": 10620 + }, + { + "epoch": 0.21244, + "grad_norm": 2.015625, + "grad_norm_var": 0.0044247945149739586, + "learning_rate": 0.0001, + "loss": 4.1743, + "loss/crossentropy": 1.9617546796798706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2095404863357544, + "step": 10622 + }, + { + "epoch": 0.21248, + "grad_norm": 2.140625, + "grad_norm_var": 0.004662831624348958, + "learning_rate": 0.0001, + "loss": 4.2115, + "loss/crossentropy": 1.9792284965515137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21156969666481018, + "step": 10624 + }, + { + "epoch": 0.21252, + "grad_norm": 2.046875, + "grad_norm_var": 0.004662831624348958, + "learning_rate": 0.0001, + "loss": 4.4402, + "loss/crossentropy": 1.936375379562378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21360263973474503, + "step": 10626 + }, + { + "epoch": 0.21256, + "grad_norm": 1.859375, + "grad_norm_var": 0.0071408589680989586, + "learning_rate": 0.0001, + "loss": 4.0695, + "loss/crossentropy": 2.11286723613739, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20090097934007645, + "step": 10628 + }, + { + "epoch": 0.2126, + "grad_norm": 2.015625, + "grad_norm_var": 0.007120513916015625, + "learning_rate": 0.0001, + "loss": 4.2555, + "loss/crossentropy": 2.1020379066467285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21139420568943024, + "step": 10630 + }, + { + "epoch": 0.21264, + "grad_norm": 2.0625, + "grad_norm_var": 0.0064453125, + "learning_rate": 0.0001, + "loss": 4.2027, + "loss/crossentropy": 2.220509111881256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22286564111709595, + "step": 10632 + }, + { + "epoch": 0.21268, + "grad_norm": 1.890625, + "grad_norm_var": 0.0080078125, + "learning_rate": 0.0001, + "loss": 4.0012, + "loss/crossentropy": 2.058075189590454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22242067009210587, + "step": 10634 + }, + { + "epoch": 0.21272, + "grad_norm": 2.09375, + "grad_norm_var": 0.005269368489583333, + "learning_rate": 0.0001, + "loss": 4.1825, + "loss/crossentropy": 1.9641217589378357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20702090859413147, + "step": 10636 + }, + { + "epoch": 0.21276, + "grad_norm": 2.046875, + "grad_norm_var": 0.005231730143229167, + "learning_rate": 0.0001, + "loss": 4.1309, + "loss/crossentropy": 1.9234120845794678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20697829127311707, + "step": 10638 + }, + { + "epoch": 0.2128, + "grad_norm": 1.9375, + "grad_norm_var": 0.0067047119140625, + "learning_rate": 0.0001, + "loss": 4.2551, + "loss/crossentropy": 2.1050453782081604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21321691572666168, + "step": 10640 + }, + { + "epoch": 0.21284, + "grad_norm": 1.8515625, + "grad_norm_var": 0.009323883056640624, + "learning_rate": 0.0001, + "loss": 4.0852, + "loss/crossentropy": 2.21867573261261, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21468330919742584, + "step": 10642 + }, + { + "epoch": 0.21288, + "grad_norm": 2.09375, + "grad_norm_var": 0.007319895426432291, + "learning_rate": 0.0001, + "loss": 4.4209, + "loss/crossentropy": 2.213089942932129, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21172764152288437, + "step": 10644 + }, + { + "epoch": 0.21292, + "grad_norm": 2.109375, + "grad_norm_var": 0.012556711832682291, + "learning_rate": 0.0001, + "loss": 4.3261, + "loss/crossentropy": 2.0370752811431885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23427169024944305, + "step": 10646 + }, + { + "epoch": 0.21296, + "grad_norm": 2.109375, + "grad_norm_var": 0.016035715738932293, + "learning_rate": 0.0001, + "loss": 4.2622, + "loss/crossentropy": 2.080373227596283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2148713618516922, + "step": 10648 + }, + { + "epoch": 0.213, + "grad_norm": 2.078125, + "grad_norm_var": 0.014134724934895834, + "learning_rate": 0.0001, + "loss": 4.5408, + "loss/crossentropy": 2.3023130893707275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22768481075763702, + "step": 10650 + }, + { + "epoch": 0.21304, + "grad_norm": 2.203125, + "grad_norm_var": 0.016185506184895834, + "learning_rate": 0.0001, + "loss": 4.372, + "loss/crossentropy": 2.14642870426178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22463608533143997, + "step": 10652 + }, + { + "epoch": 0.21308, + "grad_norm": 2.03125, + "grad_norm_var": 0.016377766927083332, + "learning_rate": 0.0001, + "loss": 4.2164, + "loss/crossentropy": 2.2469639778137207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2505262568593025, + "step": 10654 + }, + { + "epoch": 0.21312, + "grad_norm": 2.0625, + "grad_norm_var": 0.015915679931640624, + "learning_rate": 0.0001, + "loss": 4.243, + "loss/crossentropy": 2.0431448221206665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21013544499874115, + "step": 10656 + }, + { + "epoch": 0.21316, + "grad_norm": 1.9375, + "grad_norm_var": 0.013631184895833334, + "learning_rate": 0.0001, + "loss": 3.9872, + "loss/crossentropy": 1.6768526434898376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18652133643627167, + "step": 10658 + }, + { + "epoch": 0.2132, + "grad_norm": 2.0, + "grad_norm_var": 0.017175038655598957, + "learning_rate": 0.0001, + "loss": 3.9442, + "loss/crossentropy": 1.3748261332511902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15284860879182816, + "step": 10660 + }, + { + "epoch": 0.21324, + "grad_norm": 2.03125, + "grad_norm_var": 0.012282053629557291, + "learning_rate": 0.0001, + "loss": 4.1451, + "loss/crossentropy": 1.9126732349395752, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21141140908002853, + "step": 10662 + }, + { + "epoch": 0.21328, + "grad_norm": 2.1875, + "grad_norm_var": 0.010908762613932291, + "learning_rate": 0.0001, + "loss": 4.2392, + "loss/crossentropy": 1.97357976436615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21326088905334473, + "step": 10664 + }, + { + "epoch": 0.21332, + "grad_norm": 2.109375, + "grad_norm_var": 0.011201985677083333, + "learning_rate": 0.0001, + "loss": 4.2825, + "loss/crossentropy": 2.1782814860343933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21183249354362488, + "step": 10666 + }, + { + "epoch": 0.21336, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009098307291666666, + "learning_rate": 0.0001, + "loss": 4.1251, + "loss/crossentropy": 2.1700649857521057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2310282066464424, + "step": 10668 + }, + { + "epoch": 0.2134, + "grad_norm": 2.109375, + "grad_norm_var": 0.0087554931640625, + "learning_rate": 0.0001, + "loss": 4.479, + "loss/crossentropy": 2.249666213989258, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21025578677654266, + "step": 10670 + }, + { + "epoch": 0.21344, + "grad_norm": 2.15625, + "grad_norm_var": 0.009287261962890625, + "learning_rate": 0.0001, + "loss": 4.3445, + "loss/crossentropy": 1.993752121925354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20829375833272934, + "step": 10672 + }, + { + "epoch": 0.21348, + "grad_norm": 2.046875, + "grad_norm_var": 0.009244791666666667, + "learning_rate": 0.0001, + "loss": 4.0814, + "loss/crossentropy": 2.0472013354301453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21456149220466614, + "step": 10674 + }, + { + "epoch": 0.21352, + "grad_norm": 2.109375, + "grad_norm_var": 0.007222239176432292, + "learning_rate": 0.0001, + "loss": 3.9497, + "loss/crossentropy": 2.0347819328308105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2133965790271759, + "step": 10676 + }, + { + "epoch": 0.21356, + "grad_norm": 1.984375, + "grad_norm_var": 0.008168284098307292, + "learning_rate": 0.0001, + "loss": 4.1528, + "loss/crossentropy": 1.86410254240036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22139018028974533, + "step": 10678 + }, + { + "epoch": 0.2136, + "grad_norm": 2.015625, + "grad_norm_var": 0.005641428629557291, + "learning_rate": 0.0001, + "loss": 4.3134, + "loss/crossentropy": 2.3244482278823853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24854417145252228, + "step": 10680 + }, + { + "epoch": 0.21364, + "grad_norm": 2.0, + "grad_norm_var": 0.006304677327473958, + "learning_rate": 0.0001, + "loss": 4.7182, + "loss/crossentropy": 2.518718123435974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.274397537112236, + "step": 10682 + }, + { + "epoch": 0.21368, + "grad_norm": 2.09375, + "grad_norm_var": 0.005928548177083334, + "learning_rate": 0.0001, + "loss": 4.1289, + "loss/crossentropy": 1.8111079931259155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2039540931582451, + "step": 10684 + }, + { + "epoch": 0.21372, + "grad_norm": 1.8828125, + "grad_norm_var": 0.008129628499348958, + "learning_rate": 0.0001, + "loss": 4.3446, + "loss/crossentropy": 2.186485230922699, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22497782111167908, + "step": 10686 + }, + { + "epoch": 0.21376, + "grad_norm": 2.21875, + "grad_norm_var": 0.009877268473307292, + "learning_rate": 0.0001, + "loss": 4.4708, + "loss/crossentropy": 2.1850993633270264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22541093826293945, + "step": 10688 + }, + { + "epoch": 0.2138, + "grad_norm": 2.078125, + "grad_norm_var": 0.0090576171875, + "learning_rate": 0.0001, + "loss": 4.3991, + "loss/crossentropy": 1.9756001830101013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23400776088237762, + "step": 10690 + }, + { + "epoch": 0.21384, + "grad_norm": 2.125, + "grad_norm_var": 0.010652669270833333, + "learning_rate": 0.0001, + "loss": 4.0283, + "loss/crossentropy": 1.7454423904418945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18263398110866547, + "step": 10692 + }, + { + "epoch": 0.21388, + "grad_norm": 2.015625, + "grad_norm_var": 0.010066731770833334, + "learning_rate": 0.0001, + "loss": 3.9364, + "loss/crossentropy": 1.5824024081230164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18881092965602875, + "step": 10694 + }, + { + "epoch": 0.21392, + "grad_norm": 2.15625, + "grad_norm_var": 0.009383138020833333, + "learning_rate": 0.0001, + "loss": 4.3587, + "loss/crossentropy": 2.1171644926071167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2173115462064743, + "step": 10696 + }, + { + "epoch": 0.21396, + "grad_norm": 2.0625, + "grad_norm_var": 0.008698527018229167, + "learning_rate": 0.0001, + "loss": 4.2102, + "loss/crossentropy": 1.9327389001846313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22025491297245026, + "step": 10698 + }, + { + "epoch": 0.214, + "grad_norm": 1.90625, + "grad_norm_var": 0.009952799479166666, + "learning_rate": 0.0001, + "loss": 3.9622, + "loss/crossentropy": 1.9806901216506958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2089216560125351, + "step": 10700 + }, + { + "epoch": 0.21404, + "grad_norm": 2.109375, + "grad_norm_var": 0.007503000895182291, + "learning_rate": 0.0001, + "loss": 4.2503, + "loss/crossentropy": 2.216805338859558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21268506348133087, + "step": 10702 + }, + { + "epoch": 0.21408, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0055084228515625, + "learning_rate": 0.0001, + "loss": 4.5121, + "loss/crossentropy": 2.3998383283615112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26092807948589325, + "step": 10704 + }, + { + "epoch": 0.21412, + "grad_norm": 2.171875, + "grad_norm_var": 0.007869466145833334, + "learning_rate": 0.0001, + "loss": 4.4273, + "loss/crossentropy": 2.0581844449043274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2316882163286209, + "step": 10706 + }, + { + "epoch": 0.21416, + "grad_norm": 2.09375, + "grad_norm_var": 0.006258138020833333, + "learning_rate": 0.0001, + "loss": 4.2684, + "loss/crossentropy": 2.3091371059417725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23538943380117416, + "step": 10708 + }, + { + "epoch": 0.2142, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0067291259765625, + "learning_rate": 0.0001, + "loss": 3.9567, + "loss/crossentropy": 1.7134324312210083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1941806674003601, + "step": 10710 + }, + { + "epoch": 0.21424, + "grad_norm": 2.078125, + "grad_norm_var": 0.0060699462890625, + "learning_rate": 0.0001, + "loss": 4.3165, + "loss/crossentropy": 2.2040648460388184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20669078081846237, + "step": 10712 + }, + { + "epoch": 0.21428, + "grad_norm": 2.25, + "grad_norm_var": 0.020612589518229165, + "learning_rate": 0.0001, + "loss": 4.3207, + "loss/crossentropy": 2.3040376901626587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20685256272554398, + "step": 10714 + }, + { + "epoch": 0.21432, + "grad_norm": 2.328125, + "grad_norm_var": 0.020318349202473957, + "learning_rate": 0.0001, + "loss": 4.2674, + "loss/crossentropy": 2.189309239387512, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21499691903591156, + "step": 10716 + }, + { + "epoch": 0.21436, + "grad_norm": 2.1875, + "grad_norm_var": 0.020216623942057293, + "learning_rate": 0.0001, + "loss": 4.4144, + "loss/crossentropy": 2.1955957412719727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20932748913764954, + "step": 10718 + }, + { + "epoch": 0.2144, + "grad_norm": 2.140625, + "grad_norm_var": 0.020271809895833333, + "learning_rate": 0.0001, + "loss": 4.2135, + "loss/crossentropy": 1.7800896763801575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1939619928598404, + "step": 10720 + }, + { + "epoch": 0.21444, + "grad_norm": 2.15625, + "grad_norm_var": 0.020335896809895834, + "learning_rate": 0.0001, + "loss": 4.2757, + "loss/crossentropy": 1.8974847197532654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19434937089681625, + "step": 10722 + }, + { + "epoch": 0.21448, + "grad_norm": 2.171875, + "grad_norm_var": 0.021455891927083335, + "learning_rate": 0.0001, + "loss": 4.2688, + "loss/crossentropy": 2.1631242632865906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21765826642513275, + "step": 10724 + }, + { + "epoch": 0.21452, + "grad_norm": 2.03125, + "grad_norm_var": 0.0203521728515625, + "learning_rate": 0.0001, + "loss": 4.6421, + "loss/crossentropy": 2.241260290145874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2334459200501442, + "step": 10726 + }, + { + "epoch": 0.21456, + "grad_norm": 2.0625, + "grad_norm_var": 0.0226470947265625, + "learning_rate": 0.0001, + "loss": 3.9478, + "loss/crossentropy": 1.9063156247138977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22118454426527023, + "step": 10728 + }, + { + "epoch": 0.2146, + "grad_norm": 2.03125, + "grad_norm_var": 0.011310831705729166, + "learning_rate": 0.0001, + "loss": 4.0446, + "loss/crossentropy": 1.8109349012374878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18581955134868622, + "step": 10730 + }, + { + "epoch": 0.21464, + "grad_norm": 2.265625, + "grad_norm_var": 0.009749348958333333, + "learning_rate": 0.0001, + "loss": 4.3262, + "loss/crossentropy": 2.1113163232803345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21339301764965057, + "step": 10732 + }, + { + "epoch": 0.21468, + "grad_norm": 2.0625, + "grad_norm_var": 0.0108551025390625, + "learning_rate": 0.0001, + "loss": 4.5308, + "loss/crossentropy": 2.1731717586517334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22267260402441025, + "step": 10734 + }, + { + "epoch": 0.21472, + "grad_norm": 2.078125, + "grad_norm_var": 0.010087076822916667, + "learning_rate": 0.0001, + "loss": 4.2363, + "loss/crossentropy": 2.322808027267456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21375543624162674, + "step": 10736 + }, + { + "epoch": 0.21476, + "grad_norm": 2.0, + "grad_norm_var": 0.010358683268229167, + "learning_rate": 0.0001, + "loss": 4.109, + "loss/crossentropy": 1.8200489282608032, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19796901941299438, + "step": 10738 + }, + { + "epoch": 0.2148, + "grad_norm": 2.0625, + "grad_norm_var": 0.009455362955729166, + "learning_rate": 0.0001, + "loss": 4.0918, + "loss/crossentropy": 2.353461265563965, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22755976021289825, + "step": 10740 + }, + { + "epoch": 0.21484, + "grad_norm": 2.015625, + "grad_norm_var": 0.0071451822916666664, + "learning_rate": 0.0001, + "loss": 4.2925, + "loss/crossentropy": 2.3200663328170776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23450587689876556, + "step": 10742 + }, + { + "epoch": 0.21488, + "grad_norm": 2.0, + "grad_norm_var": 0.006403605143229167, + "learning_rate": 0.0001, + "loss": 4.187, + "loss/crossentropy": 1.7293912768363953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17938270419836044, + "step": 10744 + }, + { + "epoch": 0.21492, + "grad_norm": 1.8828125, + "grad_norm_var": 0.008656565348307292, + "learning_rate": 0.0001, + "loss": 4.3213, + "loss/crossentropy": 1.9759944081306458, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20631389319896698, + "step": 10746 + }, + { + "epoch": 0.21496, + "grad_norm": 2.1875, + "grad_norm_var": 0.006870269775390625, + "learning_rate": 0.0001, + "loss": 4.3658, + "loss/crossentropy": 2.2232764959335327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21919700503349304, + "step": 10748 + }, + { + "epoch": 0.215, + "grad_norm": 1.953125, + "grad_norm_var": 0.004898834228515625, + "learning_rate": 0.0001, + "loss": 4.2466, + "loss/crossentropy": 2.0827722549438477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20569747686386108, + "step": 10750 + }, + { + "epoch": 0.21504, + "grad_norm": 1.9765625, + "grad_norm_var": 0.005509440104166667, + "learning_rate": 0.0001, + "loss": 4.2373, + "loss/crossentropy": 2.0712032318115234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20504066348075867, + "step": 10752 + }, + { + "epoch": 0.21508, + "grad_norm": 2.234375, + "grad_norm_var": 0.007963053385416667, + "learning_rate": 0.0001, + "loss": 4.3466, + "loss/crossentropy": 2.2717082500457764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22957370430231094, + "step": 10754 + }, + { + "epoch": 0.21512, + "grad_norm": 2.359375, + "grad_norm_var": 0.0133056640625, + "learning_rate": 0.0001, + "loss": 4.2821, + "loss/crossentropy": 2.263810157775879, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2291572391986847, + "step": 10756 + }, + { + "epoch": 0.21516, + "grad_norm": 1.9921875, + "grad_norm_var": 0.013724517822265626, + "learning_rate": 0.0001, + "loss": 4.2157, + "loss/crossentropy": 2.103231191635132, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21839886158704758, + "step": 10758 + }, + { + "epoch": 0.2152, + "grad_norm": 2.203125, + "grad_norm_var": 0.013962554931640624, + "learning_rate": 0.0001, + "loss": 4.4772, + "loss/crossentropy": 2.1272148489952087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2280830293893814, + "step": 10760 + }, + { + "epoch": 0.21524, + "grad_norm": 2.078125, + "grad_norm_var": 0.012962849934895833, + "learning_rate": 0.0001, + "loss": 4.1192, + "loss/crossentropy": 2.133938789367676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2162996605038643, + "step": 10762 + }, + { + "epoch": 0.21528, + "grad_norm": 2.828125, + "grad_norm_var": 0.04528401692708333, + "learning_rate": 0.0001, + "loss": 4.5351, + "loss/crossentropy": 2.305683732032776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23797836154699326, + "step": 10764 + }, + { + "epoch": 0.21532, + "grad_norm": 2.078125, + "grad_norm_var": 0.04370829264322917, + "learning_rate": 0.0001, + "loss": 3.9886, + "loss/crossentropy": 1.7248413562774658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17226764559745789, + "step": 10766 + }, + { + "epoch": 0.21536, + "grad_norm": 2.1875, + "grad_norm_var": 0.041715240478515624, + "learning_rate": 0.0001, + "loss": 4.6422, + "loss/crossentropy": 2.275644540786743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2433001920580864, + "step": 10768 + }, + { + "epoch": 0.2154, + "grad_norm": 2.1875, + "grad_norm_var": 0.043338775634765625, + "learning_rate": 0.0001, + "loss": 4.2836, + "loss/crossentropy": 1.908457100391388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19758973270654678, + "step": 10770 + }, + { + "epoch": 0.21544, + "grad_norm": 1.8984375, + "grad_norm_var": 0.07423909505208333, + "learning_rate": 0.0001, + "loss": 4.2317, + "loss/crossentropy": 1.9401238560676575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22177566587924957, + "step": 10772 + }, + { + "epoch": 0.21548, + "grad_norm": 2.046875, + "grad_norm_var": 0.07476170857747395, + "learning_rate": 0.0001, + "loss": 4.295, + "loss/crossentropy": 2.247257351875305, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21560464799404144, + "step": 10774 + }, + { + "epoch": 0.21552, + "grad_norm": 2.015625, + "grad_norm_var": 0.07644424438476563, + "learning_rate": 0.0001, + "loss": 4.1482, + "loss/crossentropy": 1.9191248416900635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.214834064245224, + "step": 10776 + }, + { + "epoch": 0.21556, + "grad_norm": 2.0625, + "grad_norm_var": 0.0785888671875, + "learning_rate": 0.0001, + "loss": 4.0644, + "loss/crossentropy": 1.8963102102279663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20552606880664825, + "step": 10778 + }, + { + "epoch": 0.2156, + "grad_norm": 2.015625, + "grad_norm_var": 0.047078450520833336, + "learning_rate": 0.0001, + "loss": 4.265, + "loss/crossentropy": 2.0517951250076294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1963544860482216, + "step": 10780 + }, + { + "epoch": 0.21564, + "grad_norm": 2.171875, + "grad_norm_var": 0.046727498372395836, + "learning_rate": 0.0001, + "loss": 4.3571, + "loss/crossentropy": 2.1484888792037964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23963741213083267, + "step": 10782 + }, + { + "epoch": 0.21568, + "grad_norm": 2.671875, + "grad_norm_var": 0.06716206868489584, + "learning_rate": 0.0001, + "loss": 4.6827, + "loss/crossentropy": 2.012593388557434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2115195021033287, + "step": 10784 + }, + { + "epoch": 0.21572, + "grad_norm": 2.03125, + "grad_norm_var": 0.06651102701822917, + "learning_rate": 0.0001, + "loss": 4.018, + "loss/crossentropy": 2.139856696128845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23609659075737, + "step": 10786 + }, + { + "epoch": 0.21576, + "grad_norm": 2.25, + "grad_norm_var": 0.03216120402018229, + "learning_rate": 0.0001, + "loss": 4.5007, + "loss/crossentropy": 2.0139951705932617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22799161076545715, + "step": 10788 + }, + { + "epoch": 0.2158, + "grad_norm": 2.125, + "grad_norm_var": 0.035676829020182294, + "learning_rate": 0.0001, + "loss": 4.227, + "loss/crossentropy": 1.8325074315071106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18156912177801132, + "step": 10790 + }, + { + "epoch": 0.21584, + "grad_norm": 2.109375, + "grad_norm_var": 0.04155654907226562, + "learning_rate": 0.0001, + "loss": 3.8042, + "loss/crossentropy": 1.7531892657279968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18498936295509338, + "step": 10792 + }, + { + "epoch": 0.21588, + "grad_norm": 2.25, + "grad_norm_var": 0.03681233723958333, + "learning_rate": 0.0001, + "loss": 4.3955, + "loss/crossentropy": 2.09742671251297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21543975174427032, + "step": 10794 + }, + { + "epoch": 0.21592, + "grad_norm": 2.09375, + "grad_norm_var": 0.03532613118489583, + "learning_rate": 0.0001, + "loss": 4.4752, + "loss/crossentropy": 2.4303117990493774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24365779757499695, + "step": 10796 + }, + { + "epoch": 0.21596, + "grad_norm": 2.1875, + "grad_norm_var": 0.0371002197265625, + "learning_rate": 0.0001, + "loss": 4.1973, + "loss/crossentropy": 2.007950007915497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20470578223466873, + "step": 10798 + }, + { + "epoch": 0.216, + "grad_norm": 2.5, + "grad_norm_var": 0.026949055989583335, + "learning_rate": 0.0001, + "loss": 3.9975, + "loss/crossentropy": 1.895507276058197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2005787119269371, + "step": 10800 + }, + { + "epoch": 0.21604, + "grad_norm": 2.25, + "grad_norm_var": 0.026691691080729166, + "learning_rate": 0.0001, + "loss": 4.2543, + "loss/crossentropy": 2.174374043941498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2282470539212227, + "step": 10802 + }, + { + "epoch": 0.21608, + "grad_norm": 1.9921875, + "grad_norm_var": 0.02789484659830729, + "learning_rate": 0.0001, + "loss": 4.0642, + "loss/crossentropy": 1.8878389596939087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19730794429779053, + "step": 10804 + }, + { + "epoch": 0.21612, + "grad_norm": 2.203125, + "grad_norm_var": 0.023884073893229166, + "learning_rate": 0.0001, + "loss": 4.1733, + "loss/crossentropy": 1.9874022006988525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21521702408790588, + "step": 10806 + }, + { + "epoch": 0.21616, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01962458292643229, + "learning_rate": 0.0001, + "loss": 4.0783, + "loss/crossentropy": 2.0173474550247192, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21552105993032455, + "step": 10808 + }, + { + "epoch": 0.2162, + "grad_norm": 2.140625, + "grad_norm_var": 0.018344879150390625, + "learning_rate": 0.0001, + "loss": 4.0583, + "loss/crossentropy": 2.175139367580414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20279064774513245, + "step": 10810 + }, + { + "epoch": 0.21624, + "grad_norm": 2.390625, + "grad_norm_var": 0.022849273681640626, + "learning_rate": 0.0001, + "loss": 4.5125, + "loss/crossentropy": 2.6407864093780518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2939087897539139, + "step": 10812 + }, + { + "epoch": 0.21628, + "grad_norm": 2.09375, + "grad_norm_var": 0.024930826822916665, + "learning_rate": 0.0001, + "loss": 4.0869, + "loss/crossentropy": 1.947974681854248, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2102196291089058, + "step": 10814 + }, + { + "epoch": 0.21632, + "grad_norm": 9.875, + "grad_norm_var": 3.7874745686848956, + "learning_rate": 0.0001, + "loss": 4.3404, + "loss/crossentropy": 1.9040276408195496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20579400658607483, + "step": 10816 + }, + { + "epoch": 0.21636, + "grad_norm": 2.21875, + "grad_norm_var": 3.7853190104166665, + "learning_rate": 0.0001, + "loss": 3.7119, + "loss/crossentropy": 1.7941421270370483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19792088121175766, + "step": 10818 + }, + { + "epoch": 0.2164, + "grad_norm": 1.953125, + "grad_norm_var": 3.7839088439941406, + "learning_rate": 0.0001, + "loss": 4.2573, + "loss/crossentropy": 2.2164441347122192, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2903618812561035, + "step": 10820 + }, + { + "epoch": 0.21644, + "grad_norm": 2.109375, + "grad_norm_var": 3.77445068359375, + "learning_rate": 0.0001, + "loss": 4.2537, + "loss/crossentropy": 1.9888933897018433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20800678431987762, + "step": 10822 + }, + { + "epoch": 0.21648, + "grad_norm": 1.875, + "grad_norm_var": 3.79569091796875, + "learning_rate": 0.0001, + "loss": 3.9218, + "loss/crossentropy": 1.9026559591293335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18179579824209213, + "step": 10824 + }, + { + "epoch": 0.21652, + "grad_norm": 2.078125, + "grad_norm_var": 3.7974202473958334, + "learning_rate": 0.0001, + "loss": 4.1267, + "loss/crossentropy": 1.9020920991897583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20755057036876678, + "step": 10826 + }, + { + "epoch": 0.21656, + "grad_norm": 2.03125, + "grad_norm_var": 3.8177286783854165, + "learning_rate": 0.0001, + "loss": 4.4689, + "loss/crossentropy": 2.3464537858963013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23209182918071747, + "step": 10828 + }, + { + "epoch": 0.2166, + "grad_norm": 2.3125, + "grad_norm_var": 3.7860877990722654, + "learning_rate": 0.0001, + "loss": 4.4486, + "loss/crossentropy": 2.305312991142273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2548810988664627, + "step": 10830 + }, + { + "epoch": 0.21664, + "grad_norm": 2.515625, + "grad_norm_var": 0.026775868733723958, + "learning_rate": 0.0001, + "loss": 3.9818, + "loss/crossentropy": 1.4548576474189758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1718049794435501, + "step": 10832 + }, + { + "epoch": 0.21668, + "grad_norm": 2.1875, + "grad_norm_var": 0.02504247029622396, + "learning_rate": 0.0001, + "loss": 4.4909, + "loss/crossentropy": 2.2954181432724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22977370768785477, + "step": 10834 + }, + { + "epoch": 0.21672, + "grad_norm": 2.09375, + "grad_norm_var": 0.022930653889973958, + "learning_rate": 0.0001, + "loss": 4.0433, + "loss/crossentropy": 2.1974023580551147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21474600583314896, + "step": 10836 + }, + { + "epoch": 0.21676, + "grad_norm": 2.109375, + "grad_norm_var": 0.02304865519205729, + "learning_rate": 0.0001, + "loss": 4.5905, + "loss/crossentropy": 2.1211158633232117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19471213221549988, + "step": 10838 + }, + { + "epoch": 0.2168, + "grad_norm": 1.984375, + "grad_norm_var": 0.0204010009765625, + "learning_rate": 0.0001, + "loss": 4.0868, + "loss/crossentropy": 1.9942908883094788, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1899253949522972, + "step": 10840 + }, + { + "epoch": 0.21684, + "grad_norm": 1.9296875, + "grad_norm_var": 0.023656209309895832, + "learning_rate": 0.0001, + "loss": 4.0787, + "loss/crossentropy": 1.9848283529281616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1777423620223999, + "step": 10842 + }, + { + "epoch": 0.21688, + "grad_norm": 1.9296875, + "grad_norm_var": 0.02545140584309896, + "learning_rate": 0.0001, + "loss": 4.1365, + "loss/crossentropy": 1.84994775056839, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19193057715892792, + "step": 10844 + }, + { + "epoch": 0.21692, + "grad_norm": 2.09375, + "grad_norm_var": 0.02163670857747396, + "learning_rate": 0.0001, + "loss": 4.2869, + "loss/crossentropy": 1.8300130367279053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19747256487607956, + "step": 10846 + }, + { + "epoch": 0.21696, + "grad_norm": 2.234375, + "grad_norm_var": 0.010676829020182292, + "learning_rate": 0.0001, + "loss": 4.1624, + "loss/crossentropy": 1.9316250681877136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22893256694078445, + "step": 10848 + }, + { + "epoch": 0.217, + "grad_norm": 2.109375, + "grad_norm_var": 0.009085845947265626, + "learning_rate": 0.0001, + "loss": 4.0294, + "loss/crossentropy": 1.9862067103385925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22470611333847046, + "step": 10850 + }, + { + "epoch": 0.21704, + "grad_norm": 2.140625, + "grad_norm_var": 0.009372711181640625, + "learning_rate": 0.0001, + "loss": 4.3979, + "loss/crossentropy": 2.196234107017517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22592387348413467, + "step": 10852 + }, + { + "epoch": 0.21708, + "grad_norm": 2.328125, + "grad_norm_var": 0.2918291727701823, + "learning_rate": 0.0001, + "loss": 4.5516, + "loss/crossentropy": 2.3097496032714844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29694322496652603, + "step": 10854 + }, + { + "epoch": 0.21712, + "grad_norm": 1.984375, + "grad_norm_var": 0.2865191141764323, + "learning_rate": 0.0001, + "loss": 3.8687, + "loss/crossentropy": 1.883777916431427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21657781302928925, + "step": 10856 + }, + { + "epoch": 0.21716, + "grad_norm": 2.25, + "grad_norm_var": 0.2916338602701823, + "learning_rate": 0.0001, + "loss": 4.1915, + "loss/crossentropy": 2.0483964681625366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22432015091180801, + "step": 10858 + }, + { + "epoch": 0.2172, + "grad_norm": 2.140625, + "grad_norm_var": 0.2870839436848958, + "learning_rate": 0.0001, + "loss": 4.1503, + "loss/crossentropy": 2.2700769901275635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21760191768407822, + "step": 10860 + }, + { + "epoch": 0.21724, + "grad_norm": 2.03125, + "grad_norm_var": 0.2878214518229167, + "learning_rate": 0.0001, + "loss": 4.574, + "loss/crossentropy": 2.3758704662323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21792180836200714, + "step": 10862 + }, + { + "epoch": 0.21728, + "grad_norm": 2.03125, + "grad_norm_var": 0.2934641520182292, + "learning_rate": 0.0001, + "loss": 4.2895, + "loss/crossentropy": 2.1972473859786987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22460343688726425, + "step": 10864 + }, + { + "epoch": 0.21732, + "grad_norm": 2.171875, + "grad_norm_var": 0.29321187337239585, + "learning_rate": 0.0001, + "loss": 4.3693, + "loss/crossentropy": 1.9811919331550598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23008134216070175, + "step": 10866 + }, + { + "epoch": 0.21736, + "grad_norm": 2.03125, + "grad_norm_var": 0.2992327372233073, + "learning_rate": 0.0001, + "loss": 4.2237, + "loss/crossentropy": 2.032214403152466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1958027482032776, + "step": 10868 + }, + { + "epoch": 0.2174, + "grad_norm": 2.09375, + "grad_norm_var": 0.035162099202473956, + "learning_rate": 0.0001, + "loss": 4.0956, + "loss/crossentropy": 2.08488667011261, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21948565542697906, + "step": 10870 + }, + { + "epoch": 0.21744, + "grad_norm": 2.109375, + "grad_norm_var": 0.03401260375976563, + "learning_rate": 0.0001, + "loss": 4.2923, + "loss/crossentropy": 2.304922103881836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1977907121181488, + "step": 10872 + }, + { + "epoch": 0.21748, + "grad_norm": 2.03125, + "grad_norm_var": 0.005582427978515625, + "learning_rate": 0.0001, + "loss": 4.1762, + "loss/crossentropy": 1.991280436515808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2196018323302269, + "step": 10874 + }, + { + "epoch": 0.21752, + "grad_norm": 2.09375, + "grad_norm_var": 0.004965972900390625, + "learning_rate": 0.0001, + "loss": 4.4116, + "loss/crossentropy": 1.955183207988739, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21059879660606384, + "step": 10876 + }, + { + "epoch": 0.21756, + "grad_norm": 2.140625, + "grad_norm_var": 0.0045562744140625, + "learning_rate": 0.0001, + "loss": 4.2105, + "loss/crossentropy": 2.1443604230880737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23160522431135178, + "step": 10878 + }, + { + "epoch": 0.2176, + "grad_norm": 2.140625, + "grad_norm_var": 0.004889933268229166, + "learning_rate": 0.0001, + "loss": 4.2374, + "loss/crossentropy": 2.0859211683273315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2296902909874916, + "step": 10880 + }, + { + "epoch": 0.21764, + "grad_norm": 2.0, + "grad_norm_var": 0.0059234619140625, + "learning_rate": 0.0001, + "loss": 4.3303, + "loss/crossentropy": 2.220987915992737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22863281518220901, + "step": 10882 + }, + { + "epoch": 0.21768, + "grad_norm": 1.96875, + "grad_norm_var": 0.005625152587890625, + "learning_rate": 0.0001, + "loss": 4.2195, + "loss/crossentropy": 2.1172574758529663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.197882778942585, + "step": 10884 + }, + { + "epoch": 0.21772, + "grad_norm": 2.25, + "grad_norm_var": 0.02047704060872396, + "learning_rate": 0.0001, + "loss": 4.3907, + "loss/crossentropy": 1.7861940264701843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1984492540359497, + "step": 10886 + }, + { + "epoch": 0.21776, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0218414306640625, + "learning_rate": 0.0001, + "loss": 4.0445, + "loss/crossentropy": 2.1119033098220825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20427606999874115, + "step": 10888 + }, + { + "epoch": 0.2178, + "grad_norm": 2.15625, + "grad_norm_var": 0.02235692342122396, + "learning_rate": 0.0001, + "loss": 3.923, + "loss/crossentropy": 1.7849717140197754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20817570388317108, + "step": 10890 + }, + { + "epoch": 0.21784, + "grad_norm": 2.03125, + "grad_norm_var": 0.025131988525390624, + "learning_rate": 0.0001, + "loss": 4.2015, + "loss/crossentropy": 2.0543535351753235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20579595863819122, + "step": 10892 + }, + { + "epoch": 0.21788, + "grad_norm": 1.8984375, + "grad_norm_var": 0.02643000284830729, + "learning_rate": 0.0001, + "loss": 4.1527, + "loss/crossentropy": 2.085465431213379, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23541826009750366, + "step": 10894 + }, + { + "epoch": 0.21792, + "grad_norm": 2.40625, + "grad_norm_var": 0.03343073527018229, + "learning_rate": 0.0001, + "loss": 4.3288, + "loss/crossentropy": 1.9521069526672363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21404746174812317, + "step": 10896 + }, + { + "epoch": 0.21796, + "grad_norm": 1.96875, + "grad_norm_var": 0.03172378540039063, + "learning_rate": 0.0001, + "loss": 4.039, + "loss/crossentropy": 1.8710272908210754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20500759035348892, + "step": 10898 + }, + { + "epoch": 0.218, + "grad_norm": 2.0625, + "grad_norm_var": 0.032083892822265626, + "learning_rate": 0.0001, + "loss": 4.0695, + "loss/crossentropy": 2.275243639945984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24073244631290436, + "step": 10900 + }, + { + "epoch": 0.21804, + "grad_norm": 2.109375, + "grad_norm_var": 0.0181060791015625, + "learning_rate": 0.0001, + "loss": 4.3508, + "loss/crossentropy": 2.2093106508255005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23254899680614471, + "step": 10902 + }, + { + "epoch": 0.21808, + "grad_norm": 2.0, + "grad_norm_var": 0.01702855428059896, + "learning_rate": 0.0001, + "loss": 4.2805, + "loss/crossentropy": 2.0315810441970825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20614907145500183, + "step": 10904 + }, + { + "epoch": 0.21812, + "grad_norm": 2.015625, + "grad_norm_var": 0.016478474934895834, + "learning_rate": 0.0001, + "loss": 4.3461, + "loss/crossentropy": 2.0397735834121704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2093086987733841, + "step": 10906 + }, + { + "epoch": 0.21816, + "grad_norm": 2.03125, + "grad_norm_var": 0.013508097330729166, + "learning_rate": 0.0001, + "loss": 4.0286, + "loss/crossentropy": 1.9616519808769226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19433944672346115, + "step": 10908 + }, + { + "epoch": 0.2182, + "grad_norm": 2.25, + "grad_norm_var": 0.01749445597330729, + "learning_rate": 0.0001, + "loss": 4.2157, + "loss/crossentropy": 2.1907248497009277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23218485713005066, + "step": 10910 + }, + { + "epoch": 0.21824, + "grad_norm": 2.0625, + "grad_norm_var": 0.009981282552083333, + "learning_rate": 0.0001, + "loss": 4.163, + "loss/crossentropy": 2.2785152196884155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21416624635457993, + "step": 10912 + }, + { + "epoch": 0.21828, + "grad_norm": 2.03125, + "grad_norm_var": 0.00955810546875, + "learning_rate": 0.0001, + "loss": 4.2528, + "loss/crossentropy": 2.151498794555664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23632780462503433, + "step": 10914 + }, + { + "epoch": 0.21832, + "grad_norm": 2.03125, + "grad_norm_var": 0.008829752604166666, + "learning_rate": 0.0001, + "loss": 4.3937, + "loss/crossentropy": 2.816411852836609, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2243700549006462, + "step": 10916 + }, + { + "epoch": 0.21836, + "grad_norm": 2.359375, + "grad_norm_var": 0.014662424723307291, + "learning_rate": 0.0001, + "loss": 4.4091, + "loss/crossentropy": 2.2203346490859985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22302204370498657, + "step": 10918 + }, + { + "epoch": 0.2184, + "grad_norm": 2.0, + "grad_norm_var": 0.014662424723307291, + "learning_rate": 0.0001, + "loss": 4.3534, + "loss/crossentropy": 2.0416316390037537, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20798437297344208, + "step": 10920 + }, + { + "epoch": 0.21844, + "grad_norm": 2.078125, + "grad_norm_var": 0.014426422119140626, + "learning_rate": 0.0001, + "loss": 4.1014, + "loss/crossentropy": 2.2251007556915283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2149331197142601, + "step": 10922 + }, + { + "epoch": 0.21848, + "grad_norm": 2.0, + "grad_norm_var": 0.014631907145182291, + "learning_rate": 0.0001, + "loss": 4.2346, + "loss/crossentropy": 1.976640522480011, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21704353392124176, + "step": 10924 + }, + { + "epoch": 0.21852, + "grad_norm": 2.0625, + "grad_norm_var": 0.011006418863932292, + "learning_rate": 0.0001, + "loss": 4.3284, + "loss/crossentropy": 2.041864037513733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19621634483337402, + "step": 10926 + }, + { + "epoch": 0.21856, + "grad_norm": 2.125, + "grad_norm_var": 0.009696451822916667, + "learning_rate": 0.0001, + "loss": 4.241, + "loss/crossentropy": 1.7188060879707336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1992729976773262, + "step": 10928 + }, + { + "epoch": 0.2186, + "grad_norm": 2.140625, + "grad_norm_var": 0.010512034098307291, + "learning_rate": 0.0001, + "loss": 4.0333, + "loss/crossentropy": 1.664458692073822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20099520683288574, + "step": 10930 + }, + { + "epoch": 0.21864, + "grad_norm": 2.015625, + "grad_norm_var": 0.011161041259765626, + "learning_rate": 0.0001, + "loss": 4.1952, + "loss/crossentropy": 2.199326276779175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23000852018594742, + "step": 10932 + }, + { + "epoch": 0.21868, + "grad_norm": 2.015625, + "grad_norm_var": 0.005694325764973958, + "learning_rate": 0.0001, + "loss": 4.1399, + "loss/crossentropy": 1.980049967765808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2211422473192215, + "step": 10934 + }, + { + "epoch": 0.21872, + "grad_norm": 2.09375, + "grad_norm_var": 0.005411529541015625, + "learning_rate": 0.0001, + "loss": 4.0876, + "loss/crossentropy": 1.7884072661399841, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.203270323574543, + "step": 10936 + }, + { + "epoch": 0.21876, + "grad_norm": 2.0, + "grad_norm_var": 0.005527496337890625, + "learning_rate": 0.0001, + "loss": 4.1923, + "loss/crossentropy": 2.1758522987365723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21960537880659103, + "step": 10938 + }, + { + "epoch": 0.2188, + "grad_norm": 2.171875, + "grad_norm_var": 0.004416656494140625, + "learning_rate": 0.0001, + "loss": 4.1708, + "loss/crossentropy": 2.255508303642273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2171120047569275, + "step": 10940 + }, + { + "epoch": 0.21884, + "grad_norm": 2.0, + "grad_norm_var": 0.004308827718098958, + "learning_rate": 0.0001, + "loss": 4.1294, + "loss/crossentropy": 1.9758012890815735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19945884495973587, + "step": 10942 + }, + { + "epoch": 0.21888, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0055582682291666664, + "learning_rate": 0.0001, + "loss": 4.2122, + "loss/crossentropy": 2.2980172634124756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20864219218492508, + "step": 10944 + }, + { + "epoch": 0.21892, + "grad_norm": 2.09375, + "grad_norm_var": 0.006666819254557292, + "learning_rate": 0.0001, + "loss": 3.919, + "loss/crossentropy": 1.5307916402816772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1799665167927742, + "step": 10946 + }, + { + "epoch": 0.21896, + "grad_norm": 2.0625, + "grad_norm_var": 0.006648508707682291, + "learning_rate": 0.0001, + "loss": 4.3796, + "loss/crossentropy": 2.3332602977752686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22964681684970856, + "step": 10948 + }, + { + "epoch": 0.219, + "grad_norm": 2.078125, + "grad_norm_var": 0.007380930582682291, + "learning_rate": 0.0001, + "loss": 4.1207, + "loss/crossentropy": 2.2509007453918457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22141733020544052, + "step": 10950 + }, + { + "epoch": 0.21904, + "grad_norm": 2.1875, + "grad_norm_var": 0.008949534098307291, + "learning_rate": 0.0001, + "loss": 4.4879, + "loss/crossentropy": 2.3677018880844116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25676336884498596, + "step": 10952 + }, + { + "epoch": 0.21908, + "grad_norm": 2.21875, + "grad_norm_var": 0.009934234619140624, + "learning_rate": 0.0001, + "loss": 4.4283, + "loss/crossentropy": 2.2807843685150146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21486903727054596, + "step": 10954 + }, + { + "epoch": 0.21912, + "grad_norm": 2.078125, + "grad_norm_var": 0.009232330322265624, + "learning_rate": 0.0001, + "loss": 4.3126, + "loss/crossentropy": 2.3883347511291504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21362561732530594, + "step": 10956 + }, + { + "epoch": 0.21916, + "grad_norm": 2.0625, + "grad_norm_var": 0.009405263264973958, + "learning_rate": 0.0001, + "loss": 4.0692, + "loss/crossentropy": 1.8598107695579529, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19806977361440659, + "step": 10958 + }, + { + "epoch": 0.2192, + "grad_norm": 2.140625, + "grad_norm_var": 0.007738240559895833, + "learning_rate": 0.0001, + "loss": 4.3259, + "loss/crossentropy": 2.1962249875068665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20640095323324203, + "step": 10960 + }, + { + "epoch": 0.21924, + "grad_norm": 2.359375, + "grad_norm_var": 0.007958984375, + "learning_rate": 0.0001, + "loss": 4.1774, + "loss/crossentropy": 2.061887502670288, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22463876008987427, + "step": 10962 + }, + { + "epoch": 0.21928, + "grad_norm": 2.0625, + "grad_norm_var": 0.010237375895182291, + "learning_rate": 0.0001, + "loss": 3.9749, + "loss/crossentropy": 1.9112628102302551, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19327937066555023, + "step": 10964 + }, + { + "epoch": 0.21932, + "grad_norm": 2.03125, + "grad_norm_var": 0.010640207926432292, + "learning_rate": 0.0001, + "loss": 4.3078, + "loss/crossentropy": 2.1655001640319824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22729264944791794, + "step": 10966 + }, + { + "epoch": 0.21936, + "grad_norm": 2.171875, + "grad_norm_var": 0.010400136311848959, + "learning_rate": 0.0001, + "loss": 4.1641, + "loss/crossentropy": 2.199634552001953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23565081506967545, + "step": 10968 + }, + { + "epoch": 0.2194, + "grad_norm": 2.03125, + "grad_norm_var": 0.010155995686848959, + "learning_rate": 0.0001, + "loss": 4.4298, + "loss/crossentropy": 2.4142041206359863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23020224273204803, + "step": 10970 + }, + { + "epoch": 0.21944, + "grad_norm": 2.03125, + "grad_norm_var": 0.011193593343098959, + "learning_rate": 0.0001, + "loss": 4.3856, + "loss/crossentropy": 2.2480571269989014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.224863201379776, + "step": 10972 + }, + { + "epoch": 0.21948, + "grad_norm": 1.9453125, + "grad_norm_var": 0.012238566080729167, + "learning_rate": 0.0001, + "loss": 4.099, + "loss/crossentropy": 2.1786953806877136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21182173490524292, + "step": 10974 + }, + { + "epoch": 0.21952, + "grad_norm": 1.9765625, + "grad_norm_var": 0.013337961832682292, + "learning_rate": 0.0001, + "loss": 4.2826, + "loss/crossentropy": 2.3767744302749634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22259585559368134, + "step": 10976 + }, + { + "epoch": 0.21956, + "grad_norm": 2.25, + "grad_norm_var": 0.010794830322265626, + "learning_rate": 0.0001, + "loss": 4.405, + "loss/crossentropy": 1.9838140606880188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21830464899539948, + "step": 10978 + }, + { + "epoch": 0.2196, + "grad_norm": 2.046875, + "grad_norm_var": 0.0092193603515625, + "learning_rate": 0.0001, + "loss": 4.3811, + "loss/crossentropy": 2.0167009234428406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21239794790744781, + "step": 10980 + }, + { + "epoch": 0.21964, + "grad_norm": 2.5, + "grad_norm_var": 0.019269816080729165, + "learning_rate": 0.0001, + "loss": 4.3117, + "loss/crossentropy": 2.001616358757019, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21801268309354782, + "step": 10982 + }, + { + "epoch": 0.21968, + "grad_norm": 2.1875, + "grad_norm_var": 0.019432576497395833, + "learning_rate": 0.0001, + "loss": 4.6266, + "loss/crossentropy": 2.259532332420349, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20131191611289978, + "step": 10984 + }, + { + "epoch": 0.21972, + "grad_norm": 2.09375, + "grad_norm_var": 0.0215728759765625, + "learning_rate": 0.0001, + "loss": 4.1159, + "loss/crossentropy": 1.9223415851593018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21203956007957458, + "step": 10986 + }, + { + "epoch": 0.21976, + "grad_norm": 2.0, + "grad_norm_var": 0.020295206705729166, + "learning_rate": 0.0001, + "loss": 4.0177, + "loss/crossentropy": 2.3779542446136475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23436614871025085, + "step": 10988 + }, + { + "epoch": 0.2198, + "grad_norm": 2.046875, + "grad_norm_var": 0.02195002237955729, + "learning_rate": 0.0001, + "loss": 4.0412, + "loss/crossentropy": 1.8552000522613525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20212603360414505, + "step": 10990 + }, + { + "epoch": 0.21984, + "grad_norm": 2.03125, + "grad_norm_var": 0.0206451416015625, + "learning_rate": 0.0001, + "loss": 4.2264, + "loss/crossentropy": 1.79928320646286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19676420837640762, + "step": 10992 + }, + { + "epoch": 0.21988, + "grad_norm": 2.078125, + "grad_norm_var": 0.018619791666666666, + "learning_rate": 0.0001, + "loss": 4.2426, + "loss/crossentropy": 1.886910319328308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21270643919706345, + "step": 10994 + }, + { + "epoch": 0.21992, + "grad_norm": 2.078125, + "grad_norm_var": 0.019066365559895833, + "learning_rate": 0.0001, + "loss": 4.0775, + "loss/crossentropy": 1.5682110786437988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19159042835235596, + "step": 10996 + }, + { + "epoch": 0.21996, + "grad_norm": 2.21875, + "grad_norm_var": 0.008104451497395833, + "learning_rate": 0.0001, + "loss": 4.2933, + "loss/crossentropy": 2.3335143327713013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23225059360265732, + "step": 10998 + }, + { + "epoch": 0.22, + "grad_norm": 1.953125, + "grad_norm_var": 0.007222493489583333, + "learning_rate": 0.0001, + "loss": 4.3146, + "loss/crossentropy": 2.2234359979629517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21465980261564255, + "step": 11000 + }, + { + "epoch": 0.22004, + "grad_norm": 2.09375, + "grad_norm_var": 0.00601806640625, + "learning_rate": 0.0001, + "loss": 4.6023, + "loss/crossentropy": 2.3676271438598633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2674448639154434, + "step": 11002 + }, + { + "epoch": 0.22008, + "grad_norm": 2.0, + "grad_norm_var": 0.006180826822916667, + "learning_rate": 0.0001, + "loss": 4.1064, + "loss/crossentropy": 1.9216612577438354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2036839798092842, + "step": 11004 + }, + { + "epoch": 0.22012, + "grad_norm": 2.078125, + "grad_norm_var": 0.004369099934895833, + "learning_rate": 0.0001, + "loss": 4.0427, + "loss/crossentropy": 1.6732578873634338, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18962469696998596, + "step": 11006 + }, + { + "epoch": 0.22016, + "grad_norm": 2.265625, + "grad_norm_var": 0.009761555989583334, + "learning_rate": 0.0001, + "loss": 4.2704, + "loss/crossentropy": 2.0608668327331543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23521222919225693, + "step": 11008 + }, + { + "epoch": 0.2202, + "grad_norm": 2.015625, + "grad_norm_var": 0.010872395833333333, + "learning_rate": 0.0001, + "loss": 4.1808, + "loss/crossentropy": 1.8038227558135986, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1811496466398239, + "step": 11010 + }, + { + "epoch": 0.22024, + "grad_norm": 2.15625, + "grad_norm_var": 0.010789998372395833, + "learning_rate": 0.0001, + "loss": 4.3407, + "loss/crossentropy": 1.8687627911567688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20196181535720825, + "step": 11012 + }, + { + "epoch": 0.22028, + "grad_norm": 2.1875, + "grad_norm_var": 0.01138916015625, + "learning_rate": 0.0001, + "loss": 4.0608, + "loss/crossentropy": 1.8011687397956848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18384280055761337, + "step": 11014 + }, + { + "epoch": 0.22032, + "grad_norm": 1.9609375, + "grad_norm_var": 0.012727610270182292, + "learning_rate": 0.0001, + "loss": 4.0229, + "loss/crossentropy": 1.8108493089675903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19042345136404037, + "step": 11016 + }, + { + "epoch": 0.22036, + "grad_norm": 2.09375, + "grad_norm_var": 0.012611643473307291, + "learning_rate": 0.0001, + "loss": 4.2755, + "loss/crossentropy": 2.4056872129440308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22949732840061188, + "step": 11018 + }, + { + "epoch": 0.2204, + "grad_norm": 2.0625, + "grad_norm_var": 0.011451975504557291, + "learning_rate": 0.0001, + "loss": 4.2768, + "loss/crossentropy": 2.0417627692222595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21793337166309357, + "step": 11020 + }, + { + "epoch": 0.22044, + "grad_norm": 2.125, + "grad_norm_var": 0.011549631754557291, + "learning_rate": 0.0001, + "loss": 4.2767, + "loss/crossentropy": 1.8312503099441528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19038879871368408, + "step": 11022 + }, + { + "epoch": 0.22048, + "grad_norm": 2.484375, + "grad_norm_var": 0.01941095987955729, + "learning_rate": 0.0001, + "loss": 4.1846, + "loss/crossentropy": 2.0615866780281067, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.224861241877079, + "step": 11024 + }, + { + "epoch": 0.22052, + "grad_norm": 1.984375, + "grad_norm_var": 0.02702611287434896, + "learning_rate": 0.0001, + "loss": 4.0216, + "loss/crossentropy": 1.8578996062278748, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20426958799362183, + "step": 11026 + }, + { + "epoch": 0.22056, + "grad_norm": 1.9375, + "grad_norm_var": 0.028364817301432293, + "learning_rate": 0.0001, + "loss": 3.9303, + "loss/crossentropy": 2.01333224773407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2140725627541542, + "step": 11028 + }, + { + "epoch": 0.2206, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0296142578125, + "learning_rate": 0.0001, + "loss": 3.8142, + "loss/crossentropy": 1.9607431292533875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19605513662099838, + "step": 11030 + }, + { + "epoch": 0.22064, + "grad_norm": 2.078125, + "grad_norm_var": 0.026712799072265626, + "learning_rate": 0.0001, + "loss": 4.1336, + "loss/crossentropy": 1.8091335892677307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18755877763032913, + "step": 11032 + }, + { + "epoch": 0.22068, + "grad_norm": 2.09375, + "grad_norm_var": 0.0282958984375, + "learning_rate": 0.0001, + "loss": 4.0206, + "loss/crossentropy": 1.5547168254852295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18891388177871704, + "step": 11034 + }, + { + "epoch": 0.22072, + "grad_norm": 1.9453125, + "grad_norm_var": 0.029605865478515625, + "learning_rate": 0.0001, + "loss": 4.1658, + "loss/crossentropy": 2.0881760120391846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21140296012163162, + "step": 11036 + }, + { + "epoch": 0.22076, + "grad_norm": 2.953125, + "grad_norm_var": 0.07485936482747396, + "learning_rate": 0.0001, + "loss": 4.0865, + "loss/crossentropy": 2.0874351263046265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2202945575118065, + "step": 11038 + }, + { + "epoch": 0.2208, + "grad_norm": 2.109375, + "grad_norm_var": 0.06611302693684896, + "learning_rate": 0.0001, + "loss": 4.3998, + "loss/crossentropy": 2.2349069118499756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2247784063220024, + "step": 11040 + }, + { + "epoch": 0.22084, + "grad_norm": 2.328125, + "grad_norm_var": 0.060373687744140626, + "learning_rate": 0.0001, + "loss": 4.4679, + "loss/crossentropy": 2.0404593348503113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23377884924411774, + "step": 11042 + }, + { + "epoch": 0.22088, + "grad_norm": 2.046875, + "grad_norm_var": 0.05997314453125, + "learning_rate": 0.0001, + "loss": 4.115, + "loss/crossentropy": 2.418761968612671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2288268655538559, + "step": 11044 + }, + { + "epoch": 0.22092, + "grad_norm": 2.25, + "grad_norm_var": 0.05608495076497396, + "learning_rate": 0.0001, + "loss": 4.514, + "loss/crossentropy": 2.3263691663742065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23381420969963074, + "step": 11046 + }, + { + "epoch": 0.22096, + "grad_norm": 2.34375, + "grad_norm_var": 0.058166249593098955, + "learning_rate": 0.0001, + "loss": 4.3659, + "loss/crossentropy": 2.0020187497138977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1993579939007759, + "step": 11048 + }, + { + "epoch": 0.221, + "grad_norm": 1.984375, + "grad_norm_var": 0.05726318359375, + "learning_rate": 0.0001, + "loss": 4.1869, + "loss/crossentropy": 2.3417880535125732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21232012659311295, + "step": 11050 + }, + { + "epoch": 0.22104, + "grad_norm": 2.375, + "grad_norm_var": 0.06346817016601562, + "learning_rate": 0.0001, + "loss": 4.9882, + "loss/crossentropy": 2.2952964305877686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22096505016088486, + "step": 11052 + }, + { + "epoch": 0.22108, + "grad_norm": 2.015625, + "grad_norm_var": 0.028562164306640624, + "learning_rate": 0.0001, + "loss": 4.2431, + "loss/crossentropy": 1.7963152527809143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18996400386095047, + "step": 11054 + }, + { + "epoch": 0.22112, + "grad_norm": 2.109375, + "grad_norm_var": 0.028433990478515626, + "learning_rate": 0.0001, + "loss": 4.4276, + "loss/crossentropy": 2.1328593492507935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21772069483995438, + "step": 11056 + }, + { + "epoch": 0.22116, + "grad_norm": 2.3125, + "grad_norm_var": 0.0318267822265625, + "learning_rate": 0.0001, + "loss": 4.317, + "loss/crossentropy": 1.8048993349075317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17359213531017303, + "step": 11058 + }, + { + "epoch": 0.2212, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0309234619140625, + "learning_rate": 0.0001, + "loss": 4.0173, + "loss/crossentropy": 1.9539333581924438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20828361809253693, + "step": 11060 + }, + { + "epoch": 0.22124, + "grad_norm": 1.9921875, + "grad_norm_var": 0.032291412353515625, + "learning_rate": 0.0001, + "loss": 4.0288, + "loss/crossentropy": 1.8231340050697327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20730753242969513, + "step": 11062 + }, + { + "epoch": 0.22128, + "grad_norm": 2.25, + "grad_norm_var": 0.033878326416015625, + "learning_rate": 0.0001, + "loss": 4.2756, + "loss/crossentropy": 1.9315263032913208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20077265799045563, + "step": 11064 + }, + { + "epoch": 0.22132, + "grad_norm": 1.9375, + "grad_norm_var": 0.034795888264973956, + "learning_rate": 0.0001, + "loss": 4.2137, + "loss/crossentropy": 1.9278368949890137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2203744500875473, + "step": 11066 + }, + { + "epoch": 0.22136, + "grad_norm": 2.21875, + "grad_norm_var": 0.016812896728515624, + "learning_rate": 0.0001, + "loss": 4.38, + "loss/crossentropy": 1.9314138889312744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2024018093943596, + "step": 11068 + }, + { + "epoch": 0.2214, + "grad_norm": 1.9609375, + "grad_norm_var": 0.019618479410807292, + "learning_rate": 0.0001, + "loss": 3.9142, + "loss/crossentropy": 1.822297751903534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20530518889427185, + "step": 11070 + }, + { + "epoch": 0.22144, + "grad_norm": 2.09375, + "grad_norm_var": 0.02112401326497396, + "learning_rate": 0.0001, + "loss": 4.1412, + "loss/crossentropy": 2.0021358132362366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1963522955775261, + "step": 11072 + }, + { + "epoch": 0.22148, + "grad_norm": 2.15625, + "grad_norm_var": 0.018293253580729165, + "learning_rate": 0.0001, + "loss": 4.5595, + "loss/crossentropy": 2.3780601024627686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24999547004699707, + "step": 11074 + }, + { + "epoch": 0.22152, + "grad_norm": 2.109375, + "grad_norm_var": 0.01727472941080729, + "learning_rate": 0.0001, + "loss": 4.2572, + "loss/crossentropy": 1.8128371238708496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1940363049507141, + "step": 11076 + }, + { + "epoch": 0.22156, + "grad_norm": 2.09375, + "grad_norm_var": 0.0147857666015625, + "learning_rate": 0.0001, + "loss": 4.1388, + "loss/crossentropy": 1.836561381816864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19289480894804, + "step": 11078 + }, + { + "epoch": 0.2216, + "grad_norm": 2.03125, + "grad_norm_var": 0.0125640869140625, + "learning_rate": 0.0001, + "loss": 3.9891, + "loss/crossentropy": 2.203549385070801, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23240803182125092, + "step": 11080 + }, + { + "epoch": 0.22164, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0147857666015625, + "learning_rate": 0.0001, + "loss": 3.9827, + "loss/crossentropy": 2.126620829105377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21564993262290955, + "step": 11082 + }, + { + "epoch": 0.22168, + "grad_norm": 2.203125, + "grad_norm_var": 0.015672810872395835, + "learning_rate": 0.0001, + "loss": 4.6468, + "loss/crossentropy": 2.5914783477783203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23293393850326538, + "step": 11084 + }, + { + "epoch": 0.22172, + "grad_norm": 2.046875, + "grad_norm_var": 0.016752115885416665, + "learning_rate": 0.0001, + "loss": 4.0881, + "loss/crossentropy": 2.097515106201172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21570277214050293, + "step": 11086 + }, + { + "epoch": 0.22176, + "grad_norm": 2.203125, + "grad_norm_var": 0.01565526326497396, + "learning_rate": 0.0001, + "loss": 4.2469, + "loss/crossentropy": 1.9408356547355652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22177009284496307, + "step": 11088 + }, + { + "epoch": 0.2218, + "grad_norm": 2.171875, + "grad_norm_var": 0.014713287353515625, + "learning_rate": 0.0001, + "loss": 4.378, + "loss/crossentropy": 1.9851223826408386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20538930594921112, + "step": 11090 + }, + { + "epoch": 0.22184, + "grad_norm": 2.03125, + "grad_norm_var": 0.014694976806640624, + "learning_rate": 0.0001, + "loss": 4.2424, + "loss/crossentropy": 2.261754631996155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21294979751110077, + "step": 11092 + }, + { + "epoch": 0.22188, + "grad_norm": 2.015625, + "grad_norm_var": 0.014924875895182292, + "learning_rate": 0.0001, + "loss": 4.1847, + "loss/crossentropy": 1.978752851486206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21184594929218292, + "step": 11094 + }, + { + "epoch": 0.22192, + "grad_norm": 2.21875, + "grad_norm_var": 0.014918772379557292, + "learning_rate": 0.0001, + "loss": 4.515, + "loss/crossentropy": 2.1992926597595215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2428576499223709, + "step": 11096 + }, + { + "epoch": 0.22196, + "grad_norm": 2.078125, + "grad_norm_var": 0.010131581624348959, + "learning_rate": 0.0001, + "loss": 4.1475, + "loss/crossentropy": 1.9407767057418823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25351718813180923, + "step": 11098 + }, + { + "epoch": 0.222, + "grad_norm": 2.234375, + "grad_norm_var": 0.009993235270182291, + "learning_rate": 0.0001, + "loss": 4.2485, + "loss/crossentropy": 2.2973347902297974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2211253046989441, + "step": 11100 + }, + { + "epoch": 0.22204, + "grad_norm": 2.109375, + "grad_norm_var": 0.007380930582682291, + "learning_rate": 0.0001, + "loss": 4.466, + "loss/crossentropy": 2.323284387588501, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23907601833343506, + "step": 11102 + }, + { + "epoch": 0.22208, + "grad_norm": 2.21875, + "grad_norm_var": 0.00631103515625, + "learning_rate": 0.0001, + "loss": 4.3745, + "loss/crossentropy": 1.9557610750198364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20206698775291443, + "step": 11104 + }, + { + "epoch": 0.22212, + "grad_norm": 2.203125, + "grad_norm_var": 0.006083170572916667, + "learning_rate": 0.0001, + "loss": 4.1432, + "loss/crossentropy": 2.130094528198242, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22658853232860565, + "step": 11106 + }, + { + "epoch": 0.22216, + "grad_norm": 2.140625, + "grad_norm_var": 0.006078084309895833, + "learning_rate": 0.0001, + "loss": 4.0207, + "loss/crossentropy": 1.6461073160171509, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.183306485414505, + "step": 11108 + }, + { + "epoch": 0.2222, + "grad_norm": 2.265625, + "grad_norm_var": 0.008310699462890625, + "learning_rate": 0.0001, + "loss": 4.2901, + "loss/crossentropy": 1.9869291186332703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20724371075630188, + "step": 11110 + }, + { + "epoch": 0.22224, + "grad_norm": 2.15625, + "grad_norm_var": 0.007696278889973958, + "learning_rate": 0.0001, + "loss": 4.2361, + "loss/crossentropy": 2.314267873764038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22680803388357162, + "step": 11112 + }, + { + "epoch": 0.22228, + "grad_norm": 2.109375, + "grad_norm_var": 0.0104888916015625, + "learning_rate": 0.0001, + "loss": 4.37, + "loss/crossentropy": 2.061935067176819, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1931627094745636, + "step": 11114 + }, + { + "epoch": 0.22232, + "grad_norm": 2.03125, + "grad_norm_var": 0.009601847330729166, + "learning_rate": 0.0001, + "loss": 4.1174, + "loss/crossentropy": 2.157355546951294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2328970953822136, + "step": 11116 + }, + { + "epoch": 0.22236, + "grad_norm": 2.1875, + "grad_norm_var": 0.012214914957682291, + "learning_rate": 0.0001, + "loss": 4.2092, + "loss/crossentropy": 1.8968737125396729, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19509851932525635, + "step": 11118 + }, + { + "epoch": 0.2224, + "grad_norm": 2.15625, + "grad_norm_var": 0.013516998291015625, + "learning_rate": 0.0001, + "loss": 4.3433, + "loss/crossentropy": 2.1147825717926025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22639968246221542, + "step": 11120 + }, + { + "epoch": 0.22244, + "grad_norm": 1.8828125, + "grad_norm_var": 0.01519775390625, + "learning_rate": 0.0001, + "loss": 3.9818, + "loss/crossentropy": 1.9084222316741943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16847409307956696, + "step": 11122 + }, + { + "epoch": 0.22248, + "grad_norm": 2.0625, + "grad_norm_var": 0.015028635660807291, + "learning_rate": 0.0001, + "loss": 4.0188, + "loss/crossentropy": 1.8558747172355652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2013881430029869, + "step": 11124 + }, + { + "epoch": 0.22252, + "grad_norm": 2.203125, + "grad_norm_var": 0.015755208333333333, + "learning_rate": 0.0001, + "loss": 4.3236, + "loss/crossentropy": 2.2021052837371826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2129388153553009, + "step": 11126 + }, + { + "epoch": 0.22256, + "grad_norm": 1.8125, + "grad_norm_var": 0.019212849934895835, + "learning_rate": 0.0001, + "loss": 4.2892, + "loss/crossentropy": 2.267111897468567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22087457031011581, + "step": 11128 + }, + { + "epoch": 0.2226, + "grad_norm": 2.0625, + "grad_norm_var": 0.02021052042643229, + "learning_rate": 0.0001, + "loss": 4.3781, + "loss/crossentropy": 1.860244870185852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21286892145872116, + "step": 11130 + }, + { + "epoch": 0.22264, + "grad_norm": 2.046875, + "grad_norm_var": 0.021142578125, + "learning_rate": 0.0001, + "loss": 3.7832, + "loss/crossentropy": 1.8312670588493347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20548796653747559, + "step": 11132 + }, + { + "epoch": 0.22268, + "grad_norm": 2.015625, + "grad_norm_var": 0.019087473551432293, + "learning_rate": 0.0001, + "loss": 4.1916, + "loss/crossentropy": 2.2309051752090454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2244933694601059, + "step": 11134 + }, + { + "epoch": 0.22272, + "grad_norm": 2.09375, + "grad_norm_var": 0.01761449178059896, + "learning_rate": 0.0001, + "loss": 4.1125, + "loss/crossentropy": 2.1003236770629883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22222542017698288, + "step": 11136 + }, + { + "epoch": 0.22276, + "grad_norm": 1.953125, + "grad_norm_var": 0.015965779622395832, + "learning_rate": 0.0001, + "loss": 3.9391, + "loss/crossentropy": 1.95048588514328, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20614230632781982, + "step": 11138 + }, + { + "epoch": 0.2228, + "grad_norm": 1.9765625, + "grad_norm_var": 0.016364542643229167, + "learning_rate": 0.0001, + "loss": 4.1608, + "loss/crossentropy": 2.2856240272521973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23748096823692322, + "step": 11140 + }, + { + "epoch": 0.22284, + "grad_norm": 2.03125, + "grad_norm_var": 0.012455240885416666, + "learning_rate": 0.0001, + "loss": 4.2187, + "loss/crossentropy": 2.3873090744018555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23743586987257004, + "step": 11142 + }, + { + "epoch": 0.22288, + "grad_norm": 2.09375, + "grad_norm_var": 0.008234659830729166, + "learning_rate": 0.0001, + "loss": 4.1283, + "loss/crossentropy": 2.248707115650177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22035827487707138, + "step": 11144 + }, + { + "epoch": 0.22292, + "grad_norm": 2.09375, + "grad_norm_var": 0.0067860921223958336, + "learning_rate": 0.0001, + "loss": 4.3561, + "loss/crossentropy": 1.9792630672454834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2102261707186699, + "step": 11146 + }, + { + "epoch": 0.22296, + "grad_norm": 2.078125, + "grad_norm_var": 0.006156158447265625, + "learning_rate": 0.0001, + "loss": 4.0479, + "loss/crossentropy": 2.284587323665619, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20941253006458282, + "step": 11148 + }, + { + "epoch": 0.223, + "grad_norm": 2.03125, + "grad_norm_var": 0.006361643473307292, + "learning_rate": 0.0001, + "loss": 4.1792, + "loss/crossentropy": 1.9800288677215576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20009202510118484, + "step": 11150 + }, + { + "epoch": 0.22304, + "grad_norm": 2.046875, + "grad_norm_var": 0.0064165751139322914, + "learning_rate": 0.0001, + "loss": 4.1859, + "loss/crossentropy": 1.925826370716095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18616148829460144, + "step": 11152 + }, + { + "epoch": 0.22308, + "grad_norm": 2.046875, + "grad_norm_var": 0.0053708394368489586, + "learning_rate": 0.0001, + "loss": 3.9281, + "loss/crossentropy": 2.158196806907654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21486472338438034, + "step": 11154 + }, + { + "epoch": 0.22312, + "grad_norm": 2.078125, + "grad_norm_var": 0.004393513997395833, + "learning_rate": 0.0001, + "loss": 4.3342, + "loss/crossentropy": 2.0873841047286987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22101643681526184, + "step": 11156 + }, + { + "epoch": 0.22316, + "grad_norm": 2.0625, + "grad_norm_var": 0.0044748942057291664, + "learning_rate": 0.0001, + "loss": 4.3772, + "loss/crossentropy": 1.9485042691230774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19757962226867676, + "step": 11158 + }, + { + "epoch": 0.2232, + "grad_norm": 2.0625, + "grad_norm_var": 0.004279581705729166, + "learning_rate": 0.0001, + "loss": 4.331, + "loss/crossentropy": 2.055552661418915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19271192699670792, + "step": 11160 + }, + { + "epoch": 0.22324, + "grad_norm": 2.046875, + "grad_norm_var": 0.0033599853515625, + "learning_rate": 0.0001, + "loss": 4.0958, + "loss/crossentropy": 1.9115247130393982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1922249048948288, + "step": 11162 + }, + { + "epoch": 0.22328, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0043609619140625, + "learning_rate": 0.0001, + "loss": 4.2582, + "loss/crossentropy": 2.1461042761802673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2185581848025322, + "step": 11164 + }, + { + "epoch": 0.22332, + "grad_norm": 2.109375, + "grad_norm_var": 0.004808553059895833, + "learning_rate": 0.0001, + "loss": 4.1212, + "loss/crossentropy": 2.1625128984451294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20505116879940033, + "step": 11166 + }, + { + "epoch": 0.22336, + "grad_norm": 2.109375, + "grad_norm_var": 0.003692372639973958, + "learning_rate": 0.0001, + "loss": 4.0317, + "loss/crossentropy": 1.9978403449058533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2015106976032257, + "step": 11168 + }, + { + "epoch": 0.2234, + "grad_norm": 2.109375, + "grad_norm_var": 0.003794097900390625, + "learning_rate": 0.0001, + "loss": 4.0108, + "loss/crossentropy": 1.9462909698486328, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20814144611358643, + "step": 11170 + }, + { + "epoch": 0.22344, + "grad_norm": 1.9453125, + "grad_norm_var": 0.004423014322916667, + "learning_rate": 0.0001, + "loss": 4.2147, + "loss/crossentropy": 2.262490153312683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21692577749490738, + "step": 11172 + }, + { + "epoch": 0.22348, + "grad_norm": 2.140625, + "grad_norm_var": 0.0052734375, + "learning_rate": 0.0001, + "loss": 4.4079, + "loss/crossentropy": 2.092001974582672, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22943469882011414, + "step": 11174 + }, + { + "epoch": 0.22352, + "grad_norm": 2.09375, + "grad_norm_var": 0.0052734375, + "learning_rate": 0.0001, + "loss": 4.273, + "loss/crossentropy": 2.302329421043396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21188674122095108, + "step": 11176 + }, + { + "epoch": 0.22356, + "grad_norm": 1.9765625, + "grad_norm_var": 0.008701324462890625, + "learning_rate": 0.0001, + "loss": 4.0762, + "loss/crossentropy": 1.7181463837623596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19460663199424744, + "step": 11178 + }, + { + "epoch": 0.2236, + "grad_norm": 2.046875, + "grad_norm_var": 0.007895660400390626, + "learning_rate": 0.0001, + "loss": 4.0061, + "loss/crossentropy": 1.8402328491210938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1928708627820015, + "step": 11180 + }, + { + "epoch": 0.22364, + "grad_norm": 2.21875, + "grad_norm_var": 0.008103179931640624, + "learning_rate": 0.0001, + "loss": 4.2968, + "loss/crossentropy": 1.9490735530853271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2492925077676773, + "step": 11182 + }, + { + "epoch": 0.22368, + "grad_norm": 2.015625, + "grad_norm_var": 0.00867919921875, + "learning_rate": 0.0001, + "loss": 4.113, + "loss/crossentropy": 2.367197036743164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25882700830698013, + "step": 11184 + }, + { + "epoch": 0.22372, + "grad_norm": 2.015625, + "grad_norm_var": 0.0090972900390625, + "learning_rate": 0.0001, + "loss": 4.0782, + "loss/crossentropy": 2.1192296743392944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2208312824368477, + "step": 11186 + }, + { + "epoch": 0.22376, + "grad_norm": 2.296875, + "grad_norm_var": 0.010772450764973959, + "learning_rate": 0.0001, + "loss": 4.7239, + "loss/crossentropy": 2.2150460481643677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21440500020980835, + "step": 11188 + }, + { + "epoch": 0.2238, + "grad_norm": 2.203125, + "grad_norm_var": 0.0121490478515625, + "learning_rate": 0.0001, + "loss": 4.4245, + "loss/crossentropy": 2.0837016105651855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2777148336172104, + "step": 11190 + }, + { + "epoch": 0.22384, + "grad_norm": 1.9609375, + "grad_norm_var": 0.014240519205729166, + "learning_rate": 0.0001, + "loss": 3.918, + "loss/crossentropy": 1.7995057106018066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18295922130346298, + "step": 11192 + }, + { + "epoch": 0.22388, + "grad_norm": 2.1875, + "grad_norm_var": 0.014357248942057291, + "learning_rate": 0.0001, + "loss": 4.3361, + "loss/crossentropy": 2.2150347232818604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24010418355464935, + "step": 11194 + }, + { + "epoch": 0.22392, + "grad_norm": 2.140625, + "grad_norm_var": 0.014357248942057291, + "learning_rate": 0.0001, + "loss": 4.1711, + "loss/crossentropy": 2.1161770820617676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2124250829219818, + "step": 11196 + }, + { + "epoch": 0.22396, + "grad_norm": 2.0625, + "grad_norm_var": 0.013578033447265625, + "learning_rate": 0.0001, + "loss": 4.4875, + "loss/crossentropy": 2.224100112915039, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23966734111309052, + "step": 11198 + }, + { + "epoch": 0.224, + "grad_norm": 2.09375, + "grad_norm_var": 0.011921946207682292, + "learning_rate": 0.0001, + "loss": 4.377, + "loss/crossentropy": 2.196989417076111, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21709956228733063, + "step": 11200 + }, + { + "epoch": 0.22404, + "grad_norm": 2.03125, + "grad_norm_var": 0.011574045817057291, + "learning_rate": 0.0001, + "loss": 4.0457, + "loss/crossentropy": 1.6592280864715576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1827787384390831, + "step": 11202 + }, + { + "epoch": 0.22408, + "grad_norm": 2.109375, + "grad_norm_var": 0.008790842692057292, + "learning_rate": 0.0001, + "loss": 4.0564, + "loss/crossentropy": 1.9689467549324036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20387520641088486, + "step": 11204 + }, + { + "epoch": 0.22412, + "grad_norm": 2.8125, + "grad_norm_var": 0.03997802734375, + "learning_rate": 0.0001, + "loss": 4.3494, + "loss/crossentropy": 2.1846379041671753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2068176046013832, + "step": 11206 + }, + { + "epoch": 0.22416, + "grad_norm": 2.0, + "grad_norm_var": 0.037353515625, + "learning_rate": 0.0001, + "loss": 4.3948, + "loss/crossentropy": 1.8361602425575256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22452392429113388, + "step": 11208 + }, + { + "epoch": 0.2242, + "grad_norm": 1.9375, + "grad_norm_var": 0.0390625, + "learning_rate": 0.0001, + "loss": 4.3597, + "loss/crossentropy": 2.5658172369003296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2602368891239166, + "step": 11210 + }, + { + "epoch": 0.22424, + "grad_norm": 1.9296875, + "grad_norm_var": 0.04119440714518229, + "learning_rate": 0.0001, + "loss": 4.0555, + "loss/crossentropy": 1.8167916536331177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20330622047185898, + "step": 11212 + }, + { + "epoch": 0.22428, + "grad_norm": 2.140625, + "grad_norm_var": 0.04070612589518229, + "learning_rate": 0.0001, + "loss": 4.2374, + "loss/crossentropy": 1.9151161313056946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20695462822914124, + "step": 11214 + }, + { + "epoch": 0.22432, + "grad_norm": 1.8515625, + "grad_norm_var": 0.04528401692708333, + "learning_rate": 0.0001, + "loss": 3.9807, + "loss/crossentropy": 1.9590752720832825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1945583075284958, + "step": 11216 + }, + { + "epoch": 0.22436, + "grad_norm": 2.0625, + "grad_norm_var": 0.04522298177083333, + "learning_rate": 0.0001, + "loss": 4.2803, + "loss/crossentropy": 1.9777710437774658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20104076713323593, + "step": 11218 + }, + { + "epoch": 0.2244, + "grad_norm": 1.9296875, + "grad_norm_var": 0.04744847615559896, + "learning_rate": 0.0001, + "loss": 4.0279, + "loss/crossentropy": 1.725940465927124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19038556516170502, + "step": 11220 + }, + { + "epoch": 0.22444, + "grad_norm": 2.203125, + "grad_norm_var": 0.012963612874348959, + "learning_rate": 0.0001, + "loss": 4.4067, + "loss/crossentropy": 2.505728602409363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2434321641921997, + "step": 11222 + }, + { + "epoch": 0.22448, + "grad_norm": 2.03125, + "grad_norm_var": 0.010518137613932292, + "learning_rate": 0.0001, + "loss": 4.2902, + "loss/crossentropy": 2.0733951330184937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21223794668912888, + "step": 11224 + }, + { + "epoch": 0.22452, + "grad_norm": 2.03125, + "grad_norm_var": 0.010628255208333333, + "learning_rate": 0.0001, + "loss": 4.1548, + "loss/crossentropy": 2.105073928833008, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22444891929626465, + "step": 11226 + }, + { + "epoch": 0.22456, + "grad_norm": 2.109375, + "grad_norm_var": 0.009745025634765625, + "learning_rate": 0.0001, + "loss": 4.2361, + "loss/crossentropy": 1.9164994359016418, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20893585681915283, + "step": 11228 + }, + { + "epoch": 0.2246, + "grad_norm": 2.328125, + "grad_norm_var": 0.014902496337890625, + "learning_rate": 0.0001, + "loss": 4.3753, + "loss/crossentropy": 2.0922536849975586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22757402062416077, + "step": 11230 + }, + { + "epoch": 0.22464, + "grad_norm": 2.015625, + "grad_norm_var": 0.01190185546875, + "learning_rate": 0.0001, + "loss": 3.9791, + "loss/crossentropy": 1.886056363582611, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2054021805524826, + "step": 11232 + }, + { + "epoch": 0.22468, + "grad_norm": 1.9609375, + "grad_norm_var": 0.014300282796223958, + "learning_rate": 0.0001, + "loss": 3.9194, + "loss/crossentropy": 2.028389871120453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2109459862112999, + "step": 11234 + }, + { + "epoch": 0.22472, + "grad_norm": 2.078125, + "grad_norm_var": 0.012813313802083334, + "learning_rate": 0.0001, + "loss": 4.2084, + "loss/crossentropy": 1.9590765833854675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19774210453033447, + "step": 11236 + }, + { + "epoch": 0.22476, + "grad_norm": 2.03125, + "grad_norm_var": 0.012800852457682291, + "learning_rate": 0.0001, + "loss": 4.0153, + "loss/crossentropy": 1.8284756541252136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17485490441322327, + "step": 11238 + }, + { + "epoch": 0.2248, + "grad_norm": 2.203125, + "grad_norm_var": 0.014625803629557291, + "learning_rate": 0.0001, + "loss": 4.3494, + "loss/crossentropy": 2.1119225025177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22235971689224243, + "step": 11240 + }, + { + "epoch": 0.22484, + "grad_norm": 2.1875, + "grad_norm_var": 0.015209706624348958, + "learning_rate": 0.0001, + "loss": 4.3303, + "loss/crossentropy": 2.3050538301467896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2249491587281227, + "step": 11242 + }, + { + "epoch": 0.22488, + "grad_norm": 2.0625, + "grad_norm_var": 0.015360514322916666, + "learning_rate": 0.0001, + "loss": 4.1235, + "loss/crossentropy": 1.9806398153305054, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20141054689884186, + "step": 11244 + }, + { + "epoch": 0.22492, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007879384358723958, + "learning_rate": 0.0001, + "loss": 4.0793, + "loss/crossentropy": 2.0702012181282043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1980852484703064, + "step": 11246 + }, + { + "epoch": 0.22496, + "grad_norm": 2.0625, + "grad_norm_var": 0.008113606770833334, + "learning_rate": 0.0001, + "loss": 3.9997, + "loss/crossentropy": 2.205365300178528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22107956558465958, + "step": 11248 + }, + { + "epoch": 0.225, + "grad_norm": 2.890625, + "grad_norm_var": 0.05272598266601562, + "learning_rate": 0.0001, + "loss": 4.0748, + "loss/crossentropy": 2.196021556854248, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.199070006608963, + "step": 11250 + }, + { + "epoch": 0.22504, + "grad_norm": 2.140625, + "grad_norm_var": 0.05241673787434896, + "learning_rate": 0.0001, + "loss": 4.2062, + "loss/crossentropy": 2.0661864280700684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22117173671722412, + "step": 11252 + }, + { + "epoch": 0.22508, + "grad_norm": 2.15625, + "grad_norm_var": 0.050687408447265624, + "learning_rate": 0.0001, + "loss": 3.9176, + "loss/crossentropy": 1.825901210308075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.174439437687397, + "step": 11254 + }, + { + "epoch": 0.22512, + "grad_norm": 2.3125, + "grad_norm_var": 0.061470286051432295, + "learning_rate": 0.0001, + "loss": 4.7052, + "loss/crossentropy": 2.3710498809814453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2357504665851593, + "step": 11256 + }, + { + "epoch": 0.22516, + "grad_norm": 1.859375, + "grad_norm_var": 0.06520182291666667, + "learning_rate": 0.0001, + "loss": 3.9419, + "loss/crossentropy": 2.1523157358169556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2179568186402321, + "step": 11258 + }, + { + "epoch": 0.2252, + "grad_norm": 2.0, + "grad_norm_var": 0.06559015909830729, + "learning_rate": 0.0001, + "loss": 4.1953, + "loss/crossentropy": 2.0974292755126953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1980036199092865, + "step": 11260 + }, + { + "epoch": 0.22524, + "grad_norm": 2.234375, + "grad_norm_var": 0.06339925130208333, + "learning_rate": 0.0001, + "loss": 4.4409, + "loss/crossentropy": 2.2679883241653442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22085320204496384, + "step": 11262 + }, + { + "epoch": 0.22528, + "grad_norm": 2.109375, + "grad_norm_var": 0.06108373006184896, + "learning_rate": 0.0001, + "loss": 4.2972, + "loss/crossentropy": 2.0697352290153503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21976519376039505, + "step": 11264 + }, + { + "epoch": 0.22532, + "grad_norm": 2.03125, + "grad_norm_var": 0.025465647379557293, + "learning_rate": 0.0001, + "loss": 4.4149, + "loss/crossentropy": 2.1807767748832703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21304857730865479, + "step": 11266 + }, + { + "epoch": 0.22536, + "grad_norm": 2.140625, + "grad_norm_var": 0.02541071573893229, + "learning_rate": 0.0001, + "loss": 4.4395, + "loss/crossentropy": 2.178891122341156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22129195928573608, + "step": 11268 + }, + { + "epoch": 0.2254, + "grad_norm": 2.109375, + "grad_norm_var": 0.023607381184895835, + "learning_rate": 0.0001, + "loss": 4.2908, + "loss/crossentropy": 2.1025387048721313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20581847429275513, + "step": 11270 + }, + { + "epoch": 0.22544, + "grad_norm": 1.8671875, + "grad_norm_var": 0.013055165608723959, + "learning_rate": 0.0001, + "loss": 4.2192, + "loss/crossentropy": 1.8710424900054932, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1900918111205101, + "step": 11272 + }, + { + "epoch": 0.22548, + "grad_norm": 2.296875, + "grad_norm_var": 0.012373606363932291, + "learning_rate": 0.0001, + "loss": 4.498, + "loss/crossentropy": 1.8837561011314392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21019018441438675, + "step": 11274 + }, + { + "epoch": 0.22552, + "grad_norm": 1.984375, + "grad_norm_var": 0.013132476806640625, + "learning_rate": 0.0001, + "loss": 4.3139, + "loss/crossentropy": 2.2795380353927612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23858627676963806, + "step": 11276 + }, + { + "epoch": 0.22556, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0158111572265625, + "learning_rate": 0.0001, + "loss": 4.0532, + "loss/crossentropy": 2.0241262316703796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1990864798426628, + "step": 11278 + }, + { + "epoch": 0.2256, + "grad_norm": 2.40625, + "grad_norm_var": 0.022541300455729166, + "learning_rate": 0.0001, + "loss": 4.2935, + "loss/crossentropy": 1.9307058453559875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2100597321987152, + "step": 11280 + }, + { + "epoch": 0.22564, + "grad_norm": 2.171875, + "grad_norm_var": 0.020414225260416665, + "learning_rate": 0.0001, + "loss": 4.2523, + "loss/crossentropy": 2.08352792263031, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21930356323719025, + "step": 11282 + }, + { + "epoch": 0.22568, + "grad_norm": 2.171875, + "grad_norm_var": 0.021361287434895834, + "learning_rate": 0.0001, + "loss": 4.4277, + "loss/crossentropy": 2.393290877342224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2560981214046478, + "step": 11284 + }, + { + "epoch": 0.22572, + "grad_norm": 2.0, + "grad_norm_var": 0.023738606770833334, + "learning_rate": 0.0001, + "loss": 4.1612, + "loss/crossentropy": 2.20097017288208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20318082720041275, + "step": 11286 + }, + { + "epoch": 0.22576, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0221099853515625, + "learning_rate": 0.0001, + "loss": 4.2307, + "loss/crossentropy": 2.031981647014618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20313747972249985, + "step": 11288 + }, + { + "epoch": 0.2258, + "grad_norm": 2.078125, + "grad_norm_var": 0.0187255859375, + "learning_rate": 0.0001, + "loss": 4.3513, + "loss/crossentropy": 2.197006046772003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2301759570837021, + "step": 11290 + }, + { + "epoch": 0.22584, + "grad_norm": 2.0625, + "grad_norm_var": 0.0197021484375, + "learning_rate": 0.0001, + "loss": 4.2903, + "loss/crossentropy": 2.294014096260071, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21280069649219513, + "step": 11292 + }, + { + "epoch": 0.22588, + "grad_norm": 1.9921875, + "grad_norm_var": 0.018089803059895833, + "learning_rate": 0.0001, + "loss": 4.0985, + "loss/crossentropy": 2.0551719665527344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20910422503948212, + "step": 11294 + }, + { + "epoch": 0.22592, + "grad_norm": 2.109375, + "grad_norm_var": 0.010636393229166667, + "learning_rate": 0.0001, + "loss": 4.2109, + "loss/crossentropy": 2.1044358015060425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20463456213474274, + "step": 11296 + }, + { + "epoch": 0.22596, + "grad_norm": 2.265625, + "grad_norm_var": 0.012631988525390625, + "learning_rate": 0.0001, + "loss": 4.198, + "loss/crossentropy": 2.1013529300689697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21202504634857178, + "step": 11298 + }, + { + "epoch": 0.226, + "grad_norm": 2.046875, + "grad_norm_var": 0.009329986572265626, + "learning_rate": 0.0001, + "loss": 4.4227, + "loss/crossentropy": 2.1887649297714233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2048492729663849, + "step": 11300 + }, + { + "epoch": 0.22604, + "grad_norm": 2.015625, + "grad_norm_var": 0.009525299072265625, + "learning_rate": 0.0001, + "loss": 4.0952, + "loss/crossentropy": 1.8243364691734314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2013952136039734, + "step": 11302 + }, + { + "epoch": 0.22608, + "grad_norm": 2.09375, + "grad_norm_var": 0.0137451171875, + "learning_rate": 0.0001, + "loss": 4.4636, + "loss/crossentropy": 2.2550541162490845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2493698000907898, + "step": 11304 + }, + { + "epoch": 0.22612, + "grad_norm": 1.953125, + "grad_norm_var": 0.0149169921875, + "learning_rate": 0.0001, + "loss": 3.9407, + "loss/crossentropy": 1.8744492530822754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19957631081342697, + "step": 11306 + }, + { + "epoch": 0.22616, + "grad_norm": 2.015625, + "grad_norm_var": 0.013681793212890625, + "learning_rate": 0.0001, + "loss": 4.0717, + "loss/crossentropy": 2.026526629924774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20474696904420853, + "step": 11308 + }, + { + "epoch": 0.2262, + "grad_norm": 1.9453125, + "grad_norm_var": 0.015881093343098958, + "learning_rate": 0.0001, + "loss": 4.3898, + "loss/crossentropy": 2.3802725076675415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.247590571641922, + "step": 11310 + }, + { + "epoch": 0.22624, + "grad_norm": 2.015625, + "grad_norm_var": 0.016123199462890626, + "learning_rate": 0.0001, + "loss": 4.171, + "loss/crossentropy": 1.9359605312347412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.207523413002491, + "step": 11312 + }, + { + "epoch": 0.22628, + "grad_norm": 2.046875, + "grad_norm_var": 0.0129547119140625, + "learning_rate": 0.0001, + "loss": 4.1634, + "loss/crossentropy": 2.3035311698913574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21815945208072662, + "step": 11314 + }, + { + "epoch": 0.22632, + "grad_norm": 2.234375, + "grad_norm_var": 0.0225341796875, + "learning_rate": 0.0001, + "loss": 4.3189, + "loss/crossentropy": 1.8083779215812683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20705801248550415, + "step": 11316 + }, + { + "epoch": 0.22636, + "grad_norm": 2.203125, + "grad_norm_var": 0.0205718994140625, + "learning_rate": 0.0001, + "loss": 4.2912, + "loss/crossentropy": 2.306682825088501, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22666794806718826, + "step": 11318 + }, + { + "epoch": 0.2264, + "grad_norm": 2.0, + "grad_norm_var": 0.01737060546875, + "learning_rate": 0.0001, + "loss": 4.0519, + "loss/crossentropy": 1.809365153312683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20744331926107407, + "step": 11320 + }, + { + "epoch": 0.22644, + "grad_norm": 2.265625, + "grad_norm_var": 0.017894490559895834, + "learning_rate": 0.0001, + "loss": 4.2428, + "loss/crossentropy": 2.289853572845459, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23791835457086563, + "step": 11322 + }, + { + "epoch": 0.22648, + "grad_norm": 2.53125, + "grad_norm_var": 0.027341461181640624, + "learning_rate": 0.0001, + "loss": 4.5668, + "loss/crossentropy": 2.0609280467033386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2166810780763626, + "step": 11324 + }, + { + "epoch": 0.22652, + "grad_norm": 2.0625, + "grad_norm_var": 0.02535400390625, + "learning_rate": 0.0001, + "loss": 4.0567, + "loss/crossentropy": 2.1174912452697754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20515341311693192, + "step": 11326 + }, + { + "epoch": 0.22656, + "grad_norm": 1.890625, + "grad_norm_var": 0.029319000244140626, + "learning_rate": 0.0001, + "loss": 4.0318, + "loss/crossentropy": 1.99192476272583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20127686113119125, + "step": 11328 + }, + { + "epoch": 0.2266, + "grad_norm": 1.9921875, + "grad_norm_var": 0.031060536702473957, + "learning_rate": 0.0001, + "loss": 3.9552, + "loss/crossentropy": 1.7805609107017517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17500489950180054, + "step": 11330 + }, + { + "epoch": 0.22664, + "grad_norm": 2.171875, + "grad_norm_var": 0.02398656209309896, + "learning_rate": 0.0001, + "loss": 4.1417, + "loss/crossentropy": 2.024085283279419, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20659280568361282, + "step": 11332 + }, + { + "epoch": 0.22668, + "grad_norm": 1.9609375, + "grad_norm_var": 0.023631795247395834, + "learning_rate": 0.0001, + "loss": 4.2674, + "loss/crossentropy": 2.174618899822235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23250284045934677, + "step": 11334 + }, + { + "epoch": 0.22672, + "grad_norm": 1.9375, + "grad_norm_var": 0.02451171875, + "learning_rate": 0.0001, + "loss": 4.213, + "loss/crossentropy": 1.886943757534027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18855806440114975, + "step": 11336 + }, + { + "epoch": 0.22676, + "grad_norm": 1.9609375, + "grad_norm_var": 0.02182184855143229, + "learning_rate": 0.0001, + "loss": 4.1281, + "loss/crossentropy": 2.060371160507202, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1951771154999733, + "step": 11338 + }, + { + "epoch": 0.2268, + "grad_norm": 2.046875, + "grad_norm_var": 0.005576324462890625, + "learning_rate": 0.0001, + "loss": 4.1692, + "loss/crossentropy": 2.15705668926239, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23738030344247818, + "step": 11340 + }, + { + "epoch": 0.22684, + "grad_norm": 2.015625, + "grad_norm_var": 0.005576324462890625, + "learning_rate": 0.0001, + "loss": 4.2096, + "loss/crossentropy": 2.1922959089279175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24581187963485718, + "step": 11342 + }, + { + "epoch": 0.22688, + "grad_norm": 2.0, + "grad_norm_var": 0.007303873697916667, + "learning_rate": 0.0001, + "loss": 3.9786, + "loss/crossentropy": 2.1590365171432495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20936457812786102, + "step": 11344 + }, + { + "epoch": 0.22692, + "grad_norm": 2.53125, + "grad_norm_var": 0.020783487955729166, + "learning_rate": 0.0001, + "loss": 4.2382, + "loss/crossentropy": 1.8087702989578247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18874070048332214, + "step": 11346 + }, + { + "epoch": 0.22696, + "grad_norm": 2.203125, + "grad_norm_var": 0.021732584635416666, + "learning_rate": 0.0001, + "loss": 4.4981, + "loss/crossentropy": 2.5205971002578735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24057865887880325, + "step": 11348 + }, + { + "epoch": 0.227, + "grad_norm": 2.078125, + "grad_norm_var": 0.02061945597330729, + "learning_rate": 0.0001, + "loss": 4.16, + "loss/crossentropy": 2.114508092403412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20631257444620132, + "step": 11350 + }, + { + "epoch": 0.22704, + "grad_norm": 2.21875, + "grad_norm_var": 0.020881144205729167, + "learning_rate": 0.0001, + "loss": 4.4344, + "loss/crossentropy": 2.385537028312683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.233476921916008, + "step": 11352 + }, + { + "epoch": 0.22708, + "grad_norm": 2.21875, + "grad_norm_var": 0.0217926025390625, + "learning_rate": 0.0001, + "loss": 4.3335, + "loss/crossentropy": 2.4251039028167725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24502182751893997, + "step": 11354 + }, + { + "epoch": 0.22712, + "grad_norm": 2.078125, + "grad_norm_var": 0.025690714518229168, + "learning_rate": 0.0001, + "loss": 4.345, + "loss/crossentropy": 2.278030514717102, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22328510880470276, + "step": 11356 + }, + { + "epoch": 0.22716, + "grad_norm": 2.03125, + "grad_norm_var": 0.025031534830729167, + "learning_rate": 0.0001, + "loss": 4.2835, + "loss/crossentropy": 1.7373265027999878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17635879665613174, + "step": 11358 + }, + { + "epoch": 0.2272, + "grad_norm": 2.25, + "grad_norm_var": 0.024461873372395835, + "learning_rate": 0.0001, + "loss": 4.3644, + "loss/crossentropy": 2.1546722650527954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22285740077495575, + "step": 11360 + }, + { + "epoch": 0.22724, + "grad_norm": 2.140625, + "grad_norm_var": 0.0143218994140625, + "learning_rate": 0.0001, + "loss": 4.5415, + "loss/crossentropy": 2.341711401939392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23693612217903137, + "step": 11362 + }, + { + "epoch": 0.22728, + "grad_norm": 2.65625, + "grad_norm_var": 0.0461090087890625, + "learning_rate": 0.0001, + "loss": 4.3131, + "loss/crossentropy": 1.8764930367469788, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19202134013175964, + "step": 11364 + }, + { + "epoch": 0.22732, + "grad_norm": 2.0625, + "grad_norm_var": 0.0450439453125, + "learning_rate": 0.0001, + "loss": 4.1172, + "loss/crossentropy": 2.035600185394287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2188200280070305, + "step": 11366 + }, + { + "epoch": 0.22736, + "grad_norm": 2.015625, + "grad_norm_var": 0.045481109619140626, + "learning_rate": 0.0001, + "loss": 4.0469, + "loss/crossentropy": 2.120850682258606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22664117813110352, + "step": 11368 + }, + { + "epoch": 0.2274, + "grad_norm": 2.03125, + "grad_norm_var": 0.050455729166666664, + "learning_rate": 0.0001, + "loss": 3.9552, + "loss/crossentropy": 1.9753262996673584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20000861585140228, + "step": 11370 + }, + { + "epoch": 0.22744, + "grad_norm": 2.703125, + "grad_norm_var": 0.06503499348958333, + "learning_rate": 0.0001, + "loss": 4.2474, + "loss/crossentropy": 2.0264564156532288, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2026129513978958, + "step": 11372 + }, + { + "epoch": 0.22748, + "grad_norm": 1.890625, + "grad_norm_var": 0.07055562337239583, + "learning_rate": 0.0001, + "loss": 3.8546, + "loss/crossentropy": 1.9269734025001526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22436296939849854, + "step": 11374 + }, + { + "epoch": 0.22752, + "grad_norm": 2.09375, + "grad_norm_var": 0.07017822265625, + "learning_rate": 0.0001, + "loss": 4.4164, + "loss/crossentropy": 2.1881991624832153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22483092546463013, + "step": 11376 + }, + { + "epoch": 0.22756, + "grad_norm": 2.015625, + "grad_norm_var": 0.07446187337239583, + "learning_rate": 0.0001, + "loss": 4.0277, + "loss/crossentropy": 2.0142401456832886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21913451701402664, + "step": 11378 + }, + { + "epoch": 0.2276, + "grad_norm": 2.09375, + "grad_norm_var": 0.05676676432291667, + "learning_rate": 0.0001, + "loss": 4.3374, + "loss/crossentropy": 2.0229859352111816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.238722562789917, + "step": 11380 + }, + { + "epoch": 0.22764, + "grad_norm": 1.9921875, + "grad_norm_var": 0.05724054972330729, + "learning_rate": 0.0001, + "loss": 4.0223, + "loss/crossentropy": 2.043630540370941, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21258120238780975, + "step": 11382 + }, + { + "epoch": 0.22768, + "grad_norm": 2.109375, + "grad_norm_var": 0.06020685831705729, + "learning_rate": 0.0001, + "loss": 4.0752, + "loss/crossentropy": 2.702946662902832, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24362115561962128, + "step": 11384 + }, + { + "epoch": 0.22772, + "grad_norm": 2.046875, + "grad_norm_var": 0.05449600219726562, + "learning_rate": 0.0001, + "loss": 3.9806, + "loss/crossentropy": 2.243411898612976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22256402671337128, + "step": 11386 + }, + { + "epoch": 0.22776, + "grad_norm": 2.078125, + "grad_norm_var": 0.033699289957682295, + "learning_rate": 0.0001, + "loss": 3.7808, + "loss/crossentropy": 2.1451956033706665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21471337974071503, + "step": 11388 + }, + { + "epoch": 0.2278, + "grad_norm": 2.140625, + "grad_norm_var": 0.030775705973307293, + "learning_rate": 0.0001, + "loss": 4.2865, + "loss/crossentropy": 2.426279664039612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22846391052007675, + "step": 11390 + }, + { + "epoch": 0.22784, + "grad_norm": 2.15625, + "grad_norm_var": 0.033455149332682295, + "learning_rate": 0.0001, + "loss": 4.2878, + "loss/crossentropy": 2.0890655517578125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23008010536432266, + "step": 11392 + }, + { + "epoch": 0.22788, + "grad_norm": 2.046875, + "grad_norm_var": 0.030452219645182292, + "learning_rate": 0.0001, + "loss": 4.2124, + "loss/crossentropy": 2.3496296405792236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23718956112861633, + "step": 11394 + }, + { + "epoch": 0.22792, + "grad_norm": 2.171875, + "grad_norm_var": 0.013152821858723959, + "learning_rate": 0.0001, + "loss": 4.0025, + "loss/crossentropy": 2.02141535282135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21102941036224365, + "step": 11396 + }, + { + "epoch": 0.22796, + "grad_norm": 2.078125, + "grad_norm_var": 0.01275634765625, + "learning_rate": 0.0001, + "loss": 4.4817, + "loss/crossentropy": 2.213461995124817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21838735044002533, + "step": 11398 + }, + { + "epoch": 0.228, + "grad_norm": 2.0625, + "grad_norm_var": 0.11743062337239583, + "learning_rate": 0.0001, + "loss": 4.2313, + "loss/crossentropy": 2.161481499671936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24635545909404755, + "step": 11400 + }, + { + "epoch": 0.22804, + "grad_norm": 2.125, + "grad_norm_var": 0.1158843994140625, + "learning_rate": 0.0001, + "loss": 4.5372, + "loss/crossentropy": 2.0998951196670532, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2205812931060791, + "step": 11402 + }, + { + "epoch": 0.22808, + "grad_norm": 2.0625, + "grad_norm_var": 0.11588109334309896, + "learning_rate": 0.0001, + "loss": 4.1606, + "loss/crossentropy": 1.8785207867622375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20330313593149185, + "step": 11404 + }, + { + "epoch": 0.22812, + "grad_norm": 2.03125, + "grad_norm_var": 0.11588109334309896, + "learning_rate": 0.0001, + "loss": 4.2408, + "loss/crossentropy": 2.254656672477722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.220963753759861, + "step": 11406 + }, + { + "epoch": 0.22816, + "grad_norm": 2.078125, + "grad_norm_var": 0.11553726196289063, + "learning_rate": 0.0001, + "loss": 4.1356, + "loss/crossentropy": 1.7936404347419739, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1878006011247635, + "step": 11408 + }, + { + "epoch": 0.2282, + "grad_norm": 2.453125, + "grad_norm_var": 0.12290445963541667, + "learning_rate": 0.0001, + "loss": 4.2199, + "loss/crossentropy": 1.8537682890892029, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1893099844455719, + "step": 11410 + }, + { + "epoch": 0.22824, + "grad_norm": 3.234375, + "grad_norm_var": 0.1904205322265625, + "learning_rate": 0.0001, + "loss": 4.1628, + "loss/crossentropy": 2.0987170338630676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2072230949997902, + "step": 11412 + }, + { + "epoch": 0.22828, + "grad_norm": 2.078125, + "grad_norm_var": 0.1907958984375, + "learning_rate": 0.0001, + "loss": 4.2473, + "loss/crossentropy": 2.049329698085785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2142597660422325, + "step": 11414 + }, + { + "epoch": 0.22832, + "grad_norm": 2.03125, + "grad_norm_var": 0.09533284505208334, + "learning_rate": 0.0001, + "loss": 4.283, + "loss/crossentropy": 2.066833019256592, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22086824476718903, + "step": 11416 + }, + { + "epoch": 0.22836, + "grad_norm": 2.15625, + "grad_norm_var": 0.09519755045572917, + "learning_rate": 0.0001, + "loss": 4.1837, + "loss/crossentropy": 2.01213002204895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23191991448402405, + "step": 11418 + }, + { + "epoch": 0.2284, + "grad_norm": 5.6875, + "grad_norm_var": 0.8556495666503906, + "learning_rate": 0.0001, + "loss": 4.1977, + "loss/crossentropy": 2.1349334716796875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22979671508073807, + "step": 11420 + }, + { + "epoch": 0.22844, + "grad_norm": 2.15625, + "grad_norm_var": 0.8487709045410157, + "learning_rate": 0.0001, + "loss": 4.2101, + "loss/crossentropy": 1.9201850295066833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21903745830059052, + "step": 11422 + }, + { + "epoch": 0.22848, + "grad_norm": 2.109375, + "grad_norm_var": 0.8546376546223958, + "learning_rate": 0.0001, + "loss": 4.1927, + "loss/crossentropy": 2.011150360107422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2059009075164795, + "step": 11424 + }, + { + "epoch": 0.22852, + "grad_norm": 2.171875, + "grad_norm_var": 0.8533322652180989, + "learning_rate": 0.0001, + "loss": 4.3762, + "loss/crossentropy": 2.1529648303985596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23028475046157837, + "step": 11426 + }, + { + "epoch": 0.22856, + "grad_norm": 1.8828125, + "grad_norm_var": 0.8176177978515625, + "learning_rate": 0.0001, + "loss": 3.9657, + "loss/crossentropy": 2.1602566838264465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2262946516275406, + "step": 11428 + }, + { + "epoch": 0.2286, + "grad_norm": 1.953125, + "grad_norm_var": 0.829766591389974, + "learning_rate": 0.0001, + "loss": 3.9369, + "loss/crossentropy": 1.7230631113052368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19715693593025208, + "step": 11430 + }, + { + "epoch": 0.22864, + "grad_norm": 1.953125, + "grad_norm_var": 0.8287737528483073, + "learning_rate": 0.0001, + "loss": 4.3791, + "loss/crossentropy": 2.4396276473999023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24283458292484283, + "step": 11432 + }, + { + "epoch": 0.22868, + "grad_norm": 2.171875, + "grad_norm_var": 0.8311480204264323, + "learning_rate": 0.0001, + "loss": 4.2692, + "loss/crossentropy": 1.8615645170211792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21288780122995377, + "step": 11434 + }, + { + "epoch": 0.22872, + "grad_norm": 2.09375, + "grad_norm_var": 0.012941233317057292, + "learning_rate": 0.0001, + "loss": 4.394, + "loss/crossentropy": 1.8972707390785217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18510619550943375, + "step": 11436 + }, + { + "epoch": 0.22876, + "grad_norm": 2.0, + "grad_norm_var": 0.011980946858723958, + "learning_rate": 0.0001, + "loss": 4.1459, + "loss/crossentropy": 2.217754364013672, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21217356622219086, + "step": 11438 + }, + { + "epoch": 0.2288, + "grad_norm": 2.09375, + "grad_norm_var": 0.014818318684895833, + "learning_rate": 0.0001, + "loss": 3.7955, + "loss/crossentropy": 1.6481398940086365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17804963141679764, + "step": 11440 + }, + { + "epoch": 0.22884, + "grad_norm": 2.0625, + "grad_norm_var": 0.01375732421875, + "learning_rate": 0.0001, + "loss": 4.3291, + "loss/crossentropy": 2.026508390903473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20739784091711044, + "step": 11442 + }, + { + "epoch": 0.22888, + "grad_norm": 2.09375, + "grad_norm_var": 0.012334950764973958, + "learning_rate": 0.0001, + "loss": 4.2086, + "loss/crossentropy": 2.05685293674469, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22639908641576767, + "step": 11444 + }, + { + "epoch": 0.22892, + "grad_norm": 2.03125, + "grad_norm_var": 0.01197509765625, + "learning_rate": 0.0001, + "loss": 4.4689, + "loss/crossentropy": 2.073192059993744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21727359294891357, + "step": 11446 + }, + { + "epoch": 0.22896, + "grad_norm": 2.171875, + "grad_norm_var": 0.0092193603515625, + "learning_rate": 0.0001, + "loss": 4.4312, + "loss/crossentropy": 2.1311055421829224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19499187916517258, + "step": 11448 + }, + { + "epoch": 0.229, + "grad_norm": 2.1875, + "grad_norm_var": 0.01021728515625, + "learning_rate": 0.0001, + "loss": 4.2043, + "loss/crossentropy": 2.169051766395569, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22266974300146103, + "step": 11450 + }, + { + "epoch": 0.22904, + "grad_norm": 1.875, + "grad_norm_var": 0.013727823893229166, + "learning_rate": 0.0001, + "loss": 4.1784, + "loss/crossentropy": 2.022417426109314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20541484653949738, + "step": 11452 + }, + { + "epoch": 0.22908, + "grad_norm": 2.140625, + "grad_norm_var": 0.015458170572916667, + "learning_rate": 0.0001, + "loss": 4.2921, + "loss/crossentropy": 2.2652071714401245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21639510244131088, + "step": 11454 + }, + { + "epoch": 0.22912, + "grad_norm": 2.203125, + "grad_norm_var": 0.01343994140625, + "learning_rate": 0.0001, + "loss": 4.1575, + "loss/crossentropy": 1.912036418914795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2204061597585678, + "step": 11456 + }, + { + "epoch": 0.22916, + "grad_norm": 2.0625, + "grad_norm_var": 0.014188639322916667, + "learning_rate": 0.0001, + "loss": 4.4497, + "loss/crossentropy": 2.029780328273773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2032444253563881, + "step": 11458 + }, + { + "epoch": 0.2292, + "grad_norm": 2.09375, + "grad_norm_var": 0.016039021809895835, + "learning_rate": 0.0001, + "loss": 4.0627, + "loss/crossentropy": 1.954946756362915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2028508484363556, + "step": 11460 + }, + { + "epoch": 0.22924, + "grad_norm": 2.8125, + "grad_norm_var": 0.049494425455729164, + "learning_rate": 0.0001, + "loss": 4.5787, + "loss/crossentropy": 2.3707855939865112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2460392713546753, + "step": 11462 + }, + { + "epoch": 0.22928, + "grad_norm": 2.078125, + "grad_norm_var": 0.047972615559895834, + "learning_rate": 0.0001, + "loss": 4.1509, + "loss/crossentropy": 2.410157322883606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24578960239887238, + "step": 11464 + }, + { + "epoch": 0.22932, + "grad_norm": 1.953125, + "grad_norm_var": 0.049117024739583334, + "learning_rate": 0.0001, + "loss": 4.0935, + "loss/crossentropy": 2.1403380036354065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20563968271017075, + "step": 11466 + }, + { + "epoch": 0.22936, + "grad_norm": 1.953125, + "grad_norm_var": 0.0471588134765625, + "learning_rate": 0.0001, + "loss": 4.0991, + "loss/crossentropy": 2.1596190333366394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24405791610479355, + "step": 11468 + }, + { + "epoch": 0.2294, + "grad_norm": 2.421875, + "grad_norm_var": 0.0506744384765625, + "learning_rate": 0.0001, + "loss": 4.3973, + "loss/crossentropy": 1.9427857398986816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22412577271461487, + "step": 11470 + }, + { + "epoch": 0.22944, + "grad_norm": 1.8984375, + "grad_norm_var": 0.05306574503580729, + "learning_rate": 0.0001, + "loss": 4.1093, + "loss/crossentropy": 2.078735053539276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20844466239213943, + "step": 11472 + }, + { + "epoch": 0.22948, + "grad_norm": 2.0, + "grad_norm_var": 0.054323069254557294, + "learning_rate": 0.0001, + "loss": 4.0749, + "loss/crossentropy": 2.351656198501587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24430027604103088, + "step": 11474 + }, + { + "epoch": 0.22952, + "grad_norm": 2.09375, + "grad_norm_var": 0.05269139607747396, + "learning_rate": 0.0001, + "loss": 3.9647, + "loss/crossentropy": 2.057854652404785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21224451810121536, + "step": 11476 + }, + { + "epoch": 0.22956, + "grad_norm": 2.03125, + "grad_norm_var": 0.018534088134765626, + "learning_rate": 0.0001, + "loss": 4.1451, + "loss/crossentropy": 2.207979917526245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22855235636234283, + "step": 11478 + }, + { + "epoch": 0.2296, + "grad_norm": 2.125, + "grad_norm_var": 0.01870905558268229, + "learning_rate": 0.0001, + "loss": 4.3748, + "loss/crossentropy": 2.08840012550354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21988992393016815, + "step": 11480 + }, + { + "epoch": 0.22964, + "grad_norm": 2.125, + "grad_norm_var": 0.017144521077473957, + "learning_rate": 0.0001, + "loss": 4.2314, + "loss/crossentropy": 2.1339274644851685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2278972491621971, + "step": 11482 + }, + { + "epoch": 0.22968, + "grad_norm": 2.15625, + "grad_norm_var": 0.015386708577473958, + "learning_rate": 0.0001, + "loss": 4.2013, + "loss/crossentropy": 1.970679223537445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20286859571933746, + "step": 11484 + }, + { + "epoch": 0.22972, + "grad_norm": 1.984375, + "grad_norm_var": 0.005295562744140625, + "learning_rate": 0.0001, + "loss": 4.2443, + "loss/crossentropy": 2.1023008823394775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21906304359436035, + "step": 11486 + }, + { + "epoch": 0.22976, + "grad_norm": 1.9453125, + "grad_norm_var": 0.004870351155598958, + "learning_rate": 0.0001, + "loss": 4.2774, + "loss/crossentropy": 2.1415608525276184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21857701241970062, + "step": 11488 + }, + { + "epoch": 0.2298, + "grad_norm": 2.03125, + "grad_norm_var": 0.005191802978515625, + "learning_rate": 0.0001, + "loss": 4.0697, + "loss/crossentropy": 1.9523325562477112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19859597831964493, + "step": 11490 + }, + { + "epoch": 0.22984, + "grad_norm": 2.0, + "grad_norm_var": 0.0046770731608072914, + "learning_rate": 0.0001, + "loss": 4.2433, + "loss/crossentropy": 2.1532927751541138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2239024043083191, + "step": 11492 + }, + { + "epoch": 0.22988, + "grad_norm": 2.1875, + "grad_norm_var": 0.006705474853515625, + "learning_rate": 0.0001, + "loss": 4.3556, + "loss/crossentropy": 2.160528779029846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23207154870033264, + "step": 11494 + }, + { + "epoch": 0.22992, + "grad_norm": 2.078125, + "grad_norm_var": 0.006528472900390625, + "learning_rate": 0.0001, + "loss": 4.277, + "loss/crossentropy": 2.07854962348938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2189328968524933, + "step": 11496 + }, + { + "epoch": 0.22996, + "grad_norm": 2.296875, + "grad_norm_var": 0.009069569905598958, + "learning_rate": 0.0001, + "loss": 4.4237, + "loss/crossentropy": 2.2270501852035522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23237968981266022, + "step": 11498 + }, + { + "epoch": 0.23, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0125396728515625, + "learning_rate": 0.0001, + "loss": 3.96, + "loss/crossentropy": 1.9641701579093933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2063433900475502, + "step": 11500 + }, + { + "epoch": 0.23004, + "grad_norm": 2.0625, + "grad_norm_var": 0.016866048177083332, + "learning_rate": 0.0001, + "loss": 4.3002, + "loss/crossentropy": 1.9243032932281494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20478814095258713, + "step": 11502 + }, + { + "epoch": 0.23008, + "grad_norm": 2.140625, + "grad_norm_var": 0.01587092081705729, + "learning_rate": 0.0001, + "loss": 4.329, + "loss/crossentropy": 2.295292854309082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23184886574745178, + "step": 11504 + }, + { + "epoch": 0.23012, + "grad_norm": 2.046875, + "grad_norm_var": 0.01654052734375, + "learning_rate": 0.0001, + "loss": 4.0697, + "loss/crossentropy": 2.134859561920166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21923956274986267, + "step": 11506 + }, + { + "epoch": 0.23016, + "grad_norm": 2.109375, + "grad_norm_var": 0.0158355712890625, + "learning_rate": 0.0001, + "loss": 4.2523, + "loss/crossentropy": 1.905085265636444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22443564236164093, + "step": 11508 + }, + { + "epoch": 0.2302, + "grad_norm": 2.03125, + "grad_norm_var": 0.014892323811848959, + "learning_rate": 0.0001, + "loss": 4.0296, + "loss/crossentropy": 1.6967324614524841, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19518497586250305, + "step": 11510 + }, + { + "epoch": 0.23024, + "grad_norm": 1.9609375, + "grad_norm_var": 0.015364583333333333, + "learning_rate": 0.0001, + "loss": 4.225, + "loss/crossentropy": 1.638957679271698, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19098830223083496, + "step": 11512 + }, + { + "epoch": 0.23028, + "grad_norm": 2.078125, + "grad_norm_var": 0.013732655843098959, + "learning_rate": 0.0001, + "loss": 4.0479, + "loss/crossentropy": 1.8760477900505066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.202397421002388, + "step": 11514 + }, + { + "epoch": 0.23032, + "grad_norm": 2.03125, + "grad_norm_var": 0.012189737955729167, + "learning_rate": 0.0001, + "loss": 4.2265, + "loss/crossentropy": 2.201690912246704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22841450572013855, + "step": 11516 + }, + { + "epoch": 0.23036, + "grad_norm": 2.203125, + "grad_norm_var": 0.007987467447916667, + "learning_rate": 0.0001, + "loss": 4.5451, + "loss/crossentropy": 2.5022183656692505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25180216133594513, + "step": 11518 + }, + { + "epoch": 0.2304, + "grad_norm": 2.0, + "grad_norm_var": 0.007682291666666666, + "learning_rate": 0.0001, + "loss": 4.2848, + "loss/crossentropy": 2.4634610414505005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23911123722791672, + "step": 11520 + }, + { + "epoch": 0.23044, + "grad_norm": 2.046875, + "grad_norm_var": 0.007503000895182291, + "learning_rate": 0.0001, + "loss": 4.2919, + "loss/crossentropy": 2.1176512241363525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20835164189338684, + "step": 11522 + }, + { + "epoch": 0.23048, + "grad_norm": 2.0625, + "grad_norm_var": 0.007252756754557292, + "learning_rate": 0.0001, + "loss": 4.0803, + "loss/crossentropy": 1.7708171606063843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18085652589797974, + "step": 11524 + }, + { + "epoch": 0.23052, + "grad_norm": 2.09375, + "grad_norm_var": 0.008504231770833334, + "learning_rate": 0.0001, + "loss": 4.1159, + "loss/crossentropy": 1.6740695238113403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17464587092399597, + "step": 11526 + }, + { + "epoch": 0.23056, + "grad_norm": 1.984375, + "grad_norm_var": 0.008135732014973958, + "learning_rate": 0.0001, + "loss": 4.1829, + "loss/crossentropy": 1.8153178691864014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19107046723365784, + "step": 11528 + }, + { + "epoch": 0.2306, + "grad_norm": 1.9453125, + "grad_norm_var": 0.006231435139973958, + "learning_rate": 0.0001, + "loss": 4.2074, + "loss/crossentropy": 2.2833873629570007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19900934398174286, + "step": 11530 + }, + { + "epoch": 0.23064, + "grad_norm": 2.03125, + "grad_norm_var": 0.008593495686848958, + "learning_rate": 0.0001, + "loss": 4.3154, + "loss/crossentropy": 1.9106029272079468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1996043100953102, + "step": 11532 + }, + { + "epoch": 0.23068, + "grad_norm": 2.109375, + "grad_norm_var": 0.006030019124348958, + "learning_rate": 0.0001, + "loss": 4.3849, + "loss/crossentropy": 2.049258530139923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21438845992088318, + "step": 11534 + }, + { + "epoch": 0.23072, + "grad_norm": 1.9765625, + "grad_norm_var": 0.006029256184895833, + "learning_rate": 0.0001, + "loss": 3.9236, + "loss/crossentropy": 1.9124351739883423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20111311972141266, + "step": 11536 + }, + { + "epoch": 0.23076, + "grad_norm": 2.015625, + "grad_norm_var": 0.0059397379557291664, + "learning_rate": 0.0001, + "loss": 4.0781, + "loss/crossentropy": 1.8045400381088257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20846854895353317, + "step": 11538 + }, + { + "epoch": 0.2308, + "grad_norm": 1.8984375, + "grad_norm_var": 0.007297515869140625, + "learning_rate": 0.0001, + "loss": 4.1804, + "loss/crossentropy": 1.9970109462738037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20165999233722687, + "step": 11540 + }, + { + "epoch": 0.23084, + "grad_norm": 2.09375, + "grad_norm_var": 0.013065338134765625, + "learning_rate": 0.0001, + "loss": 4.1756, + "loss/crossentropy": 2.0300097465515137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24561113119125366, + "step": 11542 + }, + { + "epoch": 0.23088, + "grad_norm": 2.5, + "grad_norm_var": 0.025402577718098958, + "learning_rate": 0.0001, + "loss": 4.4841, + "loss/crossentropy": 2.194391131401062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24075081944465637, + "step": 11544 + }, + { + "epoch": 0.23092, + "grad_norm": 2.03125, + "grad_norm_var": 0.0239898681640625, + "learning_rate": 0.0001, + "loss": 3.9177, + "loss/crossentropy": 2.0229761600494385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21907109022140503, + "step": 11546 + }, + { + "epoch": 0.23096, + "grad_norm": 2.171875, + "grad_norm_var": 0.022606404622395833, + "learning_rate": 0.0001, + "loss": 4.3089, + "loss/crossentropy": 1.831793487071991, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18943443894386292, + "step": 11548 + }, + { + "epoch": 0.231, + "grad_norm": 1.984375, + "grad_norm_var": 0.023395792643229166, + "learning_rate": 0.0001, + "loss": 4.19, + "loss/crossentropy": 2.4099985361099243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22612392157316208, + "step": 11550 + }, + { + "epoch": 0.23104, + "grad_norm": 2.109375, + "grad_norm_var": 0.022946929931640624, + "learning_rate": 0.0001, + "loss": 4.2512, + "loss/crossentropy": 2.0496281385421753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20928414165973663, + "step": 11552 + }, + { + "epoch": 0.23108, + "grad_norm": 1.875, + "grad_norm_var": 0.025986480712890624, + "learning_rate": 0.0001, + "loss": 4.196, + "loss/crossentropy": 1.9366755485534668, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19255827367305756, + "step": 11554 + }, + { + "epoch": 0.23112, + "grad_norm": 1.9296875, + "grad_norm_var": 0.02516454060872396, + "learning_rate": 0.0001, + "loss": 4.3758, + "loss/crossentropy": 1.9077317714691162, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20340882241725922, + "step": 11556 + }, + { + "epoch": 0.23116, + "grad_norm": 2.046875, + "grad_norm_var": 0.02020848592122396, + "learning_rate": 0.0001, + "loss": 4.091, + "loss/crossentropy": 1.9167283773422241, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21566946804523468, + "step": 11558 + }, + { + "epoch": 0.2312, + "grad_norm": 2.125, + "grad_norm_var": 0.0059506734212239586, + "learning_rate": 0.0001, + "loss": 4.5758, + "loss/crossentropy": 2.4604564905166626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22104239463806152, + "step": 11560 + }, + { + "epoch": 0.23124, + "grad_norm": 1.828125, + "grad_norm_var": 0.009124501546223959, + "learning_rate": 0.0001, + "loss": 4.0398, + "loss/crossentropy": 2.359586775302887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2064266949892044, + "step": 11562 + }, + { + "epoch": 0.23128, + "grad_norm": 1.875, + "grad_norm_var": 0.009456125895182292, + "learning_rate": 0.0001, + "loss": 4.0065, + "loss/crossentropy": 1.9757064580917358, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19224581122398376, + "step": 11564 + }, + { + "epoch": 0.23132, + "grad_norm": 2.125, + "grad_norm_var": 0.03843561808268229, + "learning_rate": 0.0001, + "loss": 4.5544, + "loss/crossentropy": 1.9888432025909424, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20799735933542252, + "step": 11566 + }, + { + "epoch": 0.23136, + "grad_norm": 2.0625, + "grad_norm_var": 0.03802057902018229, + "learning_rate": 0.0001, + "loss": 4.2281, + "loss/crossentropy": 2.0829046964645386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22345586121082306, + "step": 11568 + }, + { + "epoch": 0.2314, + "grad_norm": 2.203125, + "grad_norm_var": 0.036649322509765624, + "learning_rate": 0.0001, + "loss": 4.3421, + "loss/crossentropy": 1.964626431465149, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21068184822797775, + "step": 11570 + }, + { + "epoch": 0.23144, + "grad_norm": 2.125, + "grad_norm_var": 0.034993489583333336, + "learning_rate": 0.0001, + "loss": 4.4206, + "loss/crossentropy": 2.313928008079529, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23291154205799103, + "step": 11572 + }, + { + "epoch": 0.23148, + "grad_norm": 1.890625, + "grad_norm_var": 0.038386027018229164, + "learning_rate": 0.0001, + "loss": 3.9609, + "loss/crossentropy": 2.221343159675598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20674917846918106, + "step": 11574 + }, + { + "epoch": 0.23152, + "grad_norm": 2.0625, + "grad_norm_var": 0.03853759765625, + "learning_rate": 0.0001, + "loss": 4.4097, + "loss/crossentropy": 2.070296823978424, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22503886371850967, + "step": 11576 + }, + { + "epoch": 0.23156, + "grad_norm": 2.09375, + "grad_norm_var": 0.03483784993489583, + "learning_rate": 0.0001, + "loss": 3.9417, + "loss/crossentropy": 2.21540367603302, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2334480583667755, + "step": 11578 + }, + { + "epoch": 0.2316, + "grad_norm": 1.8359375, + "grad_norm_var": 0.03614679972330729, + "learning_rate": 0.0001, + "loss": 4.2491, + "loss/crossentropy": 1.9438464641571045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20312584936618805, + "step": 11580 + }, + { + "epoch": 0.23164, + "grad_norm": 1.90625, + "grad_norm_var": 0.010994211832682291, + "learning_rate": 0.0001, + "loss": 4.0622, + "loss/crossentropy": 1.4970324039459229, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17415452748537064, + "step": 11582 + }, + { + "epoch": 0.23168, + "grad_norm": 2.515625, + "grad_norm_var": 0.026244099934895834, + "learning_rate": 0.0001, + "loss": 4.2293, + "loss/crossentropy": 2.2618579864501953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23066900670528412, + "step": 11584 + }, + { + "epoch": 0.23172, + "grad_norm": 1.9453125, + "grad_norm_var": 0.02603123982747396, + "learning_rate": 0.0001, + "loss": 4.0974, + "loss/crossentropy": 2.2342761754989624, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20567959547042847, + "step": 11586 + }, + { + "epoch": 0.23176, + "grad_norm": 2.03125, + "grad_norm_var": 0.026151275634765624, + "learning_rate": 0.0001, + "loss": 4.0417, + "loss/crossentropy": 2.144785463809967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19473369419574738, + "step": 11588 + }, + { + "epoch": 0.2318, + "grad_norm": 2.015625, + "grad_norm_var": 0.02851130167643229, + "learning_rate": 0.0001, + "loss": 4.1996, + "loss/crossentropy": 1.9032491445541382, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1956048086285591, + "step": 11590 + }, + { + "epoch": 0.23184, + "grad_norm": 2.234375, + "grad_norm_var": 0.03227717081705729, + "learning_rate": 0.0001, + "loss": 4.4689, + "loss/crossentropy": 1.934233546257019, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22426530718803406, + "step": 11592 + }, + { + "epoch": 0.23188, + "grad_norm": 2.0, + "grad_norm_var": 0.03166071573893229, + "learning_rate": 0.0001, + "loss": 4.2321, + "loss/crossentropy": 2.1792030930519104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22469744831323624, + "step": 11594 + }, + { + "epoch": 0.23192, + "grad_norm": 2.109375, + "grad_norm_var": 0.027705891927083334, + "learning_rate": 0.0001, + "loss": 4.3779, + "loss/crossentropy": 2.3242534399032593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23015478998422623, + "step": 11596 + }, + { + "epoch": 0.23196, + "grad_norm": 2.5, + "grad_norm_var": 0.036622873942057294, + "learning_rate": 0.0001, + "loss": 4.3553, + "loss/crossentropy": 2.2343804836273193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2392263486981392, + "step": 11598 + }, + { + "epoch": 0.232, + "grad_norm": 2.171875, + "grad_norm_var": 0.024559529622395833, + "learning_rate": 0.0001, + "loss": 4.3019, + "loss/crossentropy": 1.9622855186462402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22550886124372482, + "step": 11600 + }, + { + "epoch": 0.23204, + "grad_norm": 1.9140625, + "grad_norm_var": 0.023981730143229168, + "learning_rate": 0.0001, + "loss": 4.4955, + "loss/crossentropy": 2.2274144887924194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21090717613697052, + "step": 11602 + }, + { + "epoch": 0.23208, + "grad_norm": 2.078125, + "grad_norm_var": 0.021683756510416666, + "learning_rate": 0.0001, + "loss": 4.3011, + "loss/crossentropy": 2.092648506164551, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21386967599391937, + "step": 11604 + }, + { + "epoch": 0.23212, + "grad_norm": 1.9609375, + "grad_norm_var": 0.020643870035807293, + "learning_rate": 0.0001, + "loss": 4.2739, + "loss/crossentropy": 2.3358936309814453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22385042905807495, + "step": 11606 + }, + { + "epoch": 0.23216, + "grad_norm": 2.046875, + "grad_norm_var": 0.01883112589518229, + "learning_rate": 0.0001, + "loss": 4.1968, + "loss/crossentropy": 1.9800407886505127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20614364743232727, + "step": 11608 + }, + { + "epoch": 0.2322, + "grad_norm": 2.140625, + "grad_norm_var": 0.018536122639973958, + "learning_rate": 0.0001, + "loss": 4.2385, + "loss/crossentropy": 1.9646947979927063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21902073919773102, + "step": 11610 + }, + { + "epoch": 0.23224, + "grad_norm": 2.09375, + "grad_norm_var": 0.018930816650390626, + "learning_rate": 0.0001, + "loss": 4.1564, + "loss/crossentropy": 2.0113691687583923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21262314170598984, + "step": 11612 + }, + { + "epoch": 0.23228, + "grad_norm": 2.328125, + "grad_norm_var": 0.0254791259765625, + "learning_rate": 0.0001, + "loss": 4.3179, + "loss/crossentropy": 1.8359833359718323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2125518098473549, + "step": 11614 + }, + { + "epoch": 0.23232, + "grad_norm": 2.15625, + "grad_norm_var": 0.02577489217122396, + "learning_rate": 0.0001, + "loss": 4.0954, + "loss/crossentropy": 1.934333622455597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2148296758532524, + "step": 11616 + }, + { + "epoch": 0.23236, + "grad_norm": 2.359375, + "grad_norm_var": 0.026192220052083333, + "learning_rate": 0.0001, + "loss": 4.3556, + "loss/crossentropy": 1.9486380815505981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.201569102704525, + "step": 11618 + }, + { + "epoch": 0.2324, + "grad_norm": 2.140625, + "grad_norm_var": 0.0251708984375, + "learning_rate": 0.0001, + "loss": 4.3761, + "loss/crossentropy": 1.9379103183746338, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2109408900141716, + "step": 11620 + }, + { + "epoch": 0.23244, + "grad_norm": 2.203125, + "grad_norm_var": 0.02165705362955729, + "learning_rate": 0.0001, + "loss": 4.3957, + "loss/crossentropy": 2.1635884046554565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23330900818109512, + "step": 11622 + }, + { + "epoch": 0.23248, + "grad_norm": 1.96875, + "grad_norm_var": 0.02851130167643229, + "learning_rate": 0.0001, + "loss": 4.1035, + "loss/crossentropy": 2.3183244466781616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21685285866260529, + "step": 11624 + }, + { + "epoch": 0.23252, + "grad_norm": 2.0, + "grad_norm_var": 0.03022028605143229, + "learning_rate": 0.0001, + "loss": 3.9465, + "loss/crossentropy": 2.2932451367378235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22147410362958908, + "step": 11626 + }, + { + "epoch": 0.23256, + "grad_norm": 1.953125, + "grad_norm_var": 0.03144505818684896, + "learning_rate": 0.0001, + "loss": 4.0598, + "loss/crossentropy": 2.1384140253067017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22141631692647934, + "step": 11628 + }, + { + "epoch": 0.2326, + "grad_norm": 2.0, + "grad_norm_var": 0.0146728515625, + "learning_rate": 0.0001, + "loss": 3.9313, + "loss/crossentropy": 1.7338963747024536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18959754705429077, + "step": 11630 + }, + { + "epoch": 0.23264, + "grad_norm": 2.046875, + "grad_norm_var": 0.014943186442057292, + "learning_rate": 0.0001, + "loss": 4.3293, + "loss/crossentropy": 2.186310887336731, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19735413044691086, + "step": 11632 + }, + { + "epoch": 0.23268, + "grad_norm": 1.8671875, + "grad_norm_var": 0.01092529296875, + "learning_rate": 0.0001, + "loss": 3.9636, + "loss/crossentropy": 2.0566734075546265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21930715441703796, + "step": 11634 + }, + { + "epoch": 0.23272, + "grad_norm": 1.984375, + "grad_norm_var": 0.011631011962890625, + "learning_rate": 0.0001, + "loss": 3.8382, + "loss/crossentropy": 1.9993655681610107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18999667465686798, + "step": 11636 + }, + { + "epoch": 0.23276, + "grad_norm": 2.015625, + "grad_norm_var": 0.008316802978515624, + "learning_rate": 0.0001, + "loss": 4.2309, + "loss/crossentropy": 2.2788418531417847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21202587336301804, + "step": 11638 + }, + { + "epoch": 0.2328, + "grad_norm": 1.8828125, + "grad_norm_var": 0.00792236328125, + "learning_rate": 0.0001, + "loss": 3.7837, + "loss/crossentropy": 1.7248046398162842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21424376964569092, + "step": 11640 + }, + { + "epoch": 0.23284, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008235422770182292, + "learning_rate": 0.0001, + "loss": 4.0023, + "loss/crossentropy": 1.9053270816802979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19046200066804886, + "step": 11642 + }, + { + "epoch": 0.23288, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0118408203125, + "learning_rate": 0.0001, + "loss": 4.3561, + "loss/crossentropy": 2.3588117361068726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23569310456514359, + "step": 11644 + }, + { + "epoch": 0.23292, + "grad_norm": 2.078125, + "grad_norm_var": 0.012271881103515625, + "learning_rate": 0.0001, + "loss": 4.4722, + "loss/crossentropy": 2.15751576423645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22107885777950287, + "step": 11646 + }, + { + "epoch": 0.23296, + "grad_norm": 2.03125, + "grad_norm_var": 0.009720611572265624, + "learning_rate": 0.0001, + "loss": 4.4075, + "loss/crossentropy": 2.094591200351715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20681703090667725, + "step": 11648 + }, + { + "epoch": 0.233, + "grad_norm": 2.03125, + "grad_norm_var": 0.009523264567057292, + "learning_rate": 0.0001, + "loss": 3.7832, + "loss/crossentropy": 1.7536470890045166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18886109441518784, + "step": 11650 + }, + { + "epoch": 0.23304, + "grad_norm": 2.0625, + "grad_norm_var": 0.0082427978515625, + "learning_rate": 0.0001, + "loss": 3.9905, + "loss/crossentropy": 1.8172362446784973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18359588831663132, + "step": 11652 + }, + { + "epoch": 0.23308, + "grad_norm": 2.0625, + "grad_norm_var": 0.00841064453125, + "learning_rate": 0.0001, + "loss": 4.2404, + "loss/crossentropy": 2.2346678376197815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2154158428311348, + "step": 11654 + }, + { + "epoch": 0.23312, + "grad_norm": 2.03125, + "grad_norm_var": 0.008040110270182291, + "learning_rate": 0.0001, + "loss": 4.1559, + "loss/crossentropy": 2.0573307275772095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2160855233669281, + "step": 11656 + }, + { + "epoch": 0.23316, + "grad_norm": 2.03125, + "grad_norm_var": 0.007621256510416666, + "learning_rate": 0.0001, + "loss": 4.0669, + "loss/crossentropy": 2.4358904361724854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22637036442756653, + "step": 11658 + }, + { + "epoch": 0.2332, + "grad_norm": 1.96875, + "grad_norm_var": 0.004644521077473958, + "learning_rate": 0.0001, + "loss": 3.9938, + "loss/crossentropy": 2.387966513633728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2385111078619957, + "step": 11660 + }, + { + "epoch": 0.23324, + "grad_norm": 2.046875, + "grad_norm_var": 0.004709625244140625, + "learning_rate": 0.0001, + "loss": 4.3371, + "loss/crossentropy": 2.30223548412323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2149546965956688, + "step": 11662 + }, + { + "epoch": 0.23328, + "grad_norm": 2.109375, + "grad_norm_var": 0.005494944254557292, + "learning_rate": 0.0001, + "loss": 4.3242, + "loss/crossentropy": 2.001839280128479, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2001148834824562, + "step": 11664 + }, + { + "epoch": 0.23332, + "grad_norm": 2.25, + "grad_norm_var": 0.00693359375, + "learning_rate": 0.0001, + "loss": 4.4932, + "loss/crossentropy": 2.387674927711487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22714952379465103, + "step": 11666 + }, + { + "epoch": 0.23336, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007470448811848958, + "learning_rate": 0.0001, + "loss": 4.1333, + "loss/crossentropy": 2.133235454559326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22263716161251068, + "step": 11668 + }, + { + "epoch": 0.2334, + "grad_norm": 1.90625, + "grad_norm_var": 0.008874257405598959, + "learning_rate": 0.0001, + "loss": 4.1392, + "loss/crossentropy": 2.1041141748428345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20474642515182495, + "step": 11670 + }, + { + "epoch": 0.23344, + "grad_norm": 2.25, + "grad_norm_var": 0.010796864827473959, + "learning_rate": 0.0001, + "loss": 4.4075, + "loss/crossentropy": 2.0425861477851868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23144961893558502, + "step": 11672 + }, + { + "epoch": 0.23348, + "grad_norm": 1.9921875, + "grad_norm_var": 0.009883626302083334, + "learning_rate": 0.0001, + "loss": 4.0878, + "loss/crossentropy": 2.0170212388038635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2134404480457306, + "step": 11674 + }, + { + "epoch": 0.23352, + "grad_norm": 2.015625, + "grad_norm_var": 0.010489908854166667, + "learning_rate": 0.0001, + "loss": 4.2275, + "loss/crossentropy": 2.205981433391571, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.220575213432312, + "step": 11676 + }, + { + "epoch": 0.23356, + "grad_norm": 1.9765625, + "grad_norm_var": 0.011429595947265624, + "learning_rate": 0.0001, + "loss": 4.1845, + "loss/crossentropy": 1.9121403694152832, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23312295228242874, + "step": 11678 + }, + { + "epoch": 0.2336, + "grad_norm": 2.296875, + "grad_norm_var": 0.0156890869140625, + "learning_rate": 0.0001, + "loss": 4.1112, + "loss/crossentropy": 1.6793898940086365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19598360359668732, + "step": 11680 + }, + { + "epoch": 0.23364, + "grad_norm": 2.359375, + "grad_norm_var": 0.0222564697265625, + "learning_rate": 0.0001, + "loss": 4.4814, + "loss/crossentropy": 2.1657907962799072, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2281472533941269, + "step": 11682 + }, + { + "epoch": 0.23368, + "grad_norm": 2.015625, + "grad_norm_var": 0.022027333577473957, + "learning_rate": 0.0001, + "loss": 4.0869, + "loss/crossentropy": 2.1918715238571167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22837525606155396, + "step": 11684 + }, + { + "epoch": 0.23372, + "grad_norm": 1.9765625, + "grad_norm_var": 0.020699055989583333, + "learning_rate": 0.0001, + "loss": 4.27, + "loss/crossentropy": 2.296495795249939, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23491691797971725, + "step": 11686 + }, + { + "epoch": 0.23376, + "grad_norm": 2.1875, + "grad_norm_var": 0.019559733072916665, + "learning_rate": 0.0001, + "loss": 4.1422, + "loss/crossentropy": 1.9278987646102905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1960090771317482, + "step": 11688 + }, + { + "epoch": 0.2338, + "grad_norm": 2.0625, + "grad_norm_var": 0.019461822509765626, + "learning_rate": 0.0001, + "loss": 4.3393, + "loss/crossentropy": 2.0491825938224792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21345915645360947, + "step": 11690 + }, + { + "epoch": 0.23384, + "grad_norm": 2.015625, + "grad_norm_var": 0.02127863566080729, + "learning_rate": 0.0001, + "loss": 3.9209, + "loss/crossentropy": 1.7594041228294373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19004464149475098, + "step": 11692 + }, + { + "epoch": 0.23388, + "grad_norm": 1.9765625, + "grad_norm_var": 0.020961252848307292, + "learning_rate": 0.0001, + "loss": 4.3645, + "loss/crossentropy": 2.0313411951065063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20986144989728928, + "step": 11694 + }, + { + "epoch": 0.23392, + "grad_norm": 2.109375, + "grad_norm_var": 0.016068522135416666, + "learning_rate": 0.0001, + "loss": 4.4644, + "loss/crossentropy": 2.3580493927001953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2253532111644745, + "step": 11696 + }, + { + "epoch": 0.23396, + "grad_norm": 2.203125, + "grad_norm_var": 0.008548990885416666, + "learning_rate": 0.0001, + "loss": 4.3954, + "loss/crossentropy": 2.2036253213882446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22294757515192032, + "step": 11698 + }, + { + "epoch": 0.234, + "grad_norm": 2.03125, + "grad_norm_var": 0.007972971598307291, + "learning_rate": 0.0001, + "loss": 4.2602, + "loss/crossentropy": 1.8575093150138855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19865535199642181, + "step": 11700 + }, + { + "epoch": 0.23404, + "grad_norm": 1.96875, + "grad_norm_var": 0.008571116129557292, + "learning_rate": 0.0001, + "loss": 4.2619, + "loss/crossentropy": 2.2169028520584106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22326484322547913, + "step": 11702 + }, + { + "epoch": 0.23408, + "grad_norm": 2.015625, + "grad_norm_var": 0.007500966389973958, + "learning_rate": 0.0001, + "loss": 4.4045, + "loss/crossentropy": 2.1897542476654053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2097000628709793, + "step": 11704 + }, + { + "epoch": 0.23412, + "grad_norm": 2.015625, + "grad_norm_var": 0.0063168843587239586, + "learning_rate": 0.0001, + "loss": 4.271, + "loss/crossentropy": 2.214607834815979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24112706631422043, + "step": 11706 + }, + { + "epoch": 0.23416, + "grad_norm": 2.046875, + "grad_norm_var": 0.004780832926432292, + "learning_rate": 0.0001, + "loss": 4.1278, + "loss/crossentropy": 2.124355912208557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2222435548901558, + "step": 11708 + }, + { + "epoch": 0.2342, + "grad_norm": 2.171875, + "grad_norm_var": 0.0051025390625, + "learning_rate": 0.0001, + "loss": 4.3118, + "loss/crossentropy": 2.35421621799469, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26120802760124207, + "step": 11710 + }, + { + "epoch": 0.23424, + "grad_norm": 2.0, + "grad_norm_var": 0.00521240234375, + "learning_rate": 0.0001, + "loss": 4.1781, + "loss/crossentropy": 1.9134620428085327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21830002963542938, + "step": 11712 + }, + { + "epoch": 0.23428, + "grad_norm": 2.078125, + "grad_norm_var": 0.003763580322265625, + "learning_rate": 0.0001, + "loss": 4.2445, + "loss/crossentropy": 1.9336887001991272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2025599479675293, + "step": 11714 + }, + { + "epoch": 0.23432, + "grad_norm": 2.015625, + "grad_norm_var": 0.004133097330729167, + "learning_rate": 0.0001, + "loss": 4.0775, + "loss/crossentropy": 1.9964489936828613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2020488828420639, + "step": 11716 + }, + { + "epoch": 0.23436, + "grad_norm": 2.078125, + "grad_norm_var": 0.00423583984375, + "learning_rate": 0.0001, + "loss": 4.264, + "loss/crossentropy": 2.108368992805481, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20985107123851776, + "step": 11718 + }, + { + "epoch": 0.2344, + "grad_norm": 1.9453125, + "grad_norm_var": 0.005008697509765625, + "learning_rate": 0.0001, + "loss": 3.9458, + "loss/crossentropy": 2.0314669013023376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20942936092615128, + "step": 11720 + }, + { + "epoch": 0.23444, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0054107666015625, + "learning_rate": 0.0001, + "loss": 4.1098, + "loss/crossentropy": 2.343130350112915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.237278014421463, + "step": 11722 + }, + { + "epoch": 0.23448, + "grad_norm": 1.9609375, + "grad_norm_var": 0.010658518473307291, + "learning_rate": 0.0001, + "loss": 4.2497, + "loss/crossentropy": 2.1542173624038696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22762110829353333, + "step": 11724 + }, + { + "epoch": 0.23452, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009395090738932292, + "learning_rate": 0.0001, + "loss": 3.9056, + "loss/crossentropy": 1.638447105884552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1707247570157051, + "step": 11726 + }, + { + "epoch": 0.23456, + "grad_norm": 2.09375, + "grad_norm_var": 0.010573069254557291, + "learning_rate": 0.0001, + "loss": 4.0939, + "loss/crossentropy": 2.1509228944778442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2091827318072319, + "step": 11728 + }, + { + "epoch": 0.2346, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011677805582682292, + "learning_rate": 0.0001, + "loss": 4.325, + "loss/crossentropy": 2.193585455417633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2697590962052345, + "step": 11730 + }, + { + "epoch": 0.23464, + "grad_norm": 2.109375, + "grad_norm_var": 0.012648264567057291, + "learning_rate": 0.0001, + "loss": 4.2239, + "loss/crossentropy": 2.2058298587799072, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21958298981189728, + "step": 11732 + }, + { + "epoch": 0.23468, + "grad_norm": 2.078125, + "grad_norm_var": 0.012894439697265624, + "learning_rate": 0.0001, + "loss": 4.3269, + "loss/crossentropy": 2.0816246271133423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20916947722434998, + "step": 11734 + }, + { + "epoch": 0.23472, + "grad_norm": 2.03125, + "grad_norm_var": 0.0150543212890625, + "learning_rate": 0.0001, + "loss": 4.1495, + "loss/crossentropy": 1.838355541229248, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20002590864896774, + "step": 11736 + }, + { + "epoch": 0.23476, + "grad_norm": 1.9765625, + "grad_norm_var": 0.016795857747395834, + "learning_rate": 0.0001, + "loss": 4.0559, + "loss/crossentropy": 2.1602721214294434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21018436551094055, + "step": 11738 + }, + { + "epoch": 0.2348, + "grad_norm": 2.03125, + "grad_norm_var": 0.013181304931640625, + "learning_rate": 0.0001, + "loss": 4.5376, + "loss/crossentropy": 2.6967735290527344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23233920335769653, + "step": 11740 + }, + { + "epoch": 0.23484, + "grad_norm": 2.03125, + "grad_norm_var": 0.012715403238932292, + "learning_rate": 0.0001, + "loss": 3.9772, + "loss/crossentropy": 2.033313810825348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21240226924419403, + "step": 11742 + }, + { + "epoch": 0.23488, + "grad_norm": 1.96875, + "grad_norm_var": 0.012245432535807291, + "learning_rate": 0.0001, + "loss": 4.21, + "loss/crossentropy": 2.123607873916626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.207147017121315, + "step": 11744 + }, + { + "epoch": 0.23492, + "grad_norm": 1.8125, + "grad_norm_var": 0.013719685872395833, + "learning_rate": 0.0001, + "loss": 4.1121, + "loss/crossentropy": 2.1173813343048096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21387682110071182, + "step": 11746 + }, + { + "epoch": 0.23496, + "grad_norm": 1.9296875, + "grad_norm_var": 0.011456044514973958, + "learning_rate": 0.0001, + "loss": 4.1219, + "loss/crossentropy": 2.27209734916687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2254154533147812, + "step": 11748 + }, + { + "epoch": 0.235, + "grad_norm": 2.0625, + "grad_norm_var": 0.011156209309895833, + "learning_rate": 0.0001, + "loss": 4.3138, + "loss/crossentropy": 1.844546616077423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1740722581744194, + "step": 11750 + }, + { + "epoch": 0.23504, + "grad_norm": 2.078125, + "grad_norm_var": 0.009346516927083333, + "learning_rate": 0.0001, + "loss": 4.4615, + "loss/crossentropy": 2.341967821121216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23638595640659332, + "step": 11752 + }, + { + "epoch": 0.23508, + "grad_norm": 2.09375, + "grad_norm_var": 0.009354400634765624, + "learning_rate": 0.0001, + "loss": 4.4113, + "loss/crossentropy": 2.2561213970184326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23169096559286118, + "step": 11754 + }, + { + "epoch": 0.23512, + "grad_norm": 1.96875, + "grad_norm_var": 0.007972971598307291, + "learning_rate": 0.0001, + "loss": 4.1603, + "loss/crossentropy": 1.8693158030509949, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19997497648000717, + "step": 11756 + }, + { + "epoch": 0.23516, + "grad_norm": 1.953125, + "grad_norm_var": 0.0085845947265625, + "learning_rate": 0.0001, + "loss": 3.9721, + "loss/crossentropy": 2.2228434085845947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19779722392559052, + "step": 11758 + }, + { + "epoch": 0.2352, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007899729410807292, + "learning_rate": 0.0001, + "loss": 4.2572, + "loss/crossentropy": 2.177064299583435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21091558039188385, + "step": 11760 + }, + { + "epoch": 0.23524, + "grad_norm": 1.984375, + "grad_norm_var": 0.005020904541015625, + "learning_rate": 0.0001, + "loss": 4.398, + "loss/crossentropy": 2.2667617797851562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2120228409767151, + "step": 11762 + }, + { + "epoch": 0.23528, + "grad_norm": 2.109375, + "grad_norm_var": 0.004713694254557292, + "learning_rate": 0.0001, + "loss": 4.198, + "loss/crossentropy": 2.0310307145118713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2194480448961258, + "step": 11764 + }, + { + "epoch": 0.23532, + "grad_norm": 1.953125, + "grad_norm_var": 0.005033111572265625, + "learning_rate": 0.0001, + "loss": 3.9906, + "loss/crossentropy": 1.8481100797653198, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20181456953287125, + "step": 11766 + }, + { + "epoch": 0.23536, + "grad_norm": 2.140625, + "grad_norm_var": 0.008642323811848958, + "learning_rate": 0.0001, + "loss": 4.3486, + "loss/crossentropy": 2.254691958427429, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26685942709445953, + "step": 11768 + }, + { + "epoch": 0.2354, + "grad_norm": 1.921875, + "grad_norm_var": 0.007993316650390625, + "learning_rate": 0.0001, + "loss": 4.144, + "loss/crossentropy": 2.112728714942932, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19724663347005844, + "step": 11770 + }, + { + "epoch": 0.23544, + "grad_norm": 2.265625, + "grad_norm_var": 0.011195627848307292, + "learning_rate": 0.0001, + "loss": 4.089, + "loss/crossentropy": 2.0724143981933594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20518633723258972, + "step": 11772 + }, + { + "epoch": 0.23548, + "grad_norm": 2.015625, + "grad_norm_var": 0.010064442952473959, + "learning_rate": 0.0001, + "loss": 4.1882, + "loss/crossentropy": 2.0743810534477234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2181471511721611, + "step": 11774 + }, + { + "epoch": 0.23552, + "grad_norm": 1.9296875, + "grad_norm_var": 0.011740875244140626, + "learning_rate": 0.0001, + "loss": 4.1121, + "loss/crossentropy": 2.02871835231781, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22081361711025238, + "step": 11776 + }, + { + "epoch": 0.23556, + "grad_norm": 2.15625, + "grad_norm_var": 0.013185373942057292, + "learning_rate": 0.0001, + "loss": 4.5607, + "loss/crossentropy": 2.1898428201675415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22084421664476395, + "step": 11778 + }, + { + "epoch": 0.2356, + "grad_norm": 2.0625, + "grad_norm_var": 0.012748209635416667, + "learning_rate": 0.0001, + "loss": 4.2404, + "loss/crossentropy": 2.0893847346305847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21616562455892563, + "step": 11780 + }, + { + "epoch": 0.23564, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011769358317057292, + "learning_rate": 0.0001, + "loss": 4.215, + "loss/crossentropy": 1.8590916991233826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18936381489038467, + "step": 11782 + }, + { + "epoch": 0.23568, + "grad_norm": 2.296875, + "grad_norm_var": 0.012741851806640624, + "learning_rate": 0.0001, + "loss": 3.9544, + "loss/crossentropy": 2.105339765548706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22275932878255844, + "step": 11784 + }, + { + "epoch": 0.23572, + "grad_norm": 2.140625, + "grad_norm_var": 0.011987050374348959, + "learning_rate": 0.0001, + "loss": 4.2585, + "loss/crossentropy": 2.1720080375671387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20734921097755432, + "step": 11786 + }, + { + "epoch": 0.23576, + "grad_norm": 1.984375, + "grad_norm_var": 0.012640126546223958, + "learning_rate": 0.0001, + "loss": 4.2715, + "loss/crossentropy": 2.2301958799362183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21865415573120117, + "step": 11788 + }, + { + "epoch": 0.2358, + "grad_norm": 2.1875, + "grad_norm_var": 0.013626861572265624, + "learning_rate": 0.0001, + "loss": 3.7518, + "loss/crossentropy": 1.588155210018158, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19334131479263306, + "step": 11790 + }, + { + "epoch": 0.23584, + "grad_norm": 2.046875, + "grad_norm_var": 0.025248209635416668, + "learning_rate": 0.0001, + "loss": 4.4052, + "loss/crossentropy": 2.208239734172821, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21055688709020615, + "step": 11792 + }, + { + "epoch": 0.23588, + "grad_norm": 1.9140625, + "grad_norm_var": 0.030368804931640625, + "learning_rate": 0.0001, + "loss": 4.1996, + "loss/crossentropy": 2.46909761428833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.225817009806633, + "step": 11794 + }, + { + "epoch": 0.23592, + "grad_norm": 1.984375, + "grad_norm_var": 0.03367691040039063, + "learning_rate": 0.0001, + "loss": 4.2792, + "loss/crossentropy": 2.022938549518585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21232923865318298, + "step": 11796 + }, + { + "epoch": 0.23596, + "grad_norm": 1.9765625, + "grad_norm_var": 0.03394953409830729, + "learning_rate": 0.0001, + "loss": 4.3949, + "loss/crossentropy": 2.333058714866638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23842789232730865, + "step": 11798 + }, + { + "epoch": 0.236, + "grad_norm": 2.015625, + "grad_norm_var": 0.0366363525390625, + "learning_rate": 0.0001, + "loss": 3.8698, + "loss/crossentropy": 1.9570570588111877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20660093426704407, + "step": 11800 + }, + { + "epoch": 0.23604, + "grad_norm": 2.0, + "grad_norm_var": 0.0335601806640625, + "learning_rate": 0.0001, + "loss": 4.3649, + "loss/crossentropy": 2.2074697017669678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23766764998435974, + "step": 11802 + }, + { + "epoch": 0.23608, + "grad_norm": 1.9921875, + "grad_norm_var": 0.03026301066080729, + "learning_rate": 0.0001, + "loss": 4.4127, + "loss/crossentropy": 2.327863335609436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22967173904180527, + "step": 11804 + }, + { + "epoch": 0.23612, + "grad_norm": 2.03125, + "grad_norm_var": 0.02802734375, + "learning_rate": 0.0001, + "loss": 4.3367, + "loss/crossentropy": 2.135041356086731, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21708428114652634, + "step": 11806 + }, + { + "epoch": 0.23616, + "grad_norm": 2.078125, + "grad_norm_var": 0.00665283203125, + "learning_rate": 0.0001, + "loss": 3.9805, + "loss/crossentropy": 1.9111011624336243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20122328400611877, + "step": 11808 + }, + { + "epoch": 0.2362, + "grad_norm": 1.984375, + "grad_norm_var": 0.006436920166015625, + "learning_rate": 0.0001, + "loss": 4.19, + "loss/crossentropy": 2.0823878049850464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23593349009752274, + "step": 11810 + }, + { + "epoch": 0.23624, + "grad_norm": 2.078125, + "grad_norm_var": 0.0069000244140625, + "learning_rate": 0.0001, + "loss": 3.9554, + "loss/crossentropy": 2.298948645591736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22076356410980225, + "step": 11812 + }, + { + "epoch": 0.23628, + "grad_norm": 1.96875, + "grad_norm_var": 0.006483713785807292, + "learning_rate": 0.0001, + "loss": 4.1119, + "loss/crossentropy": 2.383033037185669, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20850034803152084, + "step": 11814 + }, + { + "epoch": 0.23632, + "grad_norm": 2.109375, + "grad_norm_var": 0.10857645670572917, + "learning_rate": 0.0001, + "loss": 4.3648, + "loss/crossentropy": 2.004193425178528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27853623032569885, + "step": 11816 + }, + { + "epoch": 0.23636, + "grad_norm": 2.515625, + "grad_norm_var": 0.11877339680989583, + "learning_rate": 0.0001, + "loss": 4.4454, + "loss/crossentropy": 1.9110660552978516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21773407608270645, + "step": 11818 + }, + { + "epoch": 0.2364, + "grad_norm": 2.140625, + "grad_norm_var": 0.11894505818684896, + "learning_rate": 0.0001, + "loss": 4.5428, + "loss/crossentropy": 2.300672471523285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23710736632347107, + "step": 11820 + }, + { + "epoch": 0.23644, + "grad_norm": 1.984375, + "grad_norm_var": 0.11943333943684896, + "learning_rate": 0.0001, + "loss": 4.1413, + "loss/crossentropy": 2.270769238471985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2182588130235672, + "step": 11822 + }, + { + "epoch": 0.23648, + "grad_norm": 2.078125, + "grad_norm_var": 0.11885960896809895, + "learning_rate": 0.0001, + "loss": 4.2744, + "loss/crossentropy": 2.085016667842865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19494660943746567, + "step": 11824 + }, + { + "epoch": 0.23652, + "grad_norm": 2.078125, + "grad_norm_var": 0.11644261678059896, + "learning_rate": 0.0001, + "loss": 4.5554, + "loss/crossentropy": 2.384607672691345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22075054794549942, + "step": 11826 + }, + { + "epoch": 0.23656, + "grad_norm": 1.8984375, + "grad_norm_var": 0.11926167805989583, + "learning_rate": 0.0001, + "loss": 3.9076, + "loss/crossentropy": 2.1217936277389526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2126227766275406, + "step": 11828 + }, + { + "epoch": 0.2366, + "grad_norm": 1.875, + "grad_norm_var": 0.12241185506184896, + "learning_rate": 0.0001, + "loss": 3.8285, + "loss/crossentropy": 2.1690168380737305, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20450890809297562, + "step": 11830 + }, + { + "epoch": 0.23664, + "grad_norm": 1.9765625, + "grad_norm_var": 0.031998697916666666, + "learning_rate": 0.0001, + "loss": 4.2817, + "loss/crossentropy": 2.2614429593086243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22817185521125793, + "step": 11832 + }, + { + "epoch": 0.23668, + "grad_norm": 1.9296875, + "grad_norm_var": 0.023545074462890624, + "learning_rate": 0.0001, + "loss": 4.3074, + "loss/crossentropy": 2.177332043647766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.226592555642128, + "step": 11834 + }, + { + "epoch": 0.23672, + "grad_norm": 1.859375, + "grad_norm_var": 0.01765925089518229, + "learning_rate": 0.0001, + "loss": 4.0471, + "loss/crossentropy": 1.947661578655243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1762884557247162, + "step": 11836 + }, + { + "epoch": 0.23676, + "grad_norm": 2.015625, + "grad_norm_var": 0.018184153238932292, + "learning_rate": 0.0001, + "loss": 3.9014, + "loss/crossentropy": 1.7112661004066467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18696465343236923, + "step": 11838 + }, + { + "epoch": 0.2368, + "grad_norm": 2.078125, + "grad_norm_var": 0.017439524332682293, + "learning_rate": 0.0001, + "loss": 4.1761, + "loss/crossentropy": 2.0478790402412415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21436551213264465, + "step": 11840 + }, + { + "epoch": 0.23684, + "grad_norm": 2.078125, + "grad_norm_var": 0.01718724568684896, + "learning_rate": 0.0001, + "loss": 4.1702, + "loss/crossentropy": 2.0541738867759705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23121927678585052, + "step": 11842 + }, + { + "epoch": 0.23688, + "grad_norm": 2.109375, + "grad_norm_var": 0.01876805623372396, + "learning_rate": 0.0001, + "loss": 4.208, + "loss/crossentropy": 2.04353004693985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22237974405288696, + "step": 11844 + }, + { + "epoch": 0.23692, + "grad_norm": 2.1875, + "grad_norm_var": 0.017488606770833335, + "learning_rate": 0.0001, + "loss": 4.4641, + "loss/crossentropy": 2.110148549079895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23324476927518845, + "step": 11846 + }, + { + "epoch": 0.23696, + "grad_norm": 1.9140625, + "grad_norm_var": 0.018623860677083333, + "learning_rate": 0.0001, + "loss": 4.0831, + "loss/crossentropy": 2.207589864730835, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.209608756005764, + "step": 11848 + }, + { + "epoch": 0.237, + "grad_norm": 1.9140625, + "grad_norm_var": 0.012180328369140625, + "learning_rate": 0.0001, + "loss": 4.1896, + "loss/crossentropy": 2.2447429895401, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2016080766916275, + "step": 11850 + }, + { + "epoch": 0.23704, + "grad_norm": 2.0625, + "grad_norm_var": 0.009771474202473958, + "learning_rate": 0.0001, + "loss": 4.3133, + "loss/crossentropy": 2.3349474668502808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22105249762535095, + "step": 11852 + }, + { + "epoch": 0.23708, + "grad_norm": 1.9765625, + "grad_norm_var": 0.008024088541666667, + "learning_rate": 0.0001, + "loss": 4.1091, + "loss/crossentropy": 1.8444748520851135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1993526816368103, + "step": 11854 + }, + { + "epoch": 0.23712, + "grad_norm": 2.109375, + "grad_norm_var": 0.009323883056640624, + "learning_rate": 0.0001, + "loss": 4.0758, + "loss/crossentropy": 1.9181615710258484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18698236346244812, + "step": 11856 + }, + { + "epoch": 0.23716, + "grad_norm": 2.15625, + "grad_norm_var": 0.009627024332682291, + "learning_rate": 0.0001, + "loss": 4.1454, + "loss/crossentropy": 2.1930960416793823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2122444212436676, + "step": 11858 + }, + { + "epoch": 0.2372, + "grad_norm": 2.171875, + "grad_norm_var": 0.009439849853515625, + "learning_rate": 0.0001, + "loss": 4.3177, + "loss/crossentropy": 1.7835432887077332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20900961011648178, + "step": 11860 + }, + { + "epoch": 0.23724, + "grad_norm": 2.15625, + "grad_norm_var": 0.011775461832682292, + "learning_rate": 0.0001, + "loss": 4.6362, + "loss/crossentropy": 2.1839439868927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21574077755212784, + "step": 11862 + }, + { + "epoch": 0.23728, + "grad_norm": 1.96875, + "grad_norm_var": 0.010791015625, + "learning_rate": 0.0001, + "loss": 3.8916, + "loss/crossentropy": 1.9817007184028625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21773921698331833, + "step": 11864 + }, + { + "epoch": 0.23732, + "grad_norm": 1.9921875, + "grad_norm_var": 0.008923085530598958, + "learning_rate": 0.0001, + "loss": 4.2065, + "loss/crossentropy": 2.017501652240753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2247873619198799, + "step": 11866 + }, + { + "epoch": 0.23736, + "grad_norm": 1.9140625, + "grad_norm_var": 0.010992177327473958, + "learning_rate": 0.0001, + "loss": 3.8813, + "loss/crossentropy": 2.0779114961624146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21136770397424698, + "step": 11868 + }, + { + "epoch": 0.2374, + "grad_norm": 2.109375, + "grad_norm_var": 0.010497029622395833, + "learning_rate": 0.0001, + "loss": 4.341, + "loss/crossentropy": 2.3987231254577637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2326417714357376, + "step": 11870 + }, + { + "epoch": 0.23744, + "grad_norm": 2.0625, + "grad_norm_var": 0.008975982666015625, + "learning_rate": 0.0001, + "loss": 4.0466, + "loss/crossentropy": 1.7145346999168396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19222228974103928, + "step": 11872 + }, + { + "epoch": 0.23748, + "grad_norm": 2.015625, + "grad_norm_var": 0.008829498291015625, + "learning_rate": 0.0001, + "loss": 4.2683, + "loss/crossentropy": 2.378560423851013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21374844759702682, + "step": 11874 + }, + { + "epoch": 0.23752, + "grad_norm": 2.0625, + "grad_norm_var": 0.0073626200358072914, + "learning_rate": 0.0001, + "loss": 4.2548, + "loss/crossentropy": 1.951455295085907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2217455804347992, + "step": 11876 + }, + { + "epoch": 0.23756, + "grad_norm": 1.8671875, + "grad_norm_var": 0.00513916015625, + "learning_rate": 0.0001, + "loss": 3.8867, + "loss/crossentropy": 1.9309074878692627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2100483626127243, + "step": 11878 + }, + { + "epoch": 0.2376, + "grad_norm": 2.015625, + "grad_norm_var": 0.00445556640625, + "learning_rate": 0.0001, + "loss": 4.102, + "loss/crossentropy": 2.2295292615890503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2196313813328743, + "step": 11880 + }, + { + "epoch": 0.23764, + "grad_norm": 1.9453125, + "grad_norm_var": 0.004964192708333333, + "learning_rate": 0.0001, + "loss": 4.0514, + "loss/crossentropy": 2.0665449500083923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20752248913049698, + "step": 11882 + }, + { + "epoch": 0.23768, + "grad_norm": 2.1875, + "grad_norm_var": 0.00560302734375, + "learning_rate": 0.0001, + "loss": 4.4022, + "loss/crossentropy": 1.9645958542823792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20391641557216644, + "step": 11884 + }, + { + "epoch": 0.23772, + "grad_norm": 1.984375, + "grad_norm_var": 0.005956013997395833, + "learning_rate": 0.0001, + "loss": 4.1698, + "loss/crossentropy": 2.1280174255371094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21947231888771057, + "step": 11886 + }, + { + "epoch": 0.23776, + "grad_norm": 2.03125, + "grad_norm_var": 0.0073964436848958336, + "learning_rate": 0.0001, + "loss": 4.2043, + "loss/crossentropy": 1.9462851285934448, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1961178332567215, + "step": 11888 + }, + { + "epoch": 0.2378, + "grad_norm": 2.203125, + "grad_norm_var": 0.009422810872395833, + "learning_rate": 0.0001, + "loss": 4.4846, + "loss/crossentropy": 2.249086618423462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22473402321338654, + "step": 11890 + }, + { + "epoch": 0.23784, + "grad_norm": 1.9375, + "grad_norm_var": 0.010472615559895834, + "learning_rate": 0.0001, + "loss": 4.1368, + "loss/crossentropy": 2.1972378492355347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2164871245622635, + "step": 11892 + }, + { + "epoch": 0.23788, + "grad_norm": 2.15625, + "grad_norm_var": 0.009409332275390625, + "learning_rate": 0.0001, + "loss": 4.228, + "loss/crossentropy": 2.0067209601402283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20195162296295166, + "step": 11894 + }, + { + "epoch": 0.23792, + "grad_norm": 2.03125, + "grad_norm_var": 0.009673817952473959, + "learning_rate": 0.0001, + "loss": 4.1767, + "loss/crossentropy": 1.9103696942329407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19616805016994476, + "step": 11896 + }, + { + "epoch": 0.23796, + "grad_norm": 1.984375, + "grad_norm_var": 0.011595662434895833, + "learning_rate": 0.0001, + "loss": 4.228, + "loss/crossentropy": 2.1524049639701843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21107815951108932, + "step": 11898 + }, + { + "epoch": 0.238, + "grad_norm": 2.0, + "grad_norm_var": 0.0110260009765625, + "learning_rate": 0.0001, + "loss": 4.183, + "loss/crossentropy": 2.2112287878990173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2099093720316887, + "step": 11900 + }, + { + "epoch": 0.23804, + "grad_norm": 2.390625, + "grad_norm_var": 0.8914347330729167, + "learning_rate": 0.0001, + "loss": 4.647, + "loss/crossentropy": 2.2945204973220825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2713918164372444, + "step": 11902 + }, + { + "epoch": 0.23808, + "grad_norm": 2.078125, + "grad_norm_var": 0.8781575520833333, + "learning_rate": 0.0001, + "loss": 4.4699, + "loss/crossentropy": 2.1838968992233276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22640438377857208, + "step": 11904 + }, + { + "epoch": 0.23812, + "grad_norm": 2.0, + "grad_norm_var": 0.89010009765625, + "learning_rate": 0.0001, + "loss": 4.0843, + "loss/crossentropy": 1.9089699983596802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19553899765014648, + "step": 11906 + }, + { + "epoch": 0.23816, + "grad_norm": 2.03125, + "grad_norm_var": 0.8873443603515625, + "learning_rate": 0.0001, + "loss": 4.2573, + "loss/crossentropy": 2.105385661125183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21528839319944382, + "step": 11908 + }, + { + "epoch": 0.2382, + "grad_norm": 2.015625, + "grad_norm_var": 0.8921946207682292, + "learning_rate": 0.0001, + "loss": 4.0381, + "loss/crossentropy": 1.565223515033722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17765602469444275, + "step": 11910 + }, + { + "epoch": 0.23824, + "grad_norm": 2.078125, + "grad_norm_var": 0.8919016520182291, + "learning_rate": 0.0001, + "loss": 4.3014, + "loss/crossentropy": 2.220748543739319, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21904101967811584, + "step": 11912 + }, + { + "epoch": 0.23828, + "grad_norm": 2.15625, + "grad_norm_var": 0.889306640625, + "learning_rate": 0.0001, + "loss": 4.2965, + "loss/crossentropy": 2.294031500816345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21700909733772278, + "step": 11914 + }, + { + "epoch": 0.23832, + "grad_norm": 2.171875, + "grad_norm_var": 0.8795237223307292, + "learning_rate": 0.0001, + "loss": 4.3439, + "loss/crossentropy": 2.072917103767395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20462248474359512, + "step": 11916 + }, + { + "epoch": 0.23836, + "grad_norm": 2.03125, + "grad_norm_var": 0.00533447265625, + "learning_rate": 0.0001, + "loss": 4.1843, + "loss/crossentropy": 2.0541720390319824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2127074897289276, + "step": 11918 + }, + { + "epoch": 0.2384, + "grad_norm": 2.109375, + "grad_norm_var": 0.0046539306640625, + "learning_rate": 0.0001, + "loss": 3.8922, + "loss/crossentropy": 1.9873629808425903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19215020537376404, + "step": 11920 + }, + { + "epoch": 0.23844, + "grad_norm": 2.09375, + "grad_norm_var": 0.0032135009765625, + "learning_rate": 0.0001, + "loss": 4.1943, + "loss/crossentropy": 2.447067141532898, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2318919375538826, + "step": 11922 + }, + { + "epoch": 0.23848, + "grad_norm": 2.0, + "grad_norm_var": 0.0038157145182291666, + "learning_rate": 0.0001, + "loss": 3.8756, + "loss/crossentropy": 1.9918025732040405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20865648984909058, + "step": 11924 + }, + { + "epoch": 0.23852, + "grad_norm": 2.046875, + "grad_norm_var": 0.0026041666666666665, + "learning_rate": 0.0001, + "loss": 4.3243, + "loss/crossentropy": 2.0803070068359375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21753622591495514, + "step": 11926 + }, + { + "epoch": 0.23856, + "grad_norm": 1.9921875, + "grad_norm_var": 0.002976226806640625, + "learning_rate": 0.0001, + "loss": 4.1828, + "loss/crossentropy": 2.118411421775818, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21741919964551926, + "step": 11928 + }, + { + "epoch": 0.2386, + "grad_norm": 1.9375, + "grad_norm_var": 0.0034075419108072916, + "learning_rate": 0.0001, + "loss": 4.0056, + "loss/crossentropy": 1.8924900889396667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20609601587057114, + "step": 11930 + }, + { + "epoch": 0.23864, + "grad_norm": 2.046875, + "grad_norm_var": 0.002418772379557292, + "learning_rate": 0.0001, + "loss": 4.0012, + "loss/crossentropy": 1.743731439113617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19873473048210144, + "step": 11932 + }, + { + "epoch": 0.23868, + "grad_norm": 2.015625, + "grad_norm_var": 0.002929433186848958, + "learning_rate": 0.0001, + "loss": 4.2564, + "loss/crossentropy": 2.291381061077118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21248165518045425, + "step": 11934 + }, + { + "epoch": 0.23872, + "grad_norm": 2.109375, + "grad_norm_var": 0.007452138264973958, + "learning_rate": 0.0001, + "loss": 4.3938, + "loss/crossentropy": 1.7672501802444458, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19201595336198807, + "step": 11936 + }, + { + "epoch": 0.23876, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008162434895833333, + "learning_rate": 0.0001, + "loss": 3.8693, + "loss/crossentropy": 1.8719280362129211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1960783526301384, + "step": 11938 + }, + { + "epoch": 0.2388, + "grad_norm": 2.28125, + "grad_norm_var": 0.010978190104166667, + "learning_rate": 0.0001, + "loss": 4.3345, + "loss/crossentropy": 1.8103876113891602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1964937001466751, + "step": 11940 + }, + { + "epoch": 0.23884, + "grad_norm": 2.03125, + "grad_norm_var": 0.011901601155598959, + "learning_rate": 0.0001, + "loss": 4.2134, + "loss/crossentropy": 1.8743855953216553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19404225796461105, + "step": 11942 + }, + { + "epoch": 0.23888, + "grad_norm": 1.9453125, + "grad_norm_var": 0.012562815348307292, + "learning_rate": 0.0001, + "loss": 4.2379, + "loss/crossentropy": 2.3096803426742554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23256508260965347, + "step": 11944 + }, + { + "epoch": 0.23892, + "grad_norm": 2.109375, + "grad_norm_var": 0.012123362223307291, + "learning_rate": 0.0001, + "loss": 4.5089, + "loss/crossentropy": 2.207027554512024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20778407156467438, + "step": 11946 + }, + { + "epoch": 0.23896, + "grad_norm": 2.015625, + "grad_norm_var": 0.011671702067057291, + "learning_rate": 0.0001, + "loss": 4.0423, + "loss/crossentropy": 1.845999002456665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19381015002727509, + "step": 11948 + }, + { + "epoch": 0.239, + "grad_norm": 2.03125, + "grad_norm_var": 0.011628977457682292, + "learning_rate": 0.0001, + "loss": 4.1054, + "loss/crossentropy": 1.677983045578003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17493421584367752, + "step": 11950 + }, + { + "epoch": 0.23904, + "grad_norm": 2.140625, + "grad_norm_var": 0.008107248942057292, + "learning_rate": 0.0001, + "loss": 4.1729, + "loss/crossentropy": 2.1798466444015503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22395183145999908, + "step": 11952 + }, + { + "epoch": 0.23908, + "grad_norm": 1.9140625, + "grad_norm_var": 0.013525390625, + "learning_rate": 0.0001, + "loss": 4.0162, + "loss/crossentropy": 2.008498191833496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21069375425577164, + "step": 11954 + }, + { + "epoch": 0.23912, + "grad_norm": 2.140625, + "grad_norm_var": 0.014546712239583334, + "learning_rate": 0.0001, + "loss": 4.3736, + "loss/crossentropy": 2.1712071895599365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21842175722122192, + "step": 11956 + }, + { + "epoch": 0.23916, + "grad_norm": 2.578125, + "grad_norm_var": 0.03242365519205729, + "learning_rate": 0.0001, + "loss": 4.1338, + "loss/crossentropy": 2.2769562005996704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22064895182847977, + "step": 11958 + }, + { + "epoch": 0.2392, + "grad_norm": 2.09375, + "grad_norm_var": 0.032572428385416664, + "learning_rate": 0.0001, + "loss": 4.3401, + "loss/crossentropy": 2.127632260322571, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2139139398932457, + "step": 11960 + }, + { + "epoch": 0.23924, + "grad_norm": 2.046875, + "grad_norm_var": 0.03223368326822917, + "learning_rate": 0.0001, + "loss": 4.3547, + "loss/crossentropy": 2.2687970399856567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23024404793977737, + "step": 11962 + }, + { + "epoch": 0.23928, + "grad_norm": 2.0625, + "grad_norm_var": 0.032136027018229166, + "learning_rate": 0.0001, + "loss": 4.3532, + "loss/crossentropy": 2.2271196246147156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22528529912233353, + "step": 11964 + }, + { + "epoch": 0.23932, + "grad_norm": 2.03125, + "grad_norm_var": 0.032136027018229166, + "learning_rate": 0.0001, + "loss": 4.1898, + "loss/crossentropy": 2.3209941387176514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21150042116641998, + "step": 11966 + }, + { + "epoch": 0.23936, + "grad_norm": 2.046875, + "grad_norm_var": 0.03208719889322917, + "learning_rate": 0.0001, + "loss": 4.197, + "loss/crossentropy": 1.9936136603355408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22582116723060608, + "step": 11968 + }, + { + "epoch": 0.2394, + "grad_norm": 2.03125, + "grad_norm_var": 0.024055989583333333, + "learning_rate": 0.0001, + "loss": 3.9045, + "loss/crossentropy": 1.7509311437606812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19028881192207336, + "step": 11970 + }, + { + "epoch": 0.23944, + "grad_norm": 2.140625, + "grad_norm_var": 0.02213134765625, + "learning_rate": 0.0001, + "loss": 4.2365, + "loss/crossentropy": 2.2120620012283325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23357221484184265, + "step": 11972 + }, + { + "epoch": 0.23948, + "grad_norm": 2.109375, + "grad_norm_var": 0.005686187744140625, + "learning_rate": 0.0001, + "loss": 4.0129, + "loss/crossentropy": 1.794329285621643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17914240062236786, + "step": 11974 + }, + { + "epoch": 0.23952, + "grad_norm": 2.03125, + "grad_norm_var": 0.010550689697265626, + "learning_rate": 0.0001, + "loss": 4.1347, + "loss/crossentropy": 2.1647136211395264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21009670197963715, + "step": 11976 + }, + { + "epoch": 0.23956, + "grad_norm": 2.046875, + "grad_norm_var": 0.010660552978515625, + "learning_rate": 0.0001, + "loss": 4.2365, + "loss/crossentropy": 1.674091637134552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19617585837841034, + "step": 11978 + }, + { + "epoch": 0.2396, + "grad_norm": 2.015625, + "grad_norm_var": 0.011739095052083334, + "learning_rate": 0.0001, + "loss": 4.2474, + "loss/crossentropy": 1.9591755867004395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18309858441352844, + "step": 11980 + }, + { + "epoch": 0.23964, + "grad_norm": 2.34375, + "grad_norm_var": 0.017862955729166668, + "learning_rate": 0.0001, + "loss": 4.3642, + "loss/crossentropy": 2.0550093054771423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22483228147029877, + "step": 11982 + }, + { + "epoch": 0.23968, + "grad_norm": 2.203125, + "grad_norm_var": 0.019291178385416666, + "learning_rate": 0.0001, + "loss": 4.3937, + "loss/crossentropy": 2.264032781124115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2101321816444397, + "step": 11984 + }, + { + "epoch": 0.23972, + "grad_norm": 2.015625, + "grad_norm_var": 0.020213826497395834, + "learning_rate": 0.0001, + "loss": 4.4341, + "loss/crossentropy": 2.06082820892334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2090815082192421, + "step": 11986 + }, + { + "epoch": 0.23976, + "grad_norm": 2.0, + "grad_norm_var": 0.0197662353515625, + "learning_rate": 0.0001, + "loss": 4.4159, + "loss/crossentropy": 2.248009443283081, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22642739117145538, + "step": 11988 + }, + { + "epoch": 0.2398, + "grad_norm": 1.9140625, + "grad_norm_var": 0.02211278279622396, + "learning_rate": 0.0001, + "loss": 3.9601, + "loss/crossentropy": 1.797426462173462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18600185215473175, + "step": 11990 + }, + { + "epoch": 0.23984, + "grad_norm": 2.015625, + "grad_norm_var": 0.01573460896809896, + "learning_rate": 0.0001, + "loss": 4.1666, + "loss/crossentropy": 2.5529314279556274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23957456648349762, + "step": 11992 + }, + { + "epoch": 0.23988, + "grad_norm": 2.125, + "grad_norm_var": 0.015933990478515625, + "learning_rate": 0.0001, + "loss": 4.2971, + "loss/crossentropy": 1.9974916577339172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23130090534687042, + "step": 11994 + }, + { + "epoch": 0.23992, + "grad_norm": 2.046875, + "grad_norm_var": 0.014679972330729167, + "learning_rate": 0.0001, + "loss": 4.0358, + "loss/crossentropy": 1.9714577794075012, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1969093456864357, + "step": 11996 + }, + { + "epoch": 0.23996, + "grad_norm": 2.25, + "grad_norm_var": 0.011055501302083333, + "learning_rate": 0.0001, + "loss": 4.282, + "loss/crossentropy": 1.8047285079956055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19191773235797882, + "step": 11998 + }, + { + "epoch": 0.24, + "grad_norm": 2.09375, + "grad_norm_var": 0.009187825520833333, + "learning_rate": 0.0001, + "loss": 4.4078, + "loss/crossentropy": 1.9240365028381348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19021467864513397, + "step": 12000 + }, + { + "epoch": 0.24004, + "grad_norm": 2.125, + "grad_norm_var": 0.009041086832682291, + "learning_rate": 0.0001, + "loss": 4.1657, + "loss/crossentropy": 1.6503748297691345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18838192522525787, + "step": 12002 + }, + { + "epoch": 0.24008, + "grad_norm": 2.109375, + "grad_norm_var": 0.009368642171223959, + "learning_rate": 0.0001, + "loss": 4.3451, + "loss/crossentropy": 2.238506555557251, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2139800265431404, + "step": 12004 + }, + { + "epoch": 0.24012, + "grad_norm": 1.890625, + "grad_norm_var": 0.008695220947265625, + "learning_rate": 0.0001, + "loss": 4.0978, + "loss/crossentropy": 1.8623422384262085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19039735198020935, + "step": 12006 + }, + { + "epoch": 0.24016, + "grad_norm": 2.109375, + "grad_norm_var": 0.0096588134765625, + "learning_rate": 0.0001, + "loss": 3.9741, + "loss/crossentropy": 1.975899577140808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20808710902929306, + "step": 12008 + }, + { + "epoch": 0.2402, + "grad_norm": 2.15625, + "grad_norm_var": 0.0108642578125, + "learning_rate": 0.0001, + "loss": 4.4332, + "loss/crossentropy": 2.2959831953048706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23725779354572296, + "step": 12010 + }, + { + "epoch": 0.24024, + "grad_norm": 1.8515625, + "grad_norm_var": 0.015130360921223959, + "learning_rate": 0.0001, + "loss": 3.9117, + "loss/crossentropy": 2.1626380681991577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2068287953734398, + "step": 12012 + }, + { + "epoch": 0.24028, + "grad_norm": 2.15625, + "grad_norm_var": 0.013474273681640624, + "learning_rate": 0.0001, + "loss": 4.1614, + "loss/crossentropy": 1.925924837589264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2097126841545105, + "step": 12014 + }, + { + "epoch": 0.24032, + "grad_norm": 2.046875, + "grad_norm_var": 0.018302154541015626, + "learning_rate": 0.0001, + "loss": 4.5545, + "loss/crossentropy": 2.147459626197815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2345646619796753, + "step": 12016 + }, + { + "epoch": 0.24036, + "grad_norm": 1.8359375, + "grad_norm_var": 0.021418253580729168, + "learning_rate": 0.0001, + "loss": 4.0219, + "loss/crossentropy": 2.052124857902527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2058626338839531, + "step": 12018 + }, + { + "epoch": 0.2404, + "grad_norm": 1.8828125, + "grad_norm_var": 0.021329752604166665, + "learning_rate": 0.0001, + "loss": 3.8619, + "loss/crossentropy": 2.377007842063904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2124926745891571, + "step": 12020 + }, + { + "epoch": 0.24044, + "grad_norm": 2.0, + "grad_norm_var": 0.020173136393229166, + "learning_rate": 0.0001, + "loss": 3.9542, + "loss/crossentropy": 1.9117819666862488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1976870447397232, + "step": 12022 + }, + { + "epoch": 0.24048, + "grad_norm": 2.09375, + "grad_norm_var": 0.021478017171223957, + "learning_rate": 0.0001, + "loss": 4.5771, + "loss/crossentropy": 2.439339756965637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22178436815738678, + "step": 12024 + }, + { + "epoch": 0.24052, + "grad_norm": 2.140625, + "grad_norm_var": 0.02029596964518229, + "learning_rate": 0.0001, + "loss": 4.1182, + "loss/crossentropy": 1.6923209428787231, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19288206100463867, + "step": 12026 + }, + { + "epoch": 0.24056, + "grad_norm": 1.9921875, + "grad_norm_var": 0.016434478759765624, + "learning_rate": 0.0001, + "loss": 4.1733, + "loss/crossentropy": 2.071919083595276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21092405915260315, + "step": 12028 + }, + { + "epoch": 0.2406, + "grad_norm": 1.984375, + "grad_norm_var": 0.016993967692057292, + "learning_rate": 0.0001, + "loss": 3.9688, + "loss/crossentropy": 1.994953691959381, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19953829050064087, + "step": 12030 + }, + { + "epoch": 0.24064, + "grad_norm": 2.109375, + "grad_norm_var": 0.011201731363932292, + "learning_rate": 0.0001, + "loss": 4.3252, + "loss/crossentropy": 2.0545560121536255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20861412584781647, + "step": 12032 + }, + { + "epoch": 0.24068, + "grad_norm": 1.8828125, + "grad_norm_var": 0.0103424072265625, + "learning_rate": 0.0001, + "loss": 4.0859, + "loss/crossentropy": 2.0211291909217834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2146567404270172, + "step": 12034 + }, + { + "epoch": 0.24072, + "grad_norm": 2.0625, + "grad_norm_var": 0.012312825520833333, + "learning_rate": 0.0001, + "loss": 4.3156, + "loss/crossentropy": 2.165773868560791, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21522508561611176, + "step": 12036 + }, + { + "epoch": 0.24076, + "grad_norm": 1.9609375, + "grad_norm_var": 0.013216145833333333, + "learning_rate": 0.0001, + "loss": 4.2253, + "loss/crossentropy": 2.0272024273872375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20012138038873672, + "step": 12038 + }, + { + "epoch": 0.2408, + "grad_norm": 1.9453125, + "grad_norm_var": 0.011946360270182291, + "learning_rate": 0.0001, + "loss": 4.2419, + "loss/crossentropy": 1.9311429262161255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19755128771066666, + "step": 12040 + }, + { + "epoch": 0.24084, + "grad_norm": 2.0625, + "grad_norm_var": 0.012086741129557292, + "learning_rate": 0.0001, + "loss": 4.1857, + "loss/crossentropy": 2.0740894079208374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20651141554117203, + "step": 12042 + }, + { + "epoch": 0.24088, + "grad_norm": 1.96875, + "grad_norm_var": 0.012876129150390625, + "learning_rate": 0.0001, + "loss": 3.9882, + "loss/crossentropy": 2.215203881263733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21735452860593796, + "step": 12044 + }, + { + "epoch": 0.24092, + "grad_norm": 2.015625, + "grad_norm_var": 0.02371190388997396, + "learning_rate": 0.0001, + "loss": 4.3755, + "loss/crossentropy": 1.9994693994522095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22001128643751144, + "step": 12046 + }, + { + "epoch": 0.24096, + "grad_norm": 2.15625, + "grad_norm_var": 0.025187174479166668, + "learning_rate": 0.0001, + "loss": 4.1145, + "loss/crossentropy": 2.195701003074646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20777013897895813, + "step": 12048 + }, + { + "epoch": 0.241, + "grad_norm": 2.109375, + "grad_norm_var": 0.02474950154622396, + "learning_rate": 0.0001, + "loss": 4.3486, + "loss/crossentropy": 2.1975715160369873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21987678855657578, + "step": 12050 + }, + { + "epoch": 0.24104, + "grad_norm": 1.953125, + "grad_norm_var": 0.02188695271809896, + "learning_rate": 0.0001, + "loss": 4.1337, + "loss/crossentropy": 2.225023865699768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24454358220100403, + "step": 12052 + }, + { + "epoch": 0.24108, + "grad_norm": 1.9296875, + "grad_norm_var": 0.022200520833333334, + "learning_rate": 0.0001, + "loss": 4.0404, + "loss/crossentropy": 1.892149806022644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2110423520207405, + "step": 12054 + }, + { + "epoch": 0.24112, + "grad_norm": 1.8984375, + "grad_norm_var": 0.023579915364583332, + "learning_rate": 0.0001, + "loss": 4.1604, + "loss/crossentropy": 1.9996158480644226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1821269765496254, + "step": 12056 + }, + { + "epoch": 0.24116, + "grad_norm": 2.109375, + "grad_norm_var": 0.02569580078125, + "learning_rate": 0.0001, + "loss": 4.0591, + "loss/crossentropy": 2.0965115427970886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2140883505344391, + "step": 12058 + }, + { + "epoch": 0.2412, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0256500244140625, + "learning_rate": 0.0001, + "loss": 3.9494, + "loss/crossentropy": 1.9169449210166931, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2034926936030388, + "step": 12060 + }, + { + "epoch": 0.24124, + "grad_norm": 2.0625, + "grad_norm_var": 0.015135701497395833, + "learning_rate": 0.0001, + "loss": 4.5, + "loss/crossentropy": 2.3266680240631104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24314038455486298, + "step": 12062 + }, + { + "epoch": 0.24128, + "grad_norm": 1.9921875, + "grad_norm_var": 0.013509114583333334, + "learning_rate": 0.0001, + "loss": 4.1009, + "loss/crossentropy": 2.0337759256362915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20842785388231277, + "step": 12064 + }, + { + "epoch": 0.24132, + "grad_norm": 1.828125, + "grad_norm_var": 0.013618977864583333, + "learning_rate": 0.0001, + "loss": 3.9186, + "loss/crossentropy": 1.9502894878387451, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1812899112701416, + "step": 12066 + }, + { + "epoch": 0.24136, + "grad_norm": 1.9765625, + "grad_norm_var": 0.013822174072265625, + "learning_rate": 0.0001, + "loss": 4.269, + "loss/crossentropy": 2.13326895236969, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21872679144144058, + "step": 12068 + }, + { + "epoch": 0.2414, + "grad_norm": 2.125, + "grad_norm_var": 0.013304646809895833, + "learning_rate": 0.0001, + "loss": 4.3594, + "loss/crossentropy": 2.0836809873580933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.212929405272007, + "step": 12070 + }, + { + "epoch": 0.24144, + "grad_norm": 2.03125, + "grad_norm_var": 0.012516276041666666, + "learning_rate": 0.0001, + "loss": 4.0467, + "loss/crossentropy": 2.207913398742676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20640508085489273, + "step": 12072 + }, + { + "epoch": 0.24148, + "grad_norm": 2.03125, + "grad_norm_var": 0.01080322265625, + "learning_rate": 0.0001, + "loss": 4.1877, + "loss/crossentropy": 2.0770451426506042, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20633003860712051, + "step": 12074 + }, + { + "epoch": 0.24152, + "grad_norm": 2.0625, + "grad_norm_var": 0.010786946614583333, + "learning_rate": 0.0001, + "loss": 4.2237, + "loss/crossentropy": 2.0042858719825745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18523608148097992, + "step": 12076 + }, + { + "epoch": 0.24156, + "grad_norm": 1.9453125, + "grad_norm_var": 0.010141754150390625, + "learning_rate": 0.0001, + "loss": 4.2755, + "loss/crossentropy": 1.8673237562179565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19267796725034714, + "step": 12078 + }, + { + "epoch": 0.2416, + "grad_norm": 2.046875, + "grad_norm_var": 0.013793690999348959, + "learning_rate": 0.0001, + "loss": 3.9766, + "loss/crossentropy": 2.0686238408088684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19344990700483322, + "step": 12080 + }, + { + "epoch": 0.24164, + "grad_norm": 2.03125, + "grad_norm_var": 0.009504954020182291, + "learning_rate": 0.0001, + "loss": 4.0366, + "loss/crossentropy": 2.0069685578346252, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1971682757139206, + "step": 12082 + }, + { + "epoch": 0.24168, + "grad_norm": 2.0625, + "grad_norm_var": 0.009468587239583333, + "learning_rate": 0.0001, + "loss": 4.1975, + "loss/crossentropy": 2.2052754163742065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20646511763334274, + "step": 12084 + }, + { + "epoch": 0.24172, + "grad_norm": 2.078125, + "grad_norm_var": 0.008983357747395834, + "learning_rate": 0.0001, + "loss": 4.1362, + "loss/crossentropy": 2.0343621373176575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19575881958007812, + "step": 12086 + }, + { + "epoch": 0.24176, + "grad_norm": 1.890625, + "grad_norm_var": 0.009894816080729167, + "learning_rate": 0.0001, + "loss": 3.9695, + "loss/crossentropy": 2.2500641345977783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23151995986700058, + "step": 12088 + }, + { + "epoch": 0.2418, + "grad_norm": 2.421875, + "grad_norm_var": 0.0375640869140625, + "learning_rate": 0.0001, + "loss": 4.4361, + "loss/crossentropy": 2.2596821784973145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22305716574192047, + "step": 12090 + }, + { + "epoch": 0.24184, + "grad_norm": 1.9375, + "grad_norm_var": 0.03715184529622396, + "learning_rate": 0.0001, + "loss": 4.0381, + "loss/crossentropy": 2.0940088033676147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23409207165241241, + "step": 12092 + }, + { + "epoch": 0.24188, + "grad_norm": 2.046875, + "grad_norm_var": 0.03586018880208333, + "learning_rate": 0.0001, + "loss": 3.9715, + "loss/crossentropy": 1.6980834603309631, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19339510053396225, + "step": 12094 + }, + { + "epoch": 0.24192, + "grad_norm": 2.09375, + "grad_norm_var": 0.03047459920247396, + "learning_rate": 0.0001, + "loss": 4.192, + "loss/crossentropy": 2.1921679973602295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21085387468338013, + "step": 12096 + }, + { + "epoch": 0.24196, + "grad_norm": 2.34375, + "grad_norm_var": 0.03551839192708333, + "learning_rate": 0.0001, + "loss": 4.3551, + "loss/crossentropy": 2.18610817193985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2332247570157051, + "step": 12098 + }, + { + "epoch": 0.242, + "grad_norm": 2.203125, + "grad_norm_var": 0.038852691650390625, + "learning_rate": 0.0001, + "loss": 4.1216, + "loss/crossentropy": 2.08358097076416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21169763058423996, + "step": 12100 + }, + { + "epoch": 0.24204, + "grad_norm": 2.140625, + "grad_norm_var": 0.038913726806640625, + "learning_rate": 0.0001, + "loss": 4.2339, + "loss/crossentropy": 2.1001542806625366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2132650762796402, + "step": 12102 + }, + { + "epoch": 0.24208, + "grad_norm": 2.109375, + "grad_norm_var": 0.033614095052083334, + "learning_rate": 0.0001, + "loss": 4.2051, + "loss/crossentropy": 1.9031851887702942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2042345404624939, + "step": 12104 + }, + { + "epoch": 0.24212, + "grad_norm": 1.96875, + "grad_norm_var": 0.0127838134765625, + "learning_rate": 0.0001, + "loss": 4.0341, + "loss/crossentropy": 1.670085072517395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1878880336880684, + "step": 12106 + }, + { + "epoch": 0.24216, + "grad_norm": 2.09375, + "grad_norm_var": 0.012914784749348958, + "learning_rate": 0.0001, + "loss": 4.3395, + "loss/crossentropy": 1.9557109475135803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2211412936449051, + "step": 12108 + }, + { + "epoch": 0.2422, + "grad_norm": 1.96875, + "grad_norm_var": 0.013637034098307292, + "learning_rate": 0.0001, + "loss": 4.0107, + "loss/crossentropy": 2.085852086544037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21336784213781357, + "step": 12110 + }, + { + "epoch": 0.24224, + "grad_norm": 1.8671875, + "grad_norm_var": 0.015721638997395832, + "learning_rate": 0.0001, + "loss": 4.0321, + "loss/crossentropy": 2.1064602732658386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2096766158938408, + "step": 12112 + }, + { + "epoch": 0.24228, + "grad_norm": 2.109375, + "grad_norm_var": 0.009325917561848958, + "learning_rate": 0.0001, + "loss": 4.2998, + "loss/crossentropy": 2.381577968597412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23788201808929443, + "step": 12114 + }, + { + "epoch": 0.24232, + "grad_norm": 1.96875, + "grad_norm_var": 0.006758626302083333, + "learning_rate": 0.0001, + "loss": 4.0687, + "loss/crossentropy": 1.7200234532356262, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18620850890874863, + "step": 12116 + }, + { + "epoch": 0.24236, + "grad_norm": 2.1875, + "grad_norm_var": 0.007697550455729166, + "learning_rate": 0.0001, + "loss": 4.2672, + "loss/crossentropy": 1.9106165170669556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21456392109394073, + "step": 12118 + }, + { + "epoch": 0.2424, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007106272379557291, + "learning_rate": 0.0001, + "loss": 4.04, + "loss/crossentropy": 1.9852410554885864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1927378550171852, + "step": 12120 + }, + { + "epoch": 0.24244, + "grad_norm": 1.96875, + "grad_norm_var": 0.007991282145182292, + "learning_rate": 0.0001, + "loss": 4.1278, + "loss/crossentropy": 1.4948370456695557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1698828637599945, + "step": 12122 + }, + { + "epoch": 0.24248, + "grad_norm": 1.9375, + "grad_norm_var": 0.0078857421875, + "learning_rate": 0.0001, + "loss": 4.2874, + "loss/crossentropy": 1.956885814666748, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19851476699113846, + "step": 12124 + }, + { + "epoch": 0.24252, + "grad_norm": 2.171875, + "grad_norm_var": 0.009422810872395833, + "learning_rate": 0.0001, + "loss": 3.8709, + "loss/crossentropy": 1.9052257537841797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18438522517681122, + "step": 12126 + }, + { + "epoch": 0.24256, + "grad_norm": 2.1875, + "grad_norm_var": 0.008430735270182291, + "learning_rate": 0.0001, + "loss": 4.4347, + "loss/crossentropy": 2.353670358657837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22648276388645172, + "step": 12128 + }, + { + "epoch": 0.2426, + "grad_norm": 1.8203125, + "grad_norm_var": 0.01263427734375, + "learning_rate": 0.0001, + "loss": 3.9896, + "loss/crossentropy": 1.714030683040619, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19275055080652237, + "step": 12130 + }, + { + "epoch": 0.24264, + "grad_norm": 2.046875, + "grad_norm_var": 0.012898763020833334, + "learning_rate": 0.0001, + "loss": 4.2981, + "loss/crossentropy": 2.0048200488090515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20478381216526031, + "step": 12132 + }, + { + "epoch": 0.24268, + "grad_norm": 2.125, + "grad_norm_var": 0.011766560872395833, + "learning_rate": 0.0001, + "loss": 4.0497, + "loss/crossentropy": 1.7151115536689758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19872941821813583, + "step": 12134 + }, + { + "epoch": 0.24272, + "grad_norm": 2.109375, + "grad_norm_var": 0.011818186442057291, + "learning_rate": 0.0001, + "loss": 4.2753, + "loss/crossentropy": 2.1646838188171387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22345983982086182, + "step": 12136 + }, + { + "epoch": 0.24276, + "grad_norm": 2.046875, + "grad_norm_var": 0.010807037353515625, + "learning_rate": 0.0001, + "loss": 4.157, + "loss/crossentropy": 2.053311765193939, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2143322378396988, + "step": 12138 + }, + { + "epoch": 0.2428, + "grad_norm": 2.015625, + "grad_norm_var": 0.009993235270182291, + "learning_rate": 0.0001, + "loss": 3.8805, + "loss/crossentropy": 2.0471617579460144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20450014621019363, + "step": 12140 + }, + { + "epoch": 0.24284, + "grad_norm": 2.1875, + "grad_norm_var": 0.008737945556640625, + "learning_rate": 0.0001, + "loss": 4.2966, + "loss/crossentropy": 2.1727033853530884, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22905820608139038, + "step": 12142 + }, + { + "epoch": 0.24288, + "grad_norm": 2.09375, + "grad_norm_var": 0.008245595296223958, + "learning_rate": 0.0001, + "loss": 4.3861, + "loss/crossentropy": 2.197450280189514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22059273719787598, + "step": 12144 + }, + { + "epoch": 0.24292, + "grad_norm": 2.171875, + "grad_norm_var": 0.003413899739583333, + "learning_rate": 0.0001, + "loss": 4.4841, + "loss/crossentropy": 1.9954137206077576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22441548854112625, + "step": 12146 + }, + { + "epoch": 0.24296, + "grad_norm": 2.0, + "grad_norm_var": 0.005110677083333333, + "learning_rate": 0.0001, + "loss": 4.2113, + "loss/crossentropy": 2.3393132090568542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22949577867984772, + "step": 12148 + }, + { + "epoch": 0.243, + "grad_norm": 1.953125, + "grad_norm_var": 0.0067291259765625, + "learning_rate": 0.0001, + "loss": 4.146, + "loss/crossentropy": 2.11602646112442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19181709736585617, + "step": 12150 + }, + { + "epoch": 0.24304, + "grad_norm": 2.078125, + "grad_norm_var": 0.005597941080729167, + "learning_rate": 0.0001, + "loss": 4.1381, + "loss/crossentropy": 2.130657136440277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20904043316841125, + "step": 12152 + }, + { + "epoch": 0.24308, + "grad_norm": 2.078125, + "grad_norm_var": 0.0053293863932291664, + "learning_rate": 0.0001, + "loss": 4.2983, + "loss/crossentropy": 2.3923556804656982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23029828071594238, + "step": 12154 + }, + { + "epoch": 0.24312, + "grad_norm": 1.875, + "grad_norm_var": 0.007209269205729166, + "learning_rate": 0.0001, + "loss": 3.9697, + "loss/crossentropy": 2.024892747402191, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19879258424043655, + "step": 12156 + }, + { + "epoch": 0.24316, + "grad_norm": 1.921875, + "grad_norm_var": 0.007826487223307291, + "learning_rate": 0.0001, + "loss": 3.8761, + "loss/crossentropy": 1.7833393812179565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18542324006557465, + "step": 12158 + }, + { + "epoch": 0.2432, + "grad_norm": 1.890625, + "grad_norm_var": 0.007328033447265625, + "learning_rate": 0.0001, + "loss": 4.1896, + "loss/crossentropy": 2.253599762916565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2148476019501686, + "step": 12160 + }, + { + "epoch": 0.24324, + "grad_norm": 2.046875, + "grad_norm_var": 0.005177561442057292, + "learning_rate": 0.0001, + "loss": 4.0253, + "loss/crossentropy": 1.7218471765518188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1923334002494812, + "step": 12162 + }, + { + "epoch": 0.24328, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007575480143229166, + "learning_rate": 0.0001, + "loss": 4.2947, + "loss/crossentropy": 2.210463523864746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2203536182641983, + "step": 12164 + }, + { + "epoch": 0.24332, + "grad_norm": 2.0625, + "grad_norm_var": 0.0076487223307291664, + "learning_rate": 0.0001, + "loss": 4.4164, + "loss/crossentropy": 2.452837347984314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22888235747814178, + "step": 12166 + }, + { + "epoch": 0.24336, + "grad_norm": 1.984375, + "grad_norm_var": 0.007657877604166667, + "learning_rate": 0.0001, + "loss": 4.2157, + "loss/crossentropy": 2.3511279821395874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2431751787662506, + "step": 12168 + }, + { + "epoch": 0.2434, + "grad_norm": 2.03125, + "grad_norm_var": 0.007819620768229167, + "learning_rate": 0.0001, + "loss": 4.0911, + "loss/crossentropy": 2.0734334588050842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21137738972902298, + "step": 12170 + }, + { + "epoch": 0.24344, + "grad_norm": 2.03125, + "grad_norm_var": 0.0065582275390625, + "learning_rate": 0.0001, + "loss": 4.0987, + "loss/crossentropy": 1.9492397904396057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22894760966300964, + "step": 12172 + }, + { + "epoch": 0.24348, + "grad_norm": 2.046875, + "grad_norm_var": 0.005936431884765625, + "learning_rate": 0.0001, + "loss": 4.2316, + "loss/crossentropy": 2.03458708524704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20981666445732117, + "step": 12174 + }, + { + "epoch": 0.24352, + "grad_norm": 1.96875, + "grad_norm_var": 0.007059478759765625, + "learning_rate": 0.0001, + "loss": 4.3396, + "loss/crossentropy": 2.1464394330978394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20741496980190277, + "step": 12176 + }, + { + "epoch": 0.24356, + "grad_norm": 2.015625, + "grad_norm_var": 0.0064999898274739586, + "learning_rate": 0.0001, + "loss": 3.9902, + "loss/crossentropy": 1.8449034094810486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21950078010559082, + "step": 12178 + }, + { + "epoch": 0.2436, + "grad_norm": 2.15625, + "grad_norm_var": 0.0054433186848958336, + "learning_rate": 0.0001, + "loss": 4.0129, + "loss/crossentropy": 1.8646993041038513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2117064744234085, + "step": 12180 + }, + { + "epoch": 0.24364, + "grad_norm": 1.859375, + "grad_norm_var": 0.008003743489583333, + "learning_rate": 0.0001, + "loss": 3.9347, + "loss/crossentropy": 1.9643146991729736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1946149319410324, + "step": 12182 + }, + { + "epoch": 0.24368, + "grad_norm": 2.109375, + "grad_norm_var": 0.008141835530598959, + "learning_rate": 0.0001, + "loss": 4.1468, + "loss/crossentropy": 1.867002248764038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19111115485429764, + "step": 12184 + }, + { + "epoch": 0.24372, + "grad_norm": 2.0625, + "grad_norm_var": 0.008129628499348958, + "learning_rate": 0.0001, + "loss": 4.0443, + "loss/crossentropy": 1.9431232810020447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19612180441617966, + "step": 12186 + }, + { + "epoch": 0.24376, + "grad_norm": 2.25, + "grad_norm_var": 0.0105224609375, + "learning_rate": 0.0001, + "loss": 4.2931, + "loss/crossentropy": 2.197216033935547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22340577840805054, + "step": 12188 + }, + { + "epoch": 0.2438, + "grad_norm": 2.109375, + "grad_norm_var": 0.010469563802083333, + "learning_rate": 0.0001, + "loss": 4.2499, + "loss/crossentropy": 1.916576623916626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20536810904741287, + "step": 12190 + }, + { + "epoch": 0.24384, + "grad_norm": 2.25, + "grad_norm_var": 0.010497029622395833, + "learning_rate": 0.0001, + "loss": 4.3838, + "loss/crossentropy": 2.0369369983673096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26386965811252594, + "step": 12192 + }, + { + "epoch": 0.24388, + "grad_norm": 2.09375, + "grad_norm_var": 0.01639404296875, + "learning_rate": 0.0001, + "loss": 4.2817, + "loss/crossentropy": 2.086443066596985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21351207792758942, + "step": 12194 + }, + { + "epoch": 0.24392, + "grad_norm": 2.015625, + "grad_norm_var": 0.01619873046875, + "learning_rate": 0.0001, + "loss": 4.3649, + "loss/crossentropy": 2.0969839096069336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21489758789539337, + "step": 12196 + }, + { + "epoch": 0.24396, + "grad_norm": 1.859375, + "grad_norm_var": 0.016463216145833334, + "learning_rate": 0.0001, + "loss": 4.2481, + "loss/crossentropy": 2.086832642555237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20651830732822418, + "step": 12198 + }, + { + "epoch": 0.244, + "grad_norm": 2.015625, + "grad_norm_var": 0.017146809895833334, + "learning_rate": 0.0001, + "loss": 4.1397, + "loss/crossentropy": 1.9104391932487488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1963956654071808, + "step": 12200 + }, + { + "epoch": 0.24404, + "grad_norm": 2.03125, + "grad_norm_var": 0.0191314697265625, + "learning_rate": 0.0001, + "loss": 3.8558, + "loss/crossentropy": 1.9419523477554321, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19279541075229645, + "step": 12202 + }, + { + "epoch": 0.24408, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01697998046875, + "learning_rate": 0.0001, + "loss": 4.3239, + "loss/crossentropy": 2.2867754697799683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23261378705501556, + "step": 12204 + }, + { + "epoch": 0.24412, + "grad_norm": 1.90625, + "grad_norm_var": 0.017634073893229168, + "learning_rate": 0.0001, + "loss": 4.0786, + "loss/crossentropy": 1.941315233707428, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20962880551815033, + "step": 12206 + }, + { + "epoch": 0.24416, + "grad_norm": 2.140625, + "grad_norm_var": 0.01541748046875, + "learning_rate": 0.0001, + "loss": 4.2044, + "loss/crossentropy": 2.0638798475265503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21499283611774445, + "step": 12208 + }, + { + "epoch": 0.2442, + "grad_norm": 2.140625, + "grad_norm_var": 0.008454386393229167, + "learning_rate": 0.0001, + "loss": 4.1889, + "loss/crossentropy": 2.3167499899864197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21805942803621292, + "step": 12210 + }, + { + "epoch": 0.24424, + "grad_norm": 2.0, + "grad_norm_var": 0.00848388671875, + "learning_rate": 0.0001, + "loss": 4.0956, + "loss/crossentropy": 2.1785646080970764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21259736269712448, + "step": 12212 + }, + { + "epoch": 0.24428, + "grad_norm": 2.03125, + "grad_norm_var": 0.006615193684895834, + "learning_rate": 0.0001, + "loss": 4.2054, + "loss/crossentropy": 1.983469545841217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21091710776090622, + "step": 12214 + }, + { + "epoch": 0.24432, + "grad_norm": 2.078125, + "grad_norm_var": 0.006937408447265625, + "learning_rate": 0.0001, + "loss": 4.2283, + "loss/crossentropy": 1.8501896858215332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20537292212247849, + "step": 12216 + }, + { + "epoch": 0.24436, + "grad_norm": 2.1875, + "grad_norm_var": 0.0072934468587239586, + "learning_rate": 0.0001, + "loss": 4.182, + "loss/crossentropy": 1.7293490767478943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1916719302535057, + "step": 12218 + }, + { + "epoch": 0.2444, + "grad_norm": 2.03125, + "grad_norm_var": 0.0069488525390625, + "learning_rate": 0.0001, + "loss": 4.3247, + "loss/crossentropy": 2.0931429862976074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22081860899925232, + "step": 12220 + }, + { + "epoch": 0.24444, + "grad_norm": 2.03125, + "grad_norm_var": 0.007657623291015625, + "learning_rate": 0.0001, + "loss": 4.0619, + "loss/crossentropy": 2.157875657081604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20259525626897812, + "step": 12222 + }, + { + "epoch": 0.24448, + "grad_norm": 2.0625, + "grad_norm_var": 0.007248687744140625, + "learning_rate": 0.0001, + "loss": 4.2712, + "loss/crossentropy": 2.2028552889823914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2431233748793602, + "step": 12224 + }, + { + "epoch": 0.24452, + "grad_norm": 2.046875, + "grad_norm_var": 0.006473541259765625, + "learning_rate": 0.0001, + "loss": 4.1184, + "loss/crossentropy": 2.0906929969787598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2051575481891632, + "step": 12226 + }, + { + "epoch": 0.24456, + "grad_norm": 2.078125, + "grad_norm_var": 0.006091054280598958, + "learning_rate": 0.0001, + "loss": 4.5601, + "loss/crossentropy": 2.5020272731781006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2485819309949875, + "step": 12228 + }, + { + "epoch": 0.2446, + "grad_norm": 8.0625, + "grad_norm_var": 2.244887034098307, + "learning_rate": 0.0001, + "loss": 4.1465, + "loss/crossentropy": 1.502736508846283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1839340552687645, + "step": 12230 + }, + { + "epoch": 0.24464, + "grad_norm": 2.484375, + "grad_norm_var": 2.2357358296712238, + "learning_rate": 0.0001, + "loss": 4.3882, + "loss/crossentropy": 2.1888676285743713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.226266011595726, + "step": 12232 + }, + { + "epoch": 0.24468, + "grad_norm": 2.125, + "grad_norm_var": 2.243033599853516, + "learning_rate": 0.0001, + "loss": 4.3887, + "loss/crossentropy": 2.377061367034912, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23195213079452515, + "step": 12234 + }, + { + "epoch": 0.24472, + "grad_norm": 2.03125, + "grad_norm_var": 2.257559967041016, + "learning_rate": 0.0001, + "loss": 3.9354, + "loss/crossentropy": 2.0567076206207275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20983586460351944, + "step": 12236 + }, + { + "epoch": 0.24476, + "grad_norm": 2.015625, + "grad_norm_var": 2.246906534830729, + "learning_rate": 0.0001, + "loss": 4.1978, + "loss/crossentropy": 2.007763922214508, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21983467787504196, + "step": 12238 + }, + { + "epoch": 0.2448, + "grad_norm": 2.078125, + "grad_norm_var": 2.247749837239583, + "learning_rate": 0.0001, + "loss": 4.1411, + "loss/crossentropy": 2.3309481143951416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23099908232688904, + "step": 12240 + }, + { + "epoch": 0.24484, + "grad_norm": 2.5, + "grad_norm_var": 2.23668212890625, + "learning_rate": 0.0001, + "loss": 4.2159, + "loss/crossentropy": 1.8231184482574463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22009263187646866, + "step": 12242 + }, + { + "epoch": 0.24488, + "grad_norm": 1.9609375, + "grad_norm_var": 2.2464637756347656, + "learning_rate": 0.0001, + "loss": 4.1131, + "loss/crossentropy": 2.140045642852783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22202204167842865, + "step": 12244 + }, + { + "epoch": 0.24492, + "grad_norm": 1.984375, + "grad_norm_var": 0.02835057576497396, + "learning_rate": 0.0001, + "loss": 4.2209, + "loss/crossentropy": 1.8495931029319763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19803623855113983, + "step": 12246 + }, + { + "epoch": 0.24496, + "grad_norm": 2.171875, + "grad_norm_var": 0.0185943603515625, + "learning_rate": 0.0001, + "loss": 4.1624, + "loss/crossentropy": 2.2141982913017273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23418182879686356, + "step": 12248 + }, + { + "epoch": 0.245, + "grad_norm": 2.03125, + "grad_norm_var": 0.01844482421875, + "learning_rate": 0.0001, + "loss": 4.4104, + "loss/crossentropy": 2.14465594291687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22683896869421005, + "step": 12250 + }, + { + "epoch": 0.24504, + "grad_norm": 2.0625, + "grad_norm_var": 0.01610107421875, + "learning_rate": 0.0001, + "loss": 4.1913, + "loss/crossentropy": 2.0804443359375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2265845239162445, + "step": 12252 + }, + { + "epoch": 0.24508, + "grad_norm": 2.046875, + "grad_norm_var": 0.0160308837890625, + "learning_rate": 0.0001, + "loss": 4.485, + "loss/crossentropy": 2.2451776266098022, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22881492972373962, + "step": 12254 + }, + { + "epoch": 0.24512, + "grad_norm": 1.953125, + "grad_norm_var": 0.016947428385416668, + "learning_rate": 0.0001, + "loss": 4.0869, + "loss/crossentropy": 1.6861794590950012, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18126793205738068, + "step": 12256 + }, + { + "epoch": 0.24516, + "grad_norm": 1.96875, + "grad_norm_var": 0.0059773763020833336, + "learning_rate": 0.0001, + "loss": 4.2343, + "loss/crossentropy": 1.87660551071167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20990663766860962, + "step": 12258 + }, + { + "epoch": 0.2452, + "grad_norm": 2.15625, + "grad_norm_var": 0.006799062093098958, + "learning_rate": 0.0001, + "loss": 4.2019, + "loss/crossentropy": 2.103124976158142, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19615671038627625, + "step": 12260 + }, + { + "epoch": 0.24524, + "grad_norm": 2.0, + "grad_norm_var": 0.0069048563639322914, + "learning_rate": 0.0001, + "loss": 4.2509, + "loss/crossentropy": 2.139270842075348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21861301362514496, + "step": 12262 + }, + { + "epoch": 0.24528, + "grad_norm": 2.140625, + "grad_norm_var": 0.00791015625, + "learning_rate": 0.0001, + "loss": 4.4081, + "loss/crossentropy": 2.108114778995514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20996354520320892, + "step": 12264 + }, + { + "epoch": 0.24532, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008847808837890625, + "learning_rate": 0.0001, + "loss": 3.9495, + "loss/crossentropy": 2.1205111145973206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21352755278348923, + "step": 12266 + }, + { + "epoch": 0.24536, + "grad_norm": 2.109375, + "grad_norm_var": 0.009248860677083333, + "learning_rate": 0.0001, + "loss": 3.9946, + "loss/crossentropy": 2.20136821269989, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2277800738811493, + "step": 12268 + }, + { + "epoch": 0.2454, + "grad_norm": 2.046875, + "grad_norm_var": 0.011336008707682291, + "learning_rate": 0.0001, + "loss": 4.1116, + "loss/crossentropy": 2.1149147748947144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2062554806470871, + "step": 12270 + }, + { + "epoch": 0.24544, + "grad_norm": 2.4375, + "grad_norm_var": 0.01932347615559896, + "learning_rate": 0.0001, + "loss": 4.5226, + "loss/crossentropy": 2.1119648218154907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2922344133257866, + "step": 12272 + }, + { + "epoch": 0.24548, + "grad_norm": 2.0625, + "grad_norm_var": 0.017964680989583332, + "learning_rate": 0.0001, + "loss": 4.1766, + "loss/crossentropy": 2.2631434202194214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2044210433959961, + "step": 12274 + }, + { + "epoch": 0.24552, + "grad_norm": 2.125, + "grad_norm_var": 0.06928609212239584, + "learning_rate": 0.0001, + "loss": 4.0956, + "loss/crossentropy": 2.264181971549988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23541904985904694, + "step": 12276 + }, + { + "epoch": 0.24556, + "grad_norm": 2.0625, + "grad_norm_var": 0.06836649576822916, + "learning_rate": 0.0001, + "loss": 4.2083, + "loss/crossentropy": 2.256329298019409, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21784210950136185, + "step": 12278 + }, + { + "epoch": 0.2456, + "grad_norm": 2.21875, + "grad_norm_var": 0.06968765258789063, + "learning_rate": 0.0001, + "loss": 4.267, + "loss/crossentropy": 2.0962833166122437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.211236834526062, + "step": 12280 + }, + { + "epoch": 0.24564, + "grad_norm": 2.140625, + "grad_norm_var": 0.0683990478515625, + "learning_rate": 0.0001, + "loss": 4.4767, + "loss/crossentropy": 2.3430999517440796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25170228630304337, + "step": 12282 + }, + { + "epoch": 0.24568, + "grad_norm": 2.125, + "grad_norm_var": 0.06921361287434896, + "learning_rate": 0.0001, + "loss": 4.1492, + "loss/crossentropy": 1.8256065845489502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20632392168045044, + "step": 12284 + }, + { + "epoch": 0.24572, + "grad_norm": 1.96875, + "grad_norm_var": 0.06643778483072917, + "learning_rate": 0.0001, + "loss": 4.294, + "loss/crossentropy": 2.481971561908722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2121504619717598, + "step": 12286 + }, + { + "epoch": 0.24576, + "grad_norm": 2.125, + "grad_norm_var": 0.060469563802083334, + "learning_rate": 0.0001, + "loss": 4.2136, + "loss/crossentropy": 2.2815581560134888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22004567831754684, + "step": 12288 + }, + { + "epoch": 0.2458, + "grad_norm": 1.9921875, + "grad_norm_var": 0.060323079427083336, + "learning_rate": 0.0001, + "loss": 4.2426, + "loss/crossentropy": 1.7125394940376282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19618325680494308, + "step": 12290 + }, + { + "epoch": 0.24584, + "grad_norm": 1.859375, + "grad_norm_var": 0.008347320556640624, + "learning_rate": 0.0001, + "loss": 3.9676, + "loss/crossentropy": 2.0333253145217896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20251524448394775, + "step": 12292 + }, + { + "epoch": 0.24588, + "grad_norm": 1.984375, + "grad_norm_var": 0.008939361572265625, + "learning_rate": 0.0001, + "loss": 4.0958, + "loss/crossentropy": 2.359953284263611, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23697812855243683, + "step": 12294 + }, + { + "epoch": 0.24592, + "grad_norm": 2.0625, + "grad_norm_var": 0.0065348307291666664, + "learning_rate": 0.0001, + "loss": 3.9914, + "loss/crossentropy": 1.8465049266815186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18830078095197678, + "step": 12296 + }, + { + "epoch": 0.24596, + "grad_norm": 2.109375, + "grad_norm_var": 0.00662841796875, + "learning_rate": 0.0001, + "loss": 4.1669, + "loss/crossentropy": 2.407967984676361, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20054074376821518, + "step": 12298 + }, + { + "epoch": 0.246, + "grad_norm": 2.015625, + "grad_norm_var": 0.007005818684895833, + "learning_rate": 0.0001, + "loss": 4.1943, + "loss/crossentropy": 2.306265115737915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2286146879196167, + "step": 12300 + }, + { + "epoch": 0.24604, + "grad_norm": 1.90625, + "grad_norm_var": 0.0076416015625, + "learning_rate": 0.0001, + "loss": 4.108, + "loss/crossentropy": 1.9776363968849182, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2015514224767685, + "step": 12302 + }, + { + "epoch": 0.24608, + "grad_norm": 1.859375, + "grad_norm_var": 0.010041300455729167, + "learning_rate": 0.0001, + "loss": 3.7023, + "loss/crossentropy": 1.6589386463165283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16657201945781708, + "step": 12304 + }, + { + "epoch": 0.24612, + "grad_norm": 1.9765625, + "grad_norm_var": 0.009968058268229166, + "learning_rate": 0.0001, + "loss": 4.2749, + "loss/crossentropy": 2.263510227203369, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21220041066408157, + "step": 12306 + }, + { + "epoch": 0.24616, + "grad_norm": 2.015625, + "grad_norm_var": 0.008583323160807291, + "learning_rate": 0.0001, + "loss": 4.1735, + "loss/crossentropy": 2.0802704095840454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20156628638505936, + "step": 12308 + }, + { + "epoch": 0.2462, + "grad_norm": 2.0, + "grad_norm_var": 0.015193430582682292, + "learning_rate": 0.0001, + "loss": 4.2597, + "loss/crossentropy": 2.0921266674995422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22044362872838974, + "step": 12310 + }, + { + "epoch": 0.24624, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0164703369140625, + "learning_rate": 0.0001, + "loss": 3.8587, + "loss/crossentropy": 1.7366862297058105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1722203940153122, + "step": 12312 + }, + { + "epoch": 0.24628, + "grad_norm": 2.046875, + "grad_norm_var": 0.016377766927083332, + "learning_rate": 0.0001, + "loss": 4.1408, + "loss/crossentropy": 2.0741729140281677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1942591667175293, + "step": 12314 + }, + { + "epoch": 0.24632, + "grad_norm": 2.0625, + "grad_norm_var": 0.0145751953125, + "learning_rate": 0.0001, + "loss": 4.1343, + "loss/crossentropy": 2.0611414909362793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22244945168495178, + "step": 12316 + }, + { + "epoch": 0.24636, + "grad_norm": 2.015625, + "grad_norm_var": 0.014196523030598958, + "learning_rate": 0.0001, + "loss": 3.8586, + "loss/crossentropy": 1.9337337017059326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20655318349599838, + "step": 12318 + }, + { + "epoch": 0.2464, + "grad_norm": 2.09375, + "grad_norm_var": 0.012141672770182292, + "learning_rate": 0.0001, + "loss": 3.9197, + "loss/crossentropy": 1.8766502737998962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1851816102862358, + "step": 12320 + }, + { + "epoch": 0.24644, + "grad_norm": 2.0625, + "grad_norm_var": 0.012552897135416666, + "learning_rate": 0.0001, + "loss": 4.2407, + "loss/crossentropy": 2.121203899383545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2177201583981514, + "step": 12322 + }, + { + "epoch": 0.24648, + "grad_norm": 2.03125, + "grad_norm_var": 0.0136138916015625, + "learning_rate": 0.0001, + "loss": 4.1441, + "loss/crossentropy": 1.8102782368659973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19694476574659348, + "step": 12324 + }, + { + "epoch": 0.24652, + "grad_norm": 1.984375, + "grad_norm_var": 0.0062164306640625, + "learning_rate": 0.0001, + "loss": 4.2526, + "loss/crossentropy": 2.323423147201538, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20612536370754242, + "step": 12326 + }, + { + "epoch": 0.24656, + "grad_norm": 2.15625, + "grad_norm_var": 0.007814280192057292, + "learning_rate": 0.0001, + "loss": 4.4862, + "loss/crossentropy": 2.2835768461227417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22697383165359497, + "step": 12328 + }, + { + "epoch": 0.2466, + "grad_norm": 2.09375, + "grad_norm_var": 0.006192779541015625, + "learning_rate": 0.0001, + "loss": 4.2868, + "loss/crossentropy": 2.197639048099518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22446971386671066, + "step": 12330 + }, + { + "epoch": 0.24664, + "grad_norm": 1.9296875, + "grad_norm_var": 0.006761678059895833, + "learning_rate": 0.0001, + "loss": 4.0758, + "loss/crossentropy": 2.1731618642807007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21908622235059738, + "step": 12332 + }, + { + "epoch": 0.24668, + "grad_norm": 1.875, + "grad_norm_var": 0.009291330973307291, + "learning_rate": 0.0001, + "loss": 3.9108, + "loss/crossentropy": 1.9236284494400024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1810280755162239, + "step": 12334 + }, + { + "epoch": 0.24672, + "grad_norm": 2.0, + "grad_norm_var": 0.008318837483723958, + "learning_rate": 0.0001, + "loss": 4.3975, + "loss/crossentropy": 2.2902809381484985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23192713409662247, + "step": 12336 + }, + { + "epoch": 0.24676, + "grad_norm": 2.015625, + "grad_norm_var": 0.008367665608723958, + "learning_rate": 0.0001, + "loss": 4.1681, + "loss/crossentropy": 1.9808775186538696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19344759732484818, + "step": 12338 + }, + { + "epoch": 0.2468, + "grad_norm": 1.921875, + "grad_norm_var": 0.008337148030598958, + "learning_rate": 0.0001, + "loss": 4.1581, + "loss/crossentropy": 2.148995041847229, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19994106143712997, + "step": 12340 + }, + { + "epoch": 0.24684, + "grad_norm": 2.109375, + "grad_norm_var": 0.008528391520182291, + "learning_rate": 0.0001, + "loss": 4.3927, + "loss/crossentropy": 2.374568462371826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24350540339946747, + "step": 12342 + }, + { + "epoch": 0.24688, + "grad_norm": 2.1875, + "grad_norm_var": 0.006959788004557292, + "learning_rate": 0.0001, + "loss": 4.258, + "loss/crossentropy": 1.9702014923095703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2038111537694931, + "step": 12344 + }, + { + "epoch": 0.24692, + "grad_norm": 1.921875, + "grad_norm_var": 0.007008616129557292, + "learning_rate": 0.0001, + "loss": 4.0126, + "loss/crossentropy": 1.8438855409622192, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1822836771607399, + "step": 12346 + }, + { + "epoch": 0.24696, + "grad_norm": 2.265625, + "grad_norm_var": 0.010754140218098958, + "learning_rate": 0.0001, + "loss": 4.1352, + "loss/crossentropy": 2.005755662918091, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21399007737636566, + "step": 12348 + }, + { + "epoch": 0.247, + "grad_norm": 2.015625, + "grad_norm_var": 0.008819325764973959, + "learning_rate": 0.0001, + "loss": 4.0988, + "loss/crossentropy": 1.8640305399894714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19055305421352386, + "step": 12350 + }, + { + "epoch": 0.24704, + "grad_norm": 2.140625, + "grad_norm_var": 0.010453033447265624, + "learning_rate": 0.0001, + "loss": 4.1981, + "loss/crossentropy": 2.1293725967407227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2117801234126091, + "step": 12352 + }, + { + "epoch": 0.24708, + "grad_norm": 2.484375, + "grad_norm_var": 0.02337621053059896, + "learning_rate": 0.0001, + "loss": 4.2666, + "loss/crossentropy": 1.7655459642410278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2085946872830391, + "step": 12354 + }, + { + "epoch": 0.24712, + "grad_norm": 2.046875, + "grad_norm_var": 0.021897125244140624, + "learning_rate": 0.0001, + "loss": 4.3175, + "loss/crossentropy": 2.184986114501953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22597889602184296, + "step": 12356 + }, + { + "epoch": 0.24716, + "grad_norm": 2.015625, + "grad_norm_var": 0.021897125244140624, + "learning_rate": 0.0001, + "loss": 4.1295, + "loss/crossentropy": 2.368021607398987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21957845985889435, + "step": 12358 + }, + { + "epoch": 0.2472, + "grad_norm": 2.046875, + "grad_norm_var": 0.020645904541015624, + "learning_rate": 0.0001, + "loss": 4.16, + "loss/crossentropy": 2.297884225845337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2155354768037796, + "step": 12360 + }, + { + "epoch": 0.24724, + "grad_norm": 2.328125, + "grad_norm_var": 0.023273722330729166, + "learning_rate": 0.0001, + "loss": 4.2181, + "loss/crossentropy": 2.121878147125244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22192668169736862, + "step": 12362 + }, + { + "epoch": 0.24728, + "grad_norm": 2.125, + "grad_norm_var": 0.020442454020182292, + "learning_rate": 0.0001, + "loss": 4.3233, + "loss/crossentropy": 2.130228877067566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20712029188871384, + "step": 12364 + }, + { + "epoch": 0.24732, + "grad_norm": 1.9921875, + "grad_norm_var": 0.02367121378580729, + "learning_rate": 0.0001, + "loss": 3.8542, + "loss/crossentropy": 1.6520383954048157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16511157900094986, + "step": 12366 + }, + { + "epoch": 0.24736, + "grad_norm": 2.1875, + "grad_norm_var": 0.02220637003580729, + "learning_rate": 0.0001, + "loss": 4.1894, + "loss/crossentropy": 1.9993118047714233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2112603336572647, + "step": 12368 + }, + { + "epoch": 0.2474, + "grad_norm": 2.046875, + "grad_norm_var": 0.011494700113932292, + "learning_rate": 0.0001, + "loss": 4.0522, + "loss/crossentropy": 1.9502257108688354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1944574937224388, + "step": 12370 + }, + { + "epoch": 0.24744, + "grad_norm": 1.9765625, + "grad_norm_var": 0.011872355143229167, + "learning_rate": 0.0001, + "loss": 4.1098, + "loss/crossentropy": 2.1219520568847656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22872696816921234, + "step": 12372 + }, + { + "epoch": 0.24748, + "grad_norm": 1.984375, + "grad_norm_var": 0.0502593994140625, + "learning_rate": 0.0001, + "loss": 4.172, + "loss/crossentropy": 2.0668599605560303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21508124470710754, + "step": 12374 + }, + { + "epoch": 0.24752, + "grad_norm": 1.984375, + "grad_norm_var": 0.05098241170247396, + "learning_rate": 0.0001, + "loss": 4.1901, + "loss/crossentropy": 2.242877721786499, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2308686003088951, + "step": 12376 + }, + { + "epoch": 0.24756, + "grad_norm": 2.015625, + "grad_norm_var": 0.047304026285807294, + "learning_rate": 0.0001, + "loss": 3.8779, + "loss/crossentropy": 1.8258161544799805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1852153167128563, + "step": 12378 + }, + { + "epoch": 0.2476, + "grad_norm": 2.046875, + "grad_norm_var": 0.04918390909830729, + "learning_rate": 0.0001, + "loss": 4.1241, + "loss/crossentropy": 2.0654167532920837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20526036620140076, + "step": 12380 + }, + { + "epoch": 0.24764, + "grad_norm": 2.046875, + "grad_norm_var": 0.04582087198893229, + "learning_rate": 0.0001, + "loss": 4.1601, + "loss/crossentropy": 2.125216484069824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.216343455016613, + "step": 12382 + }, + { + "epoch": 0.24768, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0473052978515625, + "learning_rate": 0.0001, + "loss": 4.1756, + "loss/crossentropy": 2.285245180130005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22941745072603226, + "step": 12384 + }, + { + "epoch": 0.24772, + "grad_norm": 2.046875, + "grad_norm_var": 0.04720637003580729, + "learning_rate": 0.0001, + "loss": 4.1097, + "loss/crossentropy": 2.0148350596427917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1874203458428383, + "step": 12386 + }, + { + "epoch": 0.24776, + "grad_norm": 2.03125, + "grad_norm_var": 0.04812825520833333, + "learning_rate": 0.0001, + "loss": 4.3127, + "loss/crossentropy": 2.2349741458892822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23187313228845596, + "step": 12388 + }, + { + "epoch": 0.2478, + "grad_norm": 2.078125, + "grad_norm_var": 0.009178670247395833, + "learning_rate": 0.0001, + "loss": 4.1875, + "loss/crossentropy": 2.195721983909607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2119324654340744, + "step": 12390 + }, + { + "epoch": 0.24784, + "grad_norm": 2.265625, + "grad_norm_var": 0.012676747639973958, + "learning_rate": 0.0001, + "loss": 4.5732, + "loss/crossentropy": 2.1261669397354126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21143332868814468, + "step": 12392 + }, + { + "epoch": 0.24788, + "grad_norm": 1.9453125, + "grad_norm_var": 0.012798817952473958, + "learning_rate": 0.0001, + "loss": 3.9475, + "loss/crossentropy": 2.1640073657035828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2192412167787552, + "step": 12394 + }, + { + "epoch": 0.24792, + "grad_norm": 1.953125, + "grad_norm_var": 0.011230214436848959, + "learning_rate": 0.0001, + "loss": 4.2435, + "loss/crossentropy": 1.9555792808532715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2110436111688614, + "step": 12396 + }, + { + "epoch": 0.24796, + "grad_norm": 2.015625, + "grad_norm_var": 0.011156972249348958, + "learning_rate": 0.0001, + "loss": 4.0616, + "loss/crossentropy": 1.948053002357483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19957780838012695, + "step": 12398 + }, + { + "epoch": 0.248, + "grad_norm": 1.9453125, + "grad_norm_var": 0.011156972249348958, + "learning_rate": 0.0001, + "loss": 4.0865, + "loss/crossentropy": 2.355304718017578, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20301833003759384, + "step": 12400 + }, + { + "epoch": 0.24804, + "grad_norm": 2.0, + "grad_norm_var": 0.010944620768229166, + "learning_rate": 0.0001, + "loss": 4.2537, + "loss/crossentropy": 2.172769784927368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2157917320728302, + "step": 12402 + }, + { + "epoch": 0.24808, + "grad_norm": 1.8671875, + "grad_norm_var": 0.012109120686848959, + "learning_rate": 0.0001, + "loss": 4.2817, + "loss/crossentropy": 2.2186524868011475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.221016563475132, + "step": 12404 + }, + { + "epoch": 0.24812, + "grad_norm": 1.8828125, + "grad_norm_var": 0.012113189697265625, + "learning_rate": 0.0001, + "loss": 3.9624, + "loss/crossentropy": 2.011409044265747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19150879979133606, + "step": 12406 + }, + { + "epoch": 0.24816, + "grad_norm": 2.125, + "grad_norm_var": 0.010984039306640625, + "learning_rate": 0.0001, + "loss": 4.5039, + "loss/crossentropy": 1.9245591163635254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2296570986509323, + "step": 12408 + }, + { + "epoch": 0.2482, + "grad_norm": 2.328125, + "grad_norm_var": 0.016434733072916666, + "learning_rate": 0.0001, + "loss": 4.5765, + "loss/crossentropy": 1.973130702972412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20532477647066116, + "step": 12410 + }, + { + "epoch": 0.24824, + "grad_norm": 2.03125, + "grad_norm_var": 0.01617609659830729, + "learning_rate": 0.0001, + "loss": 3.914, + "loss/crossentropy": 1.655932605266571, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1609746441245079, + "step": 12412 + }, + { + "epoch": 0.24828, + "grad_norm": 2.40625, + "grad_norm_var": 0.02411677042643229, + "learning_rate": 0.0001, + "loss": 4.2172, + "loss/crossentropy": 2.104023277759552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19898276031017303, + "step": 12414 + }, + { + "epoch": 0.24832, + "grad_norm": 1.9921875, + "grad_norm_var": 0.02210261027018229, + "learning_rate": 0.0001, + "loss": 4.0597, + "loss/crossentropy": 1.9362882375717163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20107803493738174, + "step": 12416 + }, + { + "epoch": 0.24836, + "grad_norm": 2.046875, + "grad_norm_var": 0.021996815999348957, + "learning_rate": 0.0001, + "loss": 4.4181, + "loss/crossentropy": 2.1508368253707886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2056911736726761, + "step": 12418 + }, + { + "epoch": 0.2484, + "grad_norm": 1.921875, + "grad_norm_var": 0.02194391886393229, + "learning_rate": 0.0001, + "loss": 3.802, + "loss/crossentropy": 1.7712991833686829, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1908227875828743, + "step": 12420 + }, + { + "epoch": 0.24844, + "grad_norm": 2.140625, + "grad_norm_var": 0.02060114542643229, + "learning_rate": 0.0001, + "loss": 4.3557, + "loss/crossentropy": 2.2705806493759155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23912303894758224, + "step": 12422 + }, + { + "epoch": 0.24848, + "grad_norm": 1.9765625, + "grad_norm_var": 0.019791666666666666, + "learning_rate": 0.0001, + "loss": 4.2696, + "loss/crossentropy": 2.4222676753997803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21739919483661652, + "step": 12424 + }, + { + "epoch": 0.24852, + "grad_norm": 2.03125, + "grad_norm_var": 0.0145172119140625, + "learning_rate": 0.0001, + "loss": 4.1935, + "loss/crossentropy": 1.945872962474823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19402413070201874, + "step": 12426 + }, + { + "epoch": 0.24856, + "grad_norm": 1.8671875, + "grad_norm_var": 0.01646728515625, + "learning_rate": 0.0001, + "loss": 4.0908, + "loss/crossentropy": 1.7793864011764526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19452669471502304, + "step": 12428 + }, + { + "epoch": 0.2486, + "grad_norm": 2.0, + "grad_norm_var": 0.015941365559895834, + "learning_rate": 0.0001, + "loss": 4.1045, + "loss/crossentropy": 2.2827813625335693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21572840213775635, + "step": 12430 + }, + { + "epoch": 0.24864, + "grad_norm": 2.03125, + "grad_norm_var": 0.016544342041015625, + "learning_rate": 0.0001, + "loss": 3.749, + "loss/crossentropy": 1.9543398022651672, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20825360715389252, + "step": 12432 + }, + { + "epoch": 0.24868, + "grad_norm": 2.046875, + "grad_norm_var": 0.017276763916015625, + "learning_rate": 0.0001, + "loss": 4.3727, + "loss/crossentropy": 2.223303198814392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21834088116884232, + "step": 12434 + }, + { + "epoch": 0.24872, + "grad_norm": 1.9140625, + "grad_norm_var": 0.017071278889973958, + "learning_rate": 0.0001, + "loss": 3.9993, + "loss/crossentropy": 1.6987267136573792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1834847405552864, + "step": 12436 + }, + { + "epoch": 0.24876, + "grad_norm": 2.125, + "grad_norm_var": 0.014522043863932292, + "learning_rate": 0.0001, + "loss": 4.4309, + "loss/crossentropy": 2.3947317600250244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22872642427682877, + "step": 12438 + }, + { + "epoch": 0.2488, + "grad_norm": 2.015625, + "grad_norm_var": 0.014435831705729167, + "learning_rate": 0.0001, + "loss": 4.4858, + "loss/crossentropy": 2.5481021404266357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23773372173309326, + "step": 12440 + }, + { + "epoch": 0.24884, + "grad_norm": 2.015625, + "grad_norm_var": 0.0145904541015625, + "learning_rate": 0.0001, + "loss": 4.1621, + "loss/crossentropy": 2.2349241971969604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21584660559892654, + "step": 12442 + }, + { + "epoch": 0.24888, + "grad_norm": 2.140625, + "grad_norm_var": 0.012988026936848958, + "learning_rate": 0.0001, + "loss": 4.2827, + "loss/crossentropy": 1.9308242201805115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19295598566532135, + "step": 12444 + }, + { + "epoch": 0.24892, + "grad_norm": 1.8515625, + "grad_norm_var": 0.008536783854166667, + "learning_rate": 0.0001, + "loss": 4.0778, + "loss/crossentropy": 2.3508042097091675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22008787840604782, + "step": 12446 + }, + { + "epoch": 0.24896, + "grad_norm": 2.109375, + "grad_norm_var": 0.012593332926432292, + "learning_rate": 0.0001, + "loss": 4.0865, + "loss/crossentropy": 2.0870128870010376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20749760419130325, + "step": 12448 + }, + { + "epoch": 0.249, + "grad_norm": 2.109375, + "grad_norm_var": 0.012971750895182292, + "learning_rate": 0.0001, + "loss": 4.2758, + "loss/crossentropy": 2.087821125984192, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21110248565673828, + "step": 12450 + }, + { + "epoch": 0.24904, + "grad_norm": 2.078125, + "grad_norm_var": 0.011494954427083334, + "learning_rate": 0.0001, + "loss": 4.3075, + "loss/crossentropy": 2.131770372390747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21426154673099518, + "step": 12452 + }, + { + "epoch": 0.24908, + "grad_norm": 2.046875, + "grad_norm_var": 0.011092122395833333, + "learning_rate": 0.0001, + "loss": 4.2373, + "loss/crossentropy": 2.175555467605591, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20418058335781097, + "step": 12454 + }, + { + "epoch": 0.24912, + "grad_norm": 2.1875, + "grad_norm_var": 0.012198893229166667, + "learning_rate": 0.0001, + "loss": 4.2788, + "loss/crossentropy": 2.205165147781372, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23265192657709122, + "step": 12456 + }, + { + "epoch": 0.24916, + "grad_norm": 2.046875, + "grad_norm_var": 0.014534250895182291, + "learning_rate": 0.0001, + "loss": 3.9345, + "loss/crossentropy": 1.6410180926322937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18058068305253983, + "step": 12458 + }, + { + "epoch": 0.2492, + "grad_norm": 1.9375, + "grad_norm_var": 0.014345041910807292, + "learning_rate": 0.0001, + "loss": 3.962, + "loss/crossentropy": 1.8249012231826782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19164791703224182, + "step": 12460 + }, + { + "epoch": 0.24924, + "grad_norm": 2.234375, + "grad_norm_var": 0.0142242431640625, + "learning_rate": 0.0001, + "loss": 4.4945, + "loss/crossentropy": 2.0906582474708557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23415963351726532, + "step": 12462 + }, + { + "epoch": 0.24928, + "grad_norm": 2.078125, + "grad_norm_var": 0.009285227457682291, + "learning_rate": 0.0001, + "loss": 4.4702, + "loss/crossentropy": 2.314555048942566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24343416839838028, + "step": 12464 + }, + { + "epoch": 0.24932, + "grad_norm": 1.9921875, + "grad_norm_var": 0.009105428059895834, + "learning_rate": 0.0001, + "loss": 4.3059, + "loss/crossentropy": 2.1258983612060547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21851430088281631, + "step": 12466 + }, + { + "epoch": 0.24936, + "grad_norm": 2.03125, + "grad_norm_var": 0.009325917561848958, + "learning_rate": 0.0001, + "loss": 4.1642, + "loss/crossentropy": 1.977954626083374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20545368641614914, + "step": 12468 + }, + { + "epoch": 0.2494, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011258951822916667, + "learning_rate": 0.0001, + "loss": 4.2224, + "loss/crossentropy": 2.0212838649749756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21720624715089798, + "step": 12470 + }, + { + "epoch": 0.24944, + "grad_norm": 2.0625, + "grad_norm_var": 0.010205078125, + "learning_rate": 0.0001, + "loss": 4.3526, + "loss/crossentropy": 1.964124321937561, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19991052150726318, + "step": 12472 + }, + { + "epoch": 0.24948, + "grad_norm": 2.078125, + "grad_norm_var": 0.008567047119140626, + "learning_rate": 0.0001, + "loss": 4.2633, + "loss/crossentropy": 1.9038777947425842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19013714790344238, + "step": 12474 + }, + { + "epoch": 0.24952, + "grad_norm": 2.171875, + "grad_norm_var": 0.008449045817057292, + "learning_rate": 0.0001, + "loss": 4.4785, + "loss/crossentropy": 2.1772372722625732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21470360457897186, + "step": 12476 + }, + { + "epoch": 0.24956, + "grad_norm": 2.109375, + "grad_norm_var": 0.006605784098307292, + "learning_rate": 0.0001, + "loss": 4.3636, + "loss/crossentropy": 2.153541684150696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23221681267023087, + "step": 12478 + }, + { + "epoch": 0.2496, + "grad_norm": 1.9765625, + "grad_norm_var": 0.006668853759765625, + "learning_rate": 0.0001, + "loss": 4.4578, + "loss/crossentropy": 2.421363592147827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20812640339136124, + "step": 12480 + }, + { + "epoch": 0.24964, + "grad_norm": 2.046875, + "grad_norm_var": 0.008188629150390625, + "learning_rate": 0.0001, + "loss": 4.0628, + "loss/crossentropy": 1.8497061133384705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18766893446445465, + "step": 12482 + }, + { + "epoch": 0.24968, + "grad_norm": 2.0625, + "grad_norm_var": 0.008894602457682291, + "learning_rate": 0.0001, + "loss": 4.1011, + "loss/crossentropy": 1.9876007437705994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1932743340730667, + "step": 12484 + }, + { + "epoch": 0.24972, + "grad_norm": 2.0625, + "grad_norm_var": 0.007616170247395833, + "learning_rate": 0.0001, + "loss": 4.2722, + "loss/crossentropy": 2.4254921674728394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23315788805484772, + "step": 12486 + }, + { + "epoch": 0.24976, + "grad_norm": 1.96875, + "grad_norm_var": 0.0079010009765625, + "learning_rate": 0.0001, + "loss": 4.1012, + "loss/crossentropy": 1.983458697795868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2190450206398964, + "step": 12488 + }, + { + "epoch": 0.2498, + "grad_norm": 2.046875, + "grad_norm_var": 0.007111612955729167, + "learning_rate": 0.0001, + "loss": 4.0284, + "loss/crossentropy": 1.8051987886428833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19166412204504013, + "step": 12490 + }, + { + "epoch": 0.24984, + "grad_norm": 2.140625, + "grad_norm_var": 0.0052398681640625, + "learning_rate": 0.0001, + "loss": 4.2385, + "loss/crossentropy": 2.1552056670188904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2053346112370491, + "step": 12492 + }, + { + "epoch": 0.24988, + "grad_norm": 2.0625, + "grad_norm_var": 0.004992421468098958, + "learning_rate": 0.0001, + "loss": 4.0157, + "loss/crossentropy": 1.9644648432731628, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17631876468658447, + "step": 12494 + }, + { + "epoch": 0.24992, + "grad_norm": 1.90625, + "grad_norm_var": 0.006089019775390625, + "learning_rate": 0.0001, + "loss": 4.1626, + "loss/crossentropy": 2.027154862880707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21671167761087418, + "step": 12496 + }, + { + "epoch": 0.24996, + "grad_norm": 2.125, + "grad_norm_var": 0.00628662109375, + "learning_rate": 0.0001, + "loss": 4.2485, + "loss/crossentropy": 2.2398791313171387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22507991641759872, + "step": 12498 + }, + { + "epoch": 0.25, + "grad_norm": 1.9296875, + "grad_norm_var": 0.006232706705729166, + "learning_rate": 0.0001, + "loss": 4.13, + "loss/crossentropy": 2.2342909574508667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2192724049091339, + "step": 12500 + }, + { + "epoch": 0.25004, + "grad_norm": 2.0, + "grad_norm_var": 0.0054443359375, + "learning_rate": 0.0001, + "loss": 4.2779, + "loss/crossentropy": 1.8621744513511658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1882321536540985, + "step": 12502 + }, + { + "epoch": 0.25008, + "grad_norm": 2.109375, + "grad_norm_var": 0.004988606770833333, + "learning_rate": 0.0001, + "loss": 4.4516, + "loss/crossentropy": 2.2012354135513306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2128412127494812, + "step": 12504 + }, + { + "epoch": 0.25012, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005873362223307292, + "learning_rate": 0.0001, + "loss": 4.0122, + "loss/crossentropy": 1.7629758715629578, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1928938776254654, + "step": 12506 + }, + { + "epoch": 0.25016, + "grad_norm": 2.0625, + "grad_norm_var": 0.005730946858723958, + "learning_rate": 0.0001, + "loss": 4.1563, + "loss/crossentropy": 2.076514720916748, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20530561357736588, + "step": 12508 + }, + { + "epoch": 0.2502, + "grad_norm": 1.9453125, + "grad_norm_var": 0.006207021077473959, + "learning_rate": 0.0001, + "loss": 4.2005, + "loss/crossentropy": 2.004107654094696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21081873774528503, + "step": 12510 + }, + { + "epoch": 0.25024, + "grad_norm": 2.234375, + "grad_norm_var": 0.008013661702473958, + "learning_rate": 0.0001, + "loss": 4.3323, + "loss/crossentropy": 2.232061505317688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21327269077301025, + "step": 12512 + }, + { + "epoch": 0.25028, + "grad_norm": 2.0625, + "grad_norm_var": 0.0068662007649739586, + "learning_rate": 0.0001, + "loss": 4.1507, + "loss/crossentropy": 2.1506210565567017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20797627419233322, + "step": 12514 + }, + { + "epoch": 0.25032, + "grad_norm": 2.03125, + "grad_norm_var": 0.008207194010416667, + "learning_rate": 0.0001, + "loss": 4.1999, + "loss/crossentropy": 2.084704279899597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25306878983974457, + "step": 12516 + }, + { + "epoch": 0.25036, + "grad_norm": 2.0625, + "grad_norm_var": 0.008137003580729166, + "learning_rate": 0.0001, + "loss": 4.3286, + "loss/crossentropy": 2.000536620616913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20802763104438782, + "step": 12518 + }, + { + "epoch": 0.2504, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0088531494140625, + "learning_rate": 0.0001, + "loss": 4.1324, + "loss/crossentropy": 2.2391778230667114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2171899899840355, + "step": 12520 + }, + { + "epoch": 0.25044, + "grad_norm": 1.9453125, + "grad_norm_var": 0.011375935872395833, + "learning_rate": 0.0001, + "loss": 4.458, + "loss/crossentropy": 2.1465210914611816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2026364952325821, + "step": 12522 + }, + { + "epoch": 0.25048, + "grad_norm": 1.9296875, + "grad_norm_var": 0.011226145426432292, + "learning_rate": 0.0001, + "loss": 4.0804, + "loss/crossentropy": 1.8129625916481018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17917531728744507, + "step": 12524 + }, + { + "epoch": 0.25052, + "grad_norm": 2.0, + "grad_norm_var": 0.010029856363932292, + "learning_rate": 0.0001, + "loss": 4.1832, + "loss/crossentropy": 1.98322331905365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20269985496997833, + "step": 12526 + }, + { + "epoch": 0.25056, + "grad_norm": 2.1875, + "grad_norm_var": 0.008955637613932291, + "learning_rate": 0.0001, + "loss": 4.373, + "loss/crossentropy": 2.196588397026062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23827942460775375, + "step": 12528 + }, + { + "epoch": 0.2506, + "grad_norm": 2.09375, + "grad_norm_var": 0.009209950764973959, + "learning_rate": 0.0001, + "loss": 4.3811, + "loss/crossentropy": 2.183007001876831, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2071165144443512, + "step": 12530 + }, + { + "epoch": 0.25064, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0106353759765625, + "learning_rate": 0.0001, + "loss": 4.4205, + "loss/crossentropy": 2.2728021144866943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21004344522953033, + "step": 12532 + }, + { + "epoch": 0.25068, + "grad_norm": 2.125, + "grad_norm_var": 0.0111083984375, + "learning_rate": 0.0001, + "loss": 4.5478, + "loss/crossentropy": 2.280518889427185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22367073595523834, + "step": 12534 + }, + { + "epoch": 0.25072, + "grad_norm": 2.25, + "grad_norm_var": 0.015038045247395833, + "learning_rate": 0.0001, + "loss": 4.135, + "loss/crossentropy": 2.088103711605072, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2038320079445839, + "step": 12536 + }, + { + "epoch": 0.25076, + "grad_norm": 2.140625, + "grad_norm_var": 0.012837727864583334, + "learning_rate": 0.0001, + "loss": 4.1798, + "loss/crossentropy": 1.646530568599701, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18982253968715668, + "step": 12538 + }, + { + "epoch": 0.2508, + "grad_norm": 2.09375, + "grad_norm_var": 0.013622792561848958, + "learning_rate": 0.0001, + "loss": 4.2246, + "loss/crossentropy": 2.1743921041488647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22835461795330048, + "step": 12540 + }, + { + "epoch": 0.25084, + "grad_norm": 1.9375, + "grad_norm_var": 0.0151123046875, + "learning_rate": 0.0001, + "loss": 4.3138, + "loss/crossentropy": 2.2216947078704834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21978579461574554, + "step": 12542 + }, + { + "epoch": 0.25088, + "grad_norm": 1.9921875, + "grad_norm_var": 0.014890289306640625, + "learning_rate": 0.0001, + "loss": 4.298, + "loss/crossentropy": 2.141101062297821, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21347828209400177, + "step": 12544 + }, + { + "epoch": 0.25092, + "grad_norm": 2.078125, + "grad_norm_var": 0.014861806233723959, + "learning_rate": 0.0001, + "loss": 4.1534, + "loss/crossentropy": 2.1961969137191772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2192479968070984, + "step": 12546 + }, + { + "epoch": 0.25096, + "grad_norm": 1.9609375, + "grad_norm_var": 0.014274088541666667, + "learning_rate": 0.0001, + "loss": 3.9219, + "loss/crossentropy": 1.9632240533828735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19182069599628448, + "step": 12548 + }, + { + "epoch": 0.251, + "grad_norm": 2.359375, + "grad_norm_var": 0.02173639933268229, + "learning_rate": 0.0001, + "loss": 4.0594, + "loss/crossentropy": 1.7821694612503052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24946660548448563, + "step": 12550 + }, + { + "epoch": 0.25104, + "grad_norm": 2.078125, + "grad_norm_var": 0.016290028889973957, + "learning_rate": 0.0001, + "loss": 4.2423, + "loss/crossentropy": 2.261335611343384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23269569873809814, + "step": 12552 + }, + { + "epoch": 0.25108, + "grad_norm": 1.921875, + "grad_norm_var": 0.01762669881184896, + "learning_rate": 0.0001, + "loss": 3.9858, + "loss/crossentropy": 2.2365309596061707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2119913324713707, + "step": 12554 + }, + { + "epoch": 0.25112, + "grad_norm": 2.078125, + "grad_norm_var": 0.0176025390625, + "learning_rate": 0.0001, + "loss": 3.9798, + "loss/crossentropy": 2.0685681104660034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19985978305339813, + "step": 12556 + }, + { + "epoch": 0.25116, + "grad_norm": 2.015625, + "grad_norm_var": 0.0144683837890625, + "learning_rate": 0.0001, + "loss": 4.0231, + "loss/crossentropy": 2.0773105025291443, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2006489858031273, + "step": 12558 + }, + { + "epoch": 0.2512, + "grad_norm": 2.09375, + "grad_norm_var": 0.01561279296875, + "learning_rate": 0.0001, + "loss": 4.1948, + "loss/crossentropy": 2.1145309805870056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20449287444353104, + "step": 12560 + }, + { + "epoch": 0.25124, + "grad_norm": 1.828125, + "grad_norm_var": 0.017097981770833333, + "learning_rate": 0.0001, + "loss": 4.0543, + "loss/crossentropy": 2.109993577003479, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20761344581842422, + "step": 12562 + }, + { + "epoch": 0.25128, + "grad_norm": 1.9375, + "grad_norm_var": 0.016755167643229166, + "learning_rate": 0.0001, + "loss": 4.1661, + "loss/crossentropy": 1.9653338193893433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2069985270500183, + "step": 12564 + }, + { + "epoch": 0.25132, + "grad_norm": 2.015625, + "grad_norm_var": 0.007045237223307291, + "learning_rate": 0.0001, + "loss": 4.101, + "loss/crossentropy": 1.7084832191467285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18922459334135056, + "step": 12566 + }, + { + "epoch": 0.25136, + "grad_norm": 2.046875, + "grad_norm_var": 0.006494140625, + "learning_rate": 0.0001, + "loss": 3.9288, + "loss/crossentropy": 1.8007041215896606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2122679501771927, + "step": 12568 + }, + { + "epoch": 0.2514, + "grad_norm": 2.125, + "grad_norm_var": 0.007972971598307291, + "learning_rate": 0.0001, + "loss": 4.3199, + "loss/crossentropy": 2.041845440864563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2219957411289215, + "step": 12570 + }, + { + "epoch": 0.25144, + "grad_norm": 1.8671875, + "grad_norm_var": 0.008577219645182292, + "learning_rate": 0.0001, + "loss": 4.3703, + "loss/crossentropy": 2.2668861150741577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2117021307349205, + "step": 12572 + }, + { + "epoch": 0.25148, + "grad_norm": 1.890625, + "grad_norm_var": 0.010235341389973958, + "learning_rate": 0.0001, + "loss": 3.8462, + "loss/crossentropy": 1.8246251940727234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1822521835565567, + "step": 12574 + }, + { + "epoch": 0.25152, + "grad_norm": 2.046875, + "grad_norm_var": 0.010553995768229166, + "learning_rate": 0.0001, + "loss": 3.8137, + "loss/crossentropy": 2.0739742517471313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2114427089691162, + "step": 12576 + }, + { + "epoch": 0.25156, + "grad_norm": 2.03125, + "grad_norm_var": 0.008906809488932292, + "learning_rate": 0.0001, + "loss": 3.9261, + "loss/crossentropy": 1.6614344120025635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18680214881896973, + "step": 12578 + }, + { + "epoch": 0.2516, + "grad_norm": 1.9765625, + "grad_norm_var": 0.01279296875, + "learning_rate": 0.0001, + "loss": 4.4042, + "loss/crossentropy": 2.5629080533981323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26097629219293594, + "step": 12580 + }, + { + "epoch": 0.25164, + "grad_norm": 2.015625, + "grad_norm_var": 0.01337890625, + "learning_rate": 0.0001, + "loss": 4.1583, + "loss/crossentropy": 2.058123230934143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2273598164319992, + "step": 12582 + }, + { + "epoch": 0.25168, + "grad_norm": 2.15625, + "grad_norm_var": 0.014314524332682292, + "learning_rate": 0.0001, + "loss": 4.2802, + "loss/crossentropy": 1.9959335327148438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1991373971104622, + "step": 12584 + }, + { + "epoch": 0.25172, + "grad_norm": 2.140625, + "grad_norm_var": 0.018293253580729165, + "learning_rate": 0.0001, + "loss": 3.9794, + "loss/crossentropy": 1.8165839314460754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1888013407588005, + "step": 12586 + }, + { + "epoch": 0.25176, + "grad_norm": 2.03125, + "grad_norm_var": 0.015221913655598959, + "learning_rate": 0.0001, + "loss": 4.0687, + "loss/crossentropy": 1.9918802976608276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18641646206378937, + "step": 12588 + }, + { + "epoch": 0.2518, + "grad_norm": 2.078125, + "grad_norm_var": 0.013378651936848958, + "learning_rate": 0.0001, + "loss": 4.3443, + "loss/crossentropy": 2.0256036520004272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19628985226154327, + "step": 12590 + }, + { + "epoch": 0.25184, + "grad_norm": 2.09375, + "grad_norm_var": 0.01236572265625, + "learning_rate": 0.0001, + "loss": 4.1317, + "loss/crossentropy": 1.948347806930542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19971590489149094, + "step": 12592 + }, + { + "epoch": 0.25188, + "grad_norm": 2.046875, + "grad_norm_var": 0.012544759114583333, + "learning_rate": 0.0001, + "loss": 3.8528, + "loss/crossentropy": 2.2025747299194336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2437238171696663, + "step": 12594 + }, + { + "epoch": 0.25192, + "grad_norm": 2.078125, + "grad_norm_var": 0.010872141520182291, + "learning_rate": 0.0001, + "loss": 4.2102, + "loss/crossentropy": 2.3209575414657593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2366873100399971, + "step": 12596 + }, + { + "epoch": 0.25196, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010846964518229167, + "learning_rate": 0.0001, + "loss": 3.8028, + "loss/crossentropy": 1.7978705763816833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19252559542655945, + "step": 12598 + }, + { + "epoch": 0.252, + "grad_norm": 2.015625, + "grad_norm_var": 0.010359446207682291, + "learning_rate": 0.0001, + "loss": 3.9269, + "loss/crossentropy": 1.964760184288025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1943562552332878, + "step": 12600 + }, + { + "epoch": 0.25204, + "grad_norm": 2.0, + "grad_norm_var": 0.005399322509765625, + "learning_rate": 0.0001, + "loss": 4.2205, + "loss/crossentropy": 2.1420929431915283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.206790953874588, + "step": 12602 + }, + { + "epoch": 0.25208, + "grad_norm": 2.234375, + "grad_norm_var": 0.008084869384765625, + "learning_rate": 0.0001, + "loss": 4.5232, + "loss/crossentropy": 2.2422659397125244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22513067722320557, + "step": 12604 + }, + { + "epoch": 0.25212, + "grad_norm": 2.75, + "grad_norm_var": 0.04026667277018229, + "learning_rate": 0.0001, + "loss": 3.8394, + "loss/crossentropy": 1.587006688117981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1833646520972252, + "step": 12606 + }, + { + "epoch": 0.25216, + "grad_norm": 2.09375, + "grad_norm_var": 0.03873291015625, + "learning_rate": 0.0001, + "loss": 4.4461, + "loss/crossentropy": 2.426867365837097, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2435208261013031, + "step": 12608 + }, + { + "epoch": 0.2522, + "grad_norm": 2.0, + "grad_norm_var": 0.040185546875, + "learning_rate": 0.0001, + "loss": 4.0039, + "loss/crossentropy": 1.9733251333236694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21772438287734985, + "step": 12610 + }, + { + "epoch": 0.25224, + "grad_norm": 2.84375, + "grad_norm_var": 0.0764312744140625, + "learning_rate": 0.0001, + "loss": 4.2712, + "loss/crossentropy": 1.4835429191589355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17013566195964813, + "step": 12612 + }, + { + "epoch": 0.25228, + "grad_norm": 2.0, + "grad_norm_var": 0.07500178019205729, + "learning_rate": 0.0001, + "loss": 3.8985, + "loss/crossentropy": 2.165693759918213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21315700560808182, + "step": 12614 + }, + { + "epoch": 0.25232, + "grad_norm": 2.09375, + "grad_norm_var": 0.07396647135416666, + "learning_rate": 0.0001, + "loss": 3.9856, + "loss/crossentropy": 2.206323266029358, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19913922995328903, + "step": 12616 + }, + { + "epoch": 0.25236, + "grad_norm": 2.015625, + "grad_norm_var": 0.07515360514322916, + "learning_rate": 0.0001, + "loss": 4.0788, + "loss/crossentropy": 2.2520995140075684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22997721284627914, + "step": 12618 + }, + { + "epoch": 0.2524, + "grad_norm": 2.140625, + "grad_norm_var": 0.07444559733072917, + "learning_rate": 0.0001, + "loss": 4.365, + "loss/crossentropy": 1.9322227239608765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20600315928459167, + "step": 12620 + }, + { + "epoch": 0.25244, + "grad_norm": 1.96875, + "grad_norm_var": 0.047459920247395836, + "learning_rate": 0.0001, + "loss": 4.0578, + "loss/crossentropy": 2.140220284461975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21072514355182648, + "step": 12622 + }, + { + "epoch": 0.25248, + "grad_norm": 1.96875, + "grad_norm_var": 0.04942118326822917, + "learning_rate": 0.0001, + "loss": 4.2316, + "loss/crossentropy": 2.061431884765625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2259845808148384, + "step": 12624 + }, + { + "epoch": 0.25252, + "grad_norm": 2.125, + "grad_norm_var": 0.047055816650390624, + "learning_rate": 0.0001, + "loss": 4.14, + "loss/crossentropy": 2.323551654815674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23168490827083588, + "step": 12626 + }, + { + "epoch": 0.25256, + "grad_norm": 1.890625, + "grad_norm_var": 0.008335113525390625, + "learning_rate": 0.0001, + "loss": 4.0036, + "loss/crossentropy": 1.93999582529068, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21139459311962128, + "step": 12628 + }, + { + "epoch": 0.2526, + "grad_norm": 2.140625, + "grad_norm_var": 0.011173248291015625, + "learning_rate": 0.0001, + "loss": 4.207, + "loss/crossentropy": 1.9334313869476318, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21537292003631592, + "step": 12630 + }, + { + "epoch": 0.25264, + "grad_norm": 2.0, + "grad_norm_var": 0.010477447509765625, + "learning_rate": 0.0001, + "loss": 4.1674, + "loss/crossentropy": 2.145465135574341, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22080931067466736, + "step": 12632 + }, + { + "epoch": 0.25268, + "grad_norm": 1.9375, + "grad_norm_var": 0.011668904622395834, + "learning_rate": 0.0001, + "loss": 4.0058, + "loss/crossentropy": 1.9708059430122375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2007170245051384, + "step": 12634 + }, + { + "epoch": 0.25272, + "grad_norm": 2.015625, + "grad_norm_var": 0.0115631103515625, + "learning_rate": 0.0001, + "loss": 3.9649, + "loss/crossentropy": 1.9522746801376343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20030274242162704, + "step": 12636 + }, + { + "epoch": 0.25276, + "grad_norm": 2.0625, + "grad_norm_var": 0.012035878499348958, + "learning_rate": 0.0001, + "loss": 4.0416, + "loss/crossentropy": 1.856387436389923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19685833156108856, + "step": 12638 + }, + { + "epoch": 0.2528, + "grad_norm": 2.296875, + "grad_norm_var": 0.013787587483723959, + "learning_rate": 0.0001, + "loss": 4.475, + "loss/crossentropy": 1.9449518322944641, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20622673630714417, + "step": 12640 + }, + { + "epoch": 0.25284, + "grad_norm": 2.25, + "grad_norm_var": 0.017533365885416666, + "learning_rate": 0.0001, + "loss": 4.4683, + "loss/crossentropy": 2.4559473991394043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21476459503173828, + "step": 12642 + }, + { + "epoch": 0.25288, + "grad_norm": 1.9296875, + "grad_norm_var": 0.016778310139973957, + "learning_rate": 0.0001, + "loss": 4.3164, + "loss/crossentropy": 2.238897919654846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22538839280605316, + "step": 12644 + }, + { + "epoch": 0.25292, + "grad_norm": 2.078125, + "grad_norm_var": 0.013944498697916667, + "learning_rate": 0.0001, + "loss": 3.9801, + "loss/crossentropy": 1.7506417036056519, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18603645265102386, + "step": 12646 + }, + { + "epoch": 0.25296, + "grad_norm": 1.9609375, + "grad_norm_var": 0.014359283447265624, + "learning_rate": 0.0001, + "loss": 4.1238, + "loss/crossentropy": 2.162258505821228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21168682724237442, + "step": 12648 + }, + { + "epoch": 0.253, + "grad_norm": 2.171875, + "grad_norm_var": 0.015154774983723958, + "learning_rate": 0.0001, + "loss": 4.0441, + "loss/crossentropy": 1.9122841954231262, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20636393874883652, + "step": 12650 + }, + { + "epoch": 0.25304, + "grad_norm": 2.125, + "grad_norm_var": 0.015187327067057292, + "learning_rate": 0.0001, + "loss": 4.3566, + "loss/crossentropy": 2.1501541137695312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22346170246601105, + "step": 12652 + }, + { + "epoch": 0.25308, + "grad_norm": 2.015625, + "grad_norm_var": 0.01456298828125, + "learning_rate": 0.0001, + "loss": 4.0914, + "loss/crossentropy": 1.797228217124939, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17510685324668884, + "step": 12654 + }, + { + "epoch": 0.25312, + "grad_norm": 1.9765625, + "grad_norm_var": 0.010422515869140624, + "learning_rate": 0.0001, + "loss": 3.9232, + "loss/crossentropy": 1.7334046363830566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19618064910173416, + "step": 12656 + }, + { + "epoch": 0.25316, + "grad_norm": 2.109375, + "grad_norm_var": 0.0066912333170572914, + "learning_rate": 0.0001, + "loss": 4.3287, + "loss/crossentropy": 1.99091237783432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21426931023597717, + "step": 12658 + }, + { + "epoch": 0.2532, + "grad_norm": 2.09375, + "grad_norm_var": 0.0069244384765625, + "learning_rate": 0.0001, + "loss": 4.5036, + "loss/crossentropy": 2.1908310651779175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20360572636127472, + "step": 12660 + }, + { + "epoch": 0.25324, + "grad_norm": 1.9140625, + "grad_norm_var": 0.00787353515625, + "learning_rate": 0.0001, + "loss": 3.9746, + "loss/crossentropy": 1.7925593852996826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18203188478946686, + "step": 12662 + }, + { + "epoch": 0.25328, + "grad_norm": 2.078125, + "grad_norm_var": 0.009006500244140625, + "learning_rate": 0.0001, + "loss": 3.8955, + "loss/crossentropy": 1.9464862942695618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1896686628460884, + "step": 12664 + }, + { + "epoch": 0.25332, + "grad_norm": 2.046875, + "grad_norm_var": 0.006379954020182292, + "learning_rate": 0.0001, + "loss": 3.9922, + "loss/crossentropy": 2.0207581520080566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20436276495456696, + "step": 12666 + }, + { + "epoch": 0.25336, + "grad_norm": 2.09375, + "grad_norm_var": 0.005322011311848959, + "learning_rate": 0.0001, + "loss": 4.141, + "loss/crossentropy": 1.9915068745613098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21265029907226562, + "step": 12668 + }, + { + "epoch": 0.2534, + "grad_norm": 2.015625, + "grad_norm_var": 0.005204010009765625, + "learning_rate": 0.0001, + "loss": 4.2602, + "loss/crossentropy": 1.787190020084381, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20195094496011734, + "step": 12670 + }, + { + "epoch": 0.25344, + "grad_norm": 2.0625, + "grad_norm_var": 0.005036417643229167, + "learning_rate": 0.0001, + "loss": 4.3554, + "loss/crossentropy": 2.2403881549835205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2313152700662613, + "step": 12672 + }, + { + "epoch": 0.25348, + "grad_norm": 1.9921875, + "grad_norm_var": 0.00540771484375, + "learning_rate": 0.0001, + "loss": 3.9895, + "loss/crossentropy": 1.6122660636901855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19166506081819534, + "step": 12674 + }, + { + "epoch": 0.25352, + "grad_norm": 2.171875, + "grad_norm_var": 0.005402628580729167, + "learning_rate": 0.0001, + "loss": 4.1966, + "loss/crossentropy": 2.0758888125419617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19904160499572754, + "step": 12676 + }, + { + "epoch": 0.25356, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0064656575520833336, + "learning_rate": 0.0001, + "loss": 4.2466, + "loss/crossentropy": 1.8765565156936646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17384758591651917, + "step": 12678 + }, + { + "epoch": 0.2536, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0054705301920572914, + "learning_rate": 0.0001, + "loss": 4.0977, + "loss/crossentropy": 2.1322853565216064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21987880766391754, + "step": 12680 + }, + { + "epoch": 0.25364, + "grad_norm": 2.0, + "grad_norm_var": 0.006196848551432292, + "learning_rate": 0.0001, + "loss": 3.9862, + "loss/crossentropy": 2.0303893089294434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20912615954875946, + "step": 12682 + }, + { + "epoch": 0.25368, + "grad_norm": 2.109375, + "grad_norm_var": 0.0065826416015625, + "learning_rate": 0.0001, + "loss": 4.0022, + "loss/crossentropy": 2.2314319610595703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22904963046312332, + "step": 12684 + }, + { + "epoch": 0.25372, + "grad_norm": 2.140625, + "grad_norm_var": 0.007438151041666666, + "learning_rate": 0.0001, + "loss": 4.1854, + "loss/crossentropy": 1.9751350283622742, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20683745294809341, + "step": 12686 + }, + { + "epoch": 0.25376, + "grad_norm": 2.09375, + "grad_norm_var": 0.0088043212890625, + "learning_rate": 0.0001, + "loss": 4.112, + "loss/crossentropy": 1.9198334217071533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19134660065174103, + "step": 12688 + }, + { + "epoch": 0.2538, + "grad_norm": 1.890625, + "grad_norm_var": 0.009214019775390625, + "learning_rate": 0.0001, + "loss": 3.8783, + "loss/crossentropy": 1.945135474205017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20948659628629684, + "step": 12690 + }, + { + "epoch": 0.25384, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0094879150390625, + "learning_rate": 0.0001, + "loss": 4.1018, + "loss/crossentropy": 2.146402955055237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21819791197776794, + "step": 12692 + }, + { + "epoch": 0.25388, + "grad_norm": 2.09375, + "grad_norm_var": 0.007806142171223958, + "learning_rate": 0.0001, + "loss": 4.3696, + "loss/crossentropy": 2.309106230735779, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2160014659166336, + "step": 12694 + }, + { + "epoch": 0.25392, + "grad_norm": 1.9765625, + "grad_norm_var": 0.007616170247395833, + "learning_rate": 0.0001, + "loss": 4.0637, + "loss/crossentropy": 2.1726107597351074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20791994035243988, + "step": 12696 + }, + { + "epoch": 0.25396, + "grad_norm": 2.09375, + "grad_norm_var": 0.007370758056640625, + "learning_rate": 0.0001, + "loss": 4.252, + "loss/crossentropy": 2.0905996561050415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2083379328250885, + "step": 12698 + }, + { + "epoch": 0.254, + "grad_norm": 2.078125, + "grad_norm_var": 0.008185831705729167, + "learning_rate": 0.0001, + "loss": 4.197, + "loss/crossentropy": 1.9190585017204285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1856430396437645, + "step": 12700 + }, + { + "epoch": 0.25404, + "grad_norm": 2.125, + "grad_norm_var": 0.007877604166666666, + "learning_rate": 0.0001, + "loss": 4.3906, + "loss/crossentropy": 2.2493419647216797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20197859406471252, + "step": 12702 + }, + { + "epoch": 0.25408, + "grad_norm": 2.171875, + "grad_norm_var": 0.007731119791666667, + "learning_rate": 0.0001, + "loss": 4.1935, + "loss/crossentropy": 2.0205613374710083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2155362293124199, + "step": 12704 + }, + { + "epoch": 0.25412, + "grad_norm": 2.078125, + "grad_norm_var": 0.006257120768229167, + "learning_rate": 0.0001, + "loss": 4.188, + "loss/crossentropy": 1.8853323459625244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20137012749910355, + "step": 12706 + }, + { + "epoch": 0.25416, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0061431884765625, + "learning_rate": 0.0001, + "loss": 4.098, + "loss/crossentropy": 2.109869122505188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2036309316754341, + "step": 12708 + }, + { + "epoch": 0.2542, + "grad_norm": 2.078125, + "grad_norm_var": 0.005890909830729167, + "learning_rate": 0.0001, + "loss": 4.1782, + "loss/crossentropy": 2.0598954558372498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22793744504451752, + "step": 12710 + }, + { + "epoch": 0.25424, + "grad_norm": 2.171875, + "grad_norm_var": 0.0055735270182291664, + "learning_rate": 0.0001, + "loss": 4.2984, + "loss/crossentropy": 2.169256567955017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20931441336870193, + "step": 12712 + }, + { + "epoch": 0.25428, + "grad_norm": 2.09375, + "grad_norm_var": 0.005197906494140625, + "learning_rate": 0.0001, + "loss": 4.2715, + "loss/crossentropy": 1.8178632855415344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20344894379377365, + "step": 12714 + }, + { + "epoch": 0.25432, + "grad_norm": 2.046875, + "grad_norm_var": 0.004327138264973958, + "learning_rate": 0.0001, + "loss": 4.0897, + "loss/crossentropy": 1.6468743085861206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16709627211093903, + "step": 12716 + }, + { + "epoch": 0.25436, + "grad_norm": 2.171875, + "grad_norm_var": 0.004748280843098958, + "learning_rate": 0.0001, + "loss": 4.1702, + "loss/crossentropy": 1.8353837728500366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20837673544883728, + "step": 12718 + }, + { + "epoch": 0.2544, + "grad_norm": 1.9140625, + "grad_norm_var": 0.005492146809895833, + "learning_rate": 0.0001, + "loss": 3.8056, + "loss/crossentropy": 1.7881666421890259, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18411727994680405, + "step": 12720 + }, + { + "epoch": 0.25444, + "grad_norm": 2.109375, + "grad_norm_var": 0.0065958658854166664, + "learning_rate": 0.0001, + "loss": 4.5369, + "loss/crossentropy": 2.19934618473053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22607161849737167, + "step": 12722 + }, + { + "epoch": 0.25448, + "grad_norm": 2.328125, + "grad_norm_var": 0.008821360270182292, + "learning_rate": 0.0001, + "loss": 4.5237, + "loss/crossentropy": 1.9789779782295227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2156950756907463, + "step": 12724 + }, + { + "epoch": 0.25452, + "grad_norm": 1.921875, + "grad_norm_var": 0.010778554280598958, + "learning_rate": 0.0001, + "loss": 4.1059, + "loss/crossentropy": 2.0605525970458984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20622699707746506, + "step": 12726 + }, + { + "epoch": 0.25456, + "grad_norm": 1.9296875, + "grad_norm_var": 0.011336263020833333, + "learning_rate": 0.0001, + "loss": 4.2862, + "loss/crossentropy": 2.1131407022476196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21070127934217453, + "step": 12728 + }, + { + "epoch": 0.2546, + "grad_norm": 2.09375, + "grad_norm_var": 0.011336263020833333, + "learning_rate": 0.0001, + "loss": 4.3789, + "loss/crossentropy": 1.9708096981048584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22219926118850708, + "step": 12730 + }, + { + "epoch": 0.25464, + "grad_norm": 2.171875, + "grad_norm_var": 0.011918131510416667, + "learning_rate": 0.0001, + "loss": 4.2985, + "loss/crossentropy": 2.220608353614807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2271113097667694, + "step": 12732 + }, + { + "epoch": 0.25468, + "grad_norm": 2.046875, + "grad_norm_var": 0.01146240234375, + "learning_rate": 0.0001, + "loss": 4.2097, + "loss/crossentropy": 2.535509705543518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24400310218334198, + "step": 12734 + }, + { + "epoch": 0.25472, + "grad_norm": 2.390625, + "grad_norm_var": 0.015952301025390626, + "learning_rate": 0.0001, + "loss": 4.6837, + "loss/crossentropy": 2.3619974851608276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2268039956688881, + "step": 12736 + }, + { + "epoch": 0.25476, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0168212890625, + "learning_rate": 0.0001, + "loss": 3.9929, + "loss/crossentropy": 2.0095953941345215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20850248634815216, + "step": 12738 + }, + { + "epoch": 0.2548, + "grad_norm": 2.03125, + "grad_norm_var": 0.012276204427083333, + "learning_rate": 0.0001, + "loss": 4.1466, + "loss/crossentropy": 1.934277892112732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2116999626159668, + "step": 12740 + }, + { + "epoch": 0.25484, + "grad_norm": 2.03125, + "grad_norm_var": 0.011701456705729167, + "learning_rate": 0.0001, + "loss": 4.336, + "loss/crossentropy": 2.237201452255249, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22638197988271713, + "step": 12742 + }, + { + "epoch": 0.25488, + "grad_norm": 2.0, + "grad_norm_var": 0.011464182535807292, + "learning_rate": 0.0001, + "loss": 4.0354, + "loss/crossentropy": 2.302059292793274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22285569459199905, + "step": 12744 + }, + { + "epoch": 0.25492, + "grad_norm": 1.8671875, + "grad_norm_var": 0.013944244384765625, + "learning_rate": 0.0001, + "loss": 3.7097, + "loss/crossentropy": 1.7690886855125427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19209770113229752, + "step": 12746 + }, + { + "epoch": 0.25496, + "grad_norm": 2.171875, + "grad_norm_var": 0.013944244384765625, + "learning_rate": 0.0001, + "loss": 4.2324, + "loss/crossentropy": 2.4372475147247314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2192872166633606, + "step": 12748 + }, + { + "epoch": 0.255, + "grad_norm": 1.8984375, + "grad_norm_var": 0.016405232747395835, + "learning_rate": 0.0001, + "loss": 4.2627, + "loss/crossentropy": 2.12698757648468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21698793768882751, + "step": 12750 + }, + { + "epoch": 0.25504, + "grad_norm": 2.078125, + "grad_norm_var": 0.007710520426432292, + "learning_rate": 0.0001, + "loss": 3.9991, + "loss/crossentropy": 1.989953875541687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1969183310866356, + "step": 12752 + }, + { + "epoch": 0.25508, + "grad_norm": 2.671875, + "grad_norm_var": 0.034501139322916666, + "learning_rate": 0.0001, + "loss": 4.5505, + "loss/crossentropy": 2.2911970615386963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21863195300102234, + "step": 12754 + }, + { + "epoch": 0.25512, + "grad_norm": 2.09375, + "grad_norm_var": 0.03460286458333333, + "learning_rate": 0.0001, + "loss": 4.0915, + "loss/crossentropy": 1.8447460532188416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20167672634124756, + "step": 12756 + }, + { + "epoch": 0.25516, + "grad_norm": 1.96875, + "grad_norm_var": 0.035471343994140626, + "learning_rate": 0.0001, + "loss": 3.8488, + "loss/crossentropy": 2.043856382369995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2075670212507248, + "step": 12758 + }, + { + "epoch": 0.2552, + "grad_norm": 2.15625, + "grad_norm_var": 0.037536366780598955, + "learning_rate": 0.0001, + "loss": 4.0893, + "loss/crossentropy": 1.735123872756958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19347237050533295, + "step": 12760 + }, + { + "epoch": 0.25524, + "grad_norm": 2.015625, + "grad_norm_var": 0.034407552083333334, + "learning_rate": 0.0001, + "loss": 4.0586, + "loss/crossentropy": 2.1058340072631836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21859320253133774, + "step": 12762 + }, + { + "epoch": 0.25528, + "grad_norm": 2.171875, + "grad_norm_var": 0.03444722493489583, + "learning_rate": 0.0001, + "loss": 4.2518, + "loss/crossentropy": 2.0583395957946777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.224592886865139, + "step": 12764 + }, + { + "epoch": 0.25532, + "grad_norm": 1.9921875, + "grad_norm_var": 0.032206217447916664, + "learning_rate": 0.0001, + "loss": 4.3562, + "loss/crossentropy": 2.2550116777420044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23644013702869415, + "step": 12766 + }, + { + "epoch": 0.25536, + "grad_norm": 2.015625, + "grad_norm_var": 0.032293446858723956, + "learning_rate": 0.0001, + "loss": 4.2338, + "loss/crossentropy": 2.463193655014038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2229352444410324, + "step": 12768 + }, + { + "epoch": 0.2554, + "grad_norm": 2.0, + "grad_norm_var": 0.006400299072265625, + "learning_rate": 0.0001, + "loss": 4.016, + "loss/crossentropy": 2.010310709476471, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20725534856319427, + "step": 12770 + }, + { + "epoch": 0.25544, + "grad_norm": 1.921875, + "grad_norm_var": 0.006219228108723958, + "learning_rate": 0.0001, + "loss": 4.1988, + "loss/crossentropy": 2.2002042531967163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21686428785324097, + "step": 12772 + }, + { + "epoch": 0.25548, + "grad_norm": 1.9921875, + "grad_norm_var": 0.009690093994140624, + "learning_rate": 0.0001, + "loss": 4.2974, + "loss/crossentropy": 2.072916865348816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2210928201675415, + "step": 12774 + }, + { + "epoch": 0.25552, + "grad_norm": 2.015625, + "grad_norm_var": 0.010149892171223958, + "learning_rate": 0.0001, + "loss": 4.3168, + "loss/crossentropy": 2.068341016769409, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20776809751987457, + "step": 12776 + }, + { + "epoch": 0.25556, + "grad_norm": 1.9296875, + "grad_norm_var": 0.010949452718098959, + "learning_rate": 0.0001, + "loss": 3.8552, + "loss/crossentropy": 1.6145030856132507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2056947946548462, + "step": 12778 + }, + { + "epoch": 0.2556, + "grad_norm": 2.03125, + "grad_norm_var": 0.011311848958333334, + "learning_rate": 0.0001, + "loss": 4.0132, + "loss/crossentropy": 2.1529496908187866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21802741289138794, + "step": 12780 + }, + { + "epoch": 0.25564, + "grad_norm": 2.03125, + "grad_norm_var": 0.011226399739583334, + "learning_rate": 0.0001, + "loss": 4.1708, + "loss/crossentropy": 2.255640387535095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21794818341732025, + "step": 12782 + }, + { + "epoch": 0.25568, + "grad_norm": 2.03125, + "grad_norm_var": 0.011546834309895834, + "learning_rate": 0.0001, + "loss": 4.1102, + "loss/crossentropy": 1.9934669137001038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21103205531835556, + "step": 12784 + }, + { + "epoch": 0.25572, + "grad_norm": 2.125, + "grad_norm_var": 0.011864217122395833, + "learning_rate": 0.0001, + "loss": 4.2924, + "loss/crossentropy": 1.9404807090759277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22771108150482178, + "step": 12786 + }, + { + "epoch": 0.25576, + "grad_norm": 2.078125, + "grad_norm_var": 0.012084706624348959, + "learning_rate": 0.0001, + "loss": 4.0444, + "loss/crossentropy": 1.7803818583488464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19245180487632751, + "step": 12788 + }, + { + "epoch": 0.2558, + "grad_norm": 2.078125, + "grad_norm_var": 0.009423828125, + "learning_rate": 0.0001, + "loss": 4.2033, + "loss/crossentropy": 2.108216881752014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2239757925271988, + "step": 12790 + }, + { + "epoch": 0.25584, + "grad_norm": 2.0625, + "grad_norm_var": 0.0059478759765625, + "learning_rate": 0.0001, + "loss": 4.1999, + "loss/crossentropy": 2.056196451187134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20573420077562332, + "step": 12792 + }, + { + "epoch": 0.25588, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007275390625, + "learning_rate": 0.0001, + "loss": 3.9749, + "loss/crossentropy": 2.2215099334716797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21107079833745956, + "step": 12794 + }, + { + "epoch": 0.25592, + "grad_norm": 1.796875, + "grad_norm_var": 0.009281158447265625, + "learning_rate": 0.0001, + "loss": 3.9527, + "loss/crossentropy": 1.947695553302765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19738437235355377, + "step": 12796 + }, + { + "epoch": 0.25596, + "grad_norm": 2.0625, + "grad_norm_var": 0.010204823811848958, + "learning_rate": 0.0001, + "loss": 4.1072, + "loss/crossentropy": 2.3351560831069946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21714143455028534, + "step": 12798 + }, + { + "epoch": 0.256, + "grad_norm": 2.0625, + "grad_norm_var": 0.009798177083333333, + "learning_rate": 0.0001, + "loss": 4.3055, + "loss/crossentropy": 2.399898648262024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25758983194828033, + "step": 12800 + }, + { + "epoch": 0.25604, + "grad_norm": 2.671875, + "grad_norm_var": 0.03814697265625, + "learning_rate": 0.0001, + "loss": 4.5632, + "loss/crossentropy": 2.3537880182266235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2976065129041672, + "step": 12802 + }, + { + "epoch": 0.25608, + "grad_norm": 1.921875, + "grad_norm_var": 0.03857421875, + "learning_rate": 0.0001, + "loss": 3.8394, + "loss/crossentropy": 1.7765586972236633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1823679357767105, + "step": 12804 + }, + { + "epoch": 0.25612, + "grad_norm": 2.1875, + "grad_norm_var": 0.039793904622395834, + "learning_rate": 0.0001, + "loss": 4.4023, + "loss/crossentropy": 2.119332432746887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23052836954593658, + "step": 12806 + }, + { + "epoch": 0.25616, + "grad_norm": 2.109375, + "grad_norm_var": 0.0414947509765625, + "learning_rate": 0.0001, + "loss": 4.0251, + "loss/crossentropy": 1.8325288891792297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19811426103115082, + "step": 12808 + }, + { + "epoch": 0.2562, + "grad_norm": 1.953125, + "grad_norm_var": 0.039896392822265626, + "learning_rate": 0.0001, + "loss": 4.1089, + "loss/crossentropy": 2.079294800758362, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21122989803552628, + "step": 12810 + }, + { + "epoch": 0.25624, + "grad_norm": 2.078125, + "grad_norm_var": 0.03581110636393229, + "learning_rate": 0.0001, + "loss": 4.0105, + "loss/crossentropy": 2.029142141342163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2008090615272522, + "step": 12812 + }, + { + "epoch": 0.25628, + "grad_norm": 1.9453125, + "grad_norm_var": 0.03578058878580729, + "learning_rate": 0.0001, + "loss": 4.1743, + "loss/crossentropy": 1.8697097301483154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19589021801948547, + "step": 12814 + }, + { + "epoch": 0.25632, + "grad_norm": 1.8203125, + "grad_norm_var": 0.038331858317057294, + "learning_rate": 0.0001, + "loss": 3.8837, + "loss/crossentropy": 1.8488793969154358, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19999422132968903, + "step": 12816 + }, + { + "epoch": 0.25636, + "grad_norm": 2.109375, + "grad_norm_var": 0.011946360270182291, + "learning_rate": 0.0001, + "loss": 4.236, + "loss/crossentropy": 2.352132201194763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23569129407405853, + "step": 12818 + }, + { + "epoch": 0.2564, + "grad_norm": 2.015625, + "grad_norm_var": 0.010155232747395833, + "learning_rate": 0.0001, + "loss": 4.2247, + "loss/crossentropy": 2.213895559310913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19782276451587677, + "step": 12820 + }, + { + "epoch": 0.25644, + "grad_norm": 1.96875, + "grad_norm_var": 0.008876291910807292, + "learning_rate": 0.0001, + "loss": 4.205, + "loss/crossentropy": 2.328965425491333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2138378769159317, + "step": 12822 + }, + { + "epoch": 0.25648, + "grad_norm": 1.9375, + "grad_norm_var": 0.007407379150390625, + "learning_rate": 0.0001, + "loss": 3.9178, + "loss/crossentropy": 2.148880124092102, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20276722311973572, + "step": 12824 + }, + { + "epoch": 0.25652, + "grad_norm": 2.109375, + "grad_norm_var": 0.007124582926432292, + "learning_rate": 0.0001, + "loss": 4.2742, + "loss/crossentropy": 2.1533660888671875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22211932390928268, + "step": 12826 + }, + { + "epoch": 0.25656, + "grad_norm": 2.75, + "grad_norm_var": 0.04098078409830729, + "learning_rate": 0.0001, + "loss": 4.0928, + "loss/crossentropy": 1.7518101930618286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17689460515975952, + "step": 12828 + }, + { + "epoch": 0.2566, + "grad_norm": 2.109375, + "grad_norm_var": 0.0395660400390625, + "learning_rate": 0.0001, + "loss": 3.9745, + "loss/crossentropy": 2.346145749092102, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22157911956310272, + "step": 12830 + }, + { + "epoch": 0.25664, + "grad_norm": 2.09375, + "grad_norm_var": 0.0358062744140625, + "learning_rate": 0.0001, + "loss": 4.1689, + "loss/crossentropy": 1.8355774283409119, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20288754999637604, + "step": 12832 + }, + { + "epoch": 0.25668, + "grad_norm": 1.984375, + "grad_norm_var": 0.03906962076822917, + "learning_rate": 0.0001, + "loss": 3.9947, + "loss/crossentropy": 1.949233889579773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19860410690307617, + "step": 12834 + }, + { + "epoch": 0.25672, + "grad_norm": 1.9609375, + "grad_norm_var": 0.04248046875, + "learning_rate": 0.0001, + "loss": 4.0903, + "loss/crossentropy": 2.0141645669937134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19531898200511932, + "step": 12836 + }, + { + "epoch": 0.25676, + "grad_norm": 2.015625, + "grad_norm_var": 0.04277725219726562, + "learning_rate": 0.0001, + "loss": 3.918, + "loss/crossentropy": 1.6573863625526428, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.163545623421669, + "step": 12838 + }, + { + "epoch": 0.2568, + "grad_norm": 2.15625, + "grad_norm_var": 0.04273656209309896, + "learning_rate": 0.0001, + "loss": 4.3252, + "loss/crossentropy": 2.2174651622772217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23083333671092987, + "step": 12840 + }, + { + "epoch": 0.25684, + "grad_norm": 1.953125, + "grad_norm_var": 0.045967356363932295, + "learning_rate": 0.0001, + "loss": 3.7593, + "loss/crossentropy": 1.8067168593406677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1963481903076172, + "step": 12842 + }, + { + "epoch": 0.25688, + "grad_norm": 2.078125, + "grad_norm_var": 0.010536448160807291, + "learning_rate": 0.0001, + "loss": 3.936, + "loss/crossentropy": 1.928274691104889, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21783485263586044, + "step": 12844 + }, + { + "epoch": 0.25692, + "grad_norm": 1.90625, + "grad_norm_var": 0.013498687744140625, + "learning_rate": 0.0001, + "loss": 4.0984, + "loss/crossentropy": 2.183669090270996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2040836587548256, + "step": 12846 + }, + { + "epoch": 0.25696, + "grad_norm": 2.015625, + "grad_norm_var": 0.012894439697265624, + "learning_rate": 0.0001, + "loss": 4.0624, + "loss/crossentropy": 1.7395422458648682, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19434035569429398, + "step": 12848 + }, + { + "epoch": 0.257, + "grad_norm": 1.8515625, + "grad_norm_var": 0.012772623697916667, + "learning_rate": 0.0001, + "loss": 3.9914, + "loss/crossentropy": 1.9124428629875183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19825156033039093, + "step": 12850 + }, + { + "epoch": 0.25704, + "grad_norm": 2.09375, + "grad_norm_var": 0.011606597900390625, + "learning_rate": 0.0001, + "loss": 3.9325, + "loss/crossentropy": 1.8064388036727905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19374487549066544, + "step": 12852 + }, + { + "epoch": 0.25708, + "grad_norm": 2.0, + "grad_norm_var": 0.0111724853515625, + "learning_rate": 0.0001, + "loss": 3.8722, + "loss/crossentropy": 2.0943931341171265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20997963845729828, + "step": 12854 + }, + { + "epoch": 0.25712, + "grad_norm": 1.96875, + "grad_norm_var": 0.00948486328125, + "learning_rate": 0.0001, + "loss": 3.9974, + "loss/crossentropy": 1.9252876043319702, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20608004182577133, + "step": 12856 + }, + { + "epoch": 0.25716, + "grad_norm": 2.015625, + "grad_norm_var": 0.007575480143229166, + "learning_rate": 0.0001, + "loss": 3.9536, + "loss/crossentropy": 1.813286304473877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2107577547430992, + "step": 12858 + }, + { + "epoch": 0.2572, + "grad_norm": 1.984375, + "grad_norm_var": 0.007405598958333333, + "learning_rate": 0.0001, + "loss": 4.1571, + "loss/crossentropy": 1.985919713973999, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18724270164966583, + "step": 12860 + }, + { + "epoch": 0.25724, + "grad_norm": 1.921875, + "grad_norm_var": 0.004654693603515625, + "learning_rate": 0.0001, + "loss": 3.9377, + "loss/crossentropy": 1.912338137626648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19602636992931366, + "step": 12862 + }, + { + "epoch": 0.25728, + "grad_norm": 1.9453125, + "grad_norm_var": 0.004743448893229167, + "learning_rate": 0.0001, + "loss": 4.0823, + "loss/crossentropy": 1.962310791015625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1975855752825737, + "step": 12864 + }, + { + "epoch": 0.25732, + "grad_norm": 2.140625, + "grad_norm_var": 0.0050046284993489586, + "learning_rate": 0.0001, + "loss": 4.285, + "loss/crossentropy": 2.063133656978607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20012972503900528, + "step": 12866 + }, + { + "epoch": 0.25736, + "grad_norm": 2.15625, + "grad_norm_var": 0.005751291910807292, + "learning_rate": 0.0001, + "loss": 4.1443, + "loss/crossentropy": 2.1755728721618652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21089357137680054, + "step": 12868 + }, + { + "epoch": 0.2574, + "grad_norm": 2.09375, + "grad_norm_var": 0.006148274739583333, + "learning_rate": 0.0001, + "loss": 4.3854, + "loss/crossentropy": 2.21665620803833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22525488585233688, + "step": 12870 + }, + { + "epoch": 0.25744, + "grad_norm": 2.140625, + "grad_norm_var": 0.006959788004557292, + "learning_rate": 0.0001, + "loss": 4.2465, + "loss/crossentropy": 2.2826790809631348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22989312559366226, + "step": 12872 + }, + { + "epoch": 0.25748, + "grad_norm": 1.875, + "grad_norm_var": 0.008432769775390625, + "learning_rate": 0.0001, + "loss": 4.0753, + "loss/crossentropy": 2.3187366724014282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22048770636320114, + "step": 12874 + }, + { + "epoch": 0.25752, + "grad_norm": 2.09375, + "grad_norm_var": 0.0093017578125, + "learning_rate": 0.0001, + "loss": 4.2469, + "loss/crossentropy": 2.234878957271576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19875259697437286, + "step": 12876 + }, + { + "epoch": 0.25756, + "grad_norm": 2.046875, + "grad_norm_var": 0.007897694905598959, + "learning_rate": 0.0001, + "loss": 4.0981, + "loss/crossentropy": 1.8919751644134521, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20682695508003235, + "step": 12878 + }, + { + "epoch": 0.2576, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008861287434895834, + "learning_rate": 0.0001, + "loss": 3.9564, + "loss/crossentropy": 2.135833740234375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2100691795349121, + "step": 12880 + }, + { + "epoch": 0.25764, + "grad_norm": 1.96875, + "grad_norm_var": 0.008421834309895833, + "learning_rate": 0.0001, + "loss": 4.163, + "loss/crossentropy": 2.043276846408844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20327290892601013, + "step": 12882 + }, + { + "epoch": 0.25768, + "grad_norm": 1.921875, + "grad_norm_var": 0.0077512105305989586, + "learning_rate": 0.0001, + "loss": 4.2439, + "loss/crossentropy": 1.996088445186615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20499806851148605, + "step": 12884 + }, + { + "epoch": 0.25772, + "grad_norm": 1.984375, + "grad_norm_var": 0.006648508707682291, + "learning_rate": 0.0001, + "loss": 3.9821, + "loss/crossentropy": 1.7864345908164978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19354674220085144, + "step": 12886 + }, + { + "epoch": 0.25776, + "grad_norm": 2.03125, + "grad_norm_var": 0.0051310221354166664, + "learning_rate": 0.0001, + "loss": 3.961, + "loss/crossentropy": 1.7890866994857788, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1806325614452362, + "step": 12888 + }, + { + "epoch": 0.2578, + "grad_norm": 2.09375, + "grad_norm_var": 0.004938761393229167, + "learning_rate": 0.0001, + "loss": 4.1836, + "loss/crossentropy": 1.9287649989128113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19555142521858215, + "step": 12890 + }, + { + "epoch": 0.25784, + "grad_norm": 2.140625, + "grad_norm_var": 0.005081939697265625, + "learning_rate": 0.0001, + "loss": 3.9151, + "loss/crossentropy": 1.930963397026062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19455663859844208, + "step": 12892 + }, + { + "epoch": 0.25788, + "grad_norm": 2.109375, + "grad_norm_var": 0.007551829020182292, + "learning_rate": 0.0001, + "loss": 4.3216, + "loss/crossentropy": 2.1105018854141235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22564146667718887, + "step": 12894 + }, + { + "epoch": 0.25792, + "grad_norm": 2.03125, + "grad_norm_var": 0.008365885416666666, + "learning_rate": 0.0001, + "loss": 4.1924, + "loss/crossentropy": 2.0535677671432495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2083517387509346, + "step": 12896 + }, + { + "epoch": 0.25796, + "grad_norm": 2.015625, + "grad_norm_var": 0.007966105143229167, + "learning_rate": 0.0001, + "loss": 4.2951, + "loss/crossentropy": 2.062114655971527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20135939121246338, + "step": 12898 + }, + { + "epoch": 0.258, + "grad_norm": 1.7734375, + "grad_norm_var": 0.012482706705729167, + "learning_rate": 0.0001, + "loss": 3.8187, + "loss/crossentropy": 1.968269407749176, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20238329470157623, + "step": 12900 + }, + { + "epoch": 0.25804, + "grad_norm": 2.03125, + "grad_norm_var": 0.0598541259765625, + "learning_rate": 0.0001, + "loss": 4.1899, + "loss/crossentropy": 2.0783804655075073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23015306890010834, + "step": 12902 + }, + { + "epoch": 0.25808, + "grad_norm": 2.140625, + "grad_norm_var": 0.05815836588541667, + "learning_rate": 0.0001, + "loss": 4.1245, + "loss/crossentropy": 1.8616145253181458, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19802623987197876, + "step": 12904 + }, + { + "epoch": 0.25812, + "grad_norm": 2.046875, + "grad_norm_var": 0.05834147135416667, + "learning_rate": 0.0001, + "loss": 4.2293, + "loss/crossentropy": 2.0005027651786804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21621856093406677, + "step": 12906 + }, + { + "epoch": 0.25816, + "grad_norm": 2.1875, + "grad_norm_var": 0.06181208292643229, + "learning_rate": 0.0001, + "loss": 4.0563, + "loss/crossentropy": 1.796087920665741, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18594232201576233, + "step": 12908 + }, + { + "epoch": 0.2582, + "grad_norm": 2.1875, + "grad_norm_var": 0.06258316040039062, + "learning_rate": 0.0001, + "loss": 4.3161, + "loss/crossentropy": 2.003196358680725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19296320527791977, + "step": 12910 + }, + { + "epoch": 0.25824, + "grad_norm": 1.9296875, + "grad_norm_var": 0.06313654581705729, + "learning_rate": 0.0001, + "loss": 4.0489, + "loss/crossentropy": 1.8252119421958923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1879737451672554, + "step": 12912 + }, + { + "epoch": 0.25828, + "grad_norm": 2.046875, + "grad_norm_var": 0.0629900614420573, + "learning_rate": 0.0001, + "loss": 4.2053, + "loss/crossentropy": 1.9801989793777466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20867998152971268, + "step": 12914 + }, + { + "epoch": 0.25832, + "grad_norm": 1.9921875, + "grad_norm_var": 0.05583902994791667, + "learning_rate": 0.0001, + "loss": 3.9754, + "loss/crossentropy": 2.04214608669281, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21134068816900253, + "step": 12916 + }, + { + "epoch": 0.25836, + "grad_norm": 2.171875, + "grad_norm_var": 0.009159342447916666, + "learning_rate": 0.0001, + "loss": 4.2202, + "loss/crossentropy": 2.0669824481010437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20436694473028183, + "step": 12918 + }, + { + "epoch": 0.2584, + "grad_norm": 2.09375, + "grad_norm_var": 0.008771769205729167, + "learning_rate": 0.0001, + "loss": 4.3616, + "loss/crossentropy": 1.951616883277893, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19572293758392334, + "step": 12920 + }, + { + "epoch": 0.25844, + "grad_norm": 1.953125, + "grad_norm_var": 0.010074869791666666, + "learning_rate": 0.0001, + "loss": 3.8476, + "loss/crossentropy": 1.7768954634666443, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1930573582649231, + "step": 12922 + }, + { + "epoch": 0.25848, + "grad_norm": 2.40625, + "grad_norm_var": 0.020458984375, + "learning_rate": 0.0001, + "loss": 3.8775, + "loss/crossentropy": 1.8527125716209412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18682826310396194, + "step": 12924 + }, + { + "epoch": 0.25852, + "grad_norm": 2.0, + "grad_norm_var": 0.018990071614583333, + "learning_rate": 0.0001, + "loss": 4.3814, + "loss/crossentropy": 2.3325828313827515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21535523980855942, + "step": 12926 + }, + { + "epoch": 0.25856, + "grad_norm": 1.8984375, + "grad_norm_var": 0.02063166300455729, + "learning_rate": 0.0001, + "loss": 3.7701, + "loss/crossentropy": 1.759222686290741, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17470692098140717, + "step": 12928 + }, + { + "epoch": 0.2586, + "grad_norm": 1.921875, + "grad_norm_var": 0.021247355143229167, + "learning_rate": 0.0001, + "loss": 3.8065, + "loss/crossentropy": 1.944493055343628, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20259656757116318, + "step": 12930 + }, + { + "epoch": 0.25864, + "grad_norm": 1.984375, + "grad_norm_var": 0.05238825480143229, + "learning_rate": 0.0001, + "loss": 4.133, + "loss/crossentropy": 2.086555302143097, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2180738002061844, + "step": 12932 + }, + { + "epoch": 0.25868, + "grad_norm": 1.9453125, + "grad_norm_var": 0.052779134114583334, + "learning_rate": 0.0001, + "loss": 4.1414, + "loss/crossentropy": 1.9723011255264282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22814874351024628, + "step": 12934 + }, + { + "epoch": 0.25872, + "grad_norm": 2.125, + "grad_norm_var": 0.05356038411458333, + "learning_rate": 0.0001, + "loss": 4.418, + "loss/crossentropy": 1.5646896958351135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18182098120450974, + "step": 12936 + }, + { + "epoch": 0.25876, + "grad_norm": 1.8828125, + "grad_norm_var": 0.054870351155598955, + "learning_rate": 0.0001, + "loss": 3.8356, + "loss/crossentropy": 1.8396940231323242, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19390598684549332, + "step": 12938 + }, + { + "epoch": 0.2588, + "grad_norm": 2.046875, + "grad_norm_var": 0.041112263997395836, + "learning_rate": 0.0001, + "loss": 4.5902, + "loss/crossentropy": 2.086738705635071, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.209588885307312, + "step": 12940 + }, + { + "epoch": 0.25884, + "grad_norm": 2.078125, + "grad_norm_var": 0.04136530558268229, + "learning_rate": 0.0001, + "loss": 4.2611, + "loss/crossentropy": 2.0965282917022705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21021046489477158, + "step": 12942 + }, + { + "epoch": 0.25888, + "grad_norm": 1.8125, + "grad_norm_var": 0.04311421712239583, + "learning_rate": 0.0001, + "loss": 4.0702, + "loss/crossentropy": 2.2878576517105103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2203715667128563, + "step": 12944 + }, + { + "epoch": 0.25892, + "grad_norm": 1.96875, + "grad_norm_var": 0.045446523030598956, + "learning_rate": 0.0001, + "loss": 4.419, + "loss/crossentropy": 2.1753373742103577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21585986018180847, + "step": 12946 + }, + { + "epoch": 0.25896, + "grad_norm": 2.0625, + "grad_norm_var": 0.036232248942057295, + "learning_rate": 0.0001, + "loss": 4.3735, + "loss/crossentropy": 1.963489055633545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20374132692813873, + "step": 12948 + }, + { + "epoch": 0.259, + "grad_norm": 2.265625, + "grad_norm_var": 0.037751261393229166, + "learning_rate": 0.0001, + "loss": 4.4073, + "loss/crossentropy": 2.3734259605407715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2375379502773285, + "step": 12950 + }, + { + "epoch": 0.25904, + "grad_norm": 1.9921875, + "grad_norm_var": 0.040710194905598955, + "learning_rate": 0.0001, + "loss": 4.184, + "loss/crossentropy": 2.266390085220337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22274205088615417, + "step": 12952 + }, + { + "epoch": 0.25908, + "grad_norm": 2.046875, + "grad_norm_var": 0.03729654947916667, + "learning_rate": 0.0001, + "loss": 3.9735, + "loss/crossentropy": 1.9015297293663025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20370282977819443, + "step": 12954 + }, + { + "epoch": 0.25912, + "grad_norm": 2.03125, + "grad_norm_var": 0.03886617024739583, + "learning_rate": 0.0001, + "loss": 4.3269, + "loss/crossentropy": 2.1192378997802734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26288172602653503, + "step": 12956 + }, + { + "epoch": 0.25916, + "grad_norm": 2.046875, + "grad_norm_var": 0.03792088826497396, + "learning_rate": 0.0001, + "loss": 4.0982, + "loss/crossentropy": 1.713826835155487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18557386100292206, + "step": 12958 + }, + { + "epoch": 0.2592, + "grad_norm": 1.9921875, + "grad_norm_var": 0.029545084635416666, + "learning_rate": 0.0001, + "loss": 4.2002, + "loss/crossentropy": 2.0365039706230164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21086978912353516, + "step": 12960 + }, + { + "epoch": 0.25924, + "grad_norm": 2.078125, + "grad_norm_var": 0.027852376302083332, + "learning_rate": 0.0001, + "loss": 4.2422, + "loss/crossentropy": 1.6235400438308716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18112218379974365, + "step": 12962 + }, + { + "epoch": 0.25928, + "grad_norm": 2.03125, + "grad_norm_var": 0.0116455078125, + "learning_rate": 0.0001, + "loss": 4.2996, + "loss/crossentropy": 2.042620003223419, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21463964879512787, + "step": 12964 + }, + { + "epoch": 0.25932, + "grad_norm": 2.046875, + "grad_norm_var": 0.009611002604166667, + "learning_rate": 0.0001, + "loss": 4.0222, + "loss/crossentropy": 1.6238983273506165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2171783745288849, + "step": 12966 + }, + { + "epoch": 0.25936, + "grad_norm": 2.4375, + "grad_norm_var": 0.014070383707682292, + "learning_rate": 0.0001, + "loss": 4.0098, + "loss/crossentropy": 2.3150339126586914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22038520872592926, + "step": 12968 + }, + { + "epoch": 0.2594, + "grad_norm": 2.09375, + "grad_norm_var": 0.013301595052083334, + "learning_rate": 0.0001, + "loss": 4.2202, + "loss/crossentropy": 2.3907227516174316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2269512265920639, + "step": 12970 + }, + { + "epoch": 0.25944, + "grad_norm": 1.921875, + "grad_norm_var": 0.014378865559895834, + "learning_rate": 0.0001, + "loss": 4.1032, + "loss/crossentropy": 2.1069165468215942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21007738262414932, + "step": 12972 + }, + { + "epoch": 0.25948, + "grad_norm": 1.8984375, + "grad_norm_var": 0.017942047119140624, + "learning_rate": 0.0001, + "loss": 3.9475, + "loss/crossentropy": 2.277982234954834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21795185655355453, + "step": 12974 + }, + { + "epoch": 0.25952, + "grad_norm": 1.9921875, + "grad_norm_var": 0.017775217692057293, + "learning_rate": 0.0001, + "loss": 4.0159, + "loss/crossentropy": 1.7227254509925842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19050486385822296, + "step": 12976 + }, + { + "epoch": 0.25956, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0184722900390625, + "learning_rate": 0.0001, + "loss": 4.0281, + "loss/crossentropy": 2.0112340450286865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21032612770795822, + "step": 12978 + }, + { + "epoch": 0.2596, + "grad_norm": 2.140625, + "grad_norm_var": 0.019530232747395834, + "learning_rate": 0.0001, + "loss": 4.3041, + "loss/crossentropy": 2.4241796731948853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22474834322929382, + "step": 12980 + }, + { + "epoch": 0.25964, + "grad_norm": 2.078125, + "grad_norm_var": 0.019710286458333334, + "learning_rate": 0.0001, + "loss": 4.3814, + "loss/crossentropy": 1.893187701702118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20330534130334854, + "step": 12982 + }, + { + "epoch": 0.25968, + "grad_norm": 1.9375, + "grad_norm_var": 0.008467356363932291, + "learning_rate": 0.0001, + "loss": 4.1527, + "loss/crossentropy": 2.045100212097168, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20636773109436035, + "step": 12984 + }, + { + "epoch": 0.25972, + "grad_norm": 1.8125, + "grad_norm_var": 0.010035959879557292, + "learning_rate": 0.0001, + "loss": 3.864, + "loss/crossentropy": 1.819455087184906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1775035709142685, + "step": 12986 + }, + { + "epoch": 0.25976, + "grad_norm": 1.9609375, + "grad_norm_var": 0.00985107421875, + "learning_rate": 0.0001, + "loss": 4.0553, + "loss/crossentropy": 2.1358631253242493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2027900367975235, + "step": 12988 + }, + { + "epoch": 0.2598, + "grad_norm": 2.1875, + "grad_norm_var": 0.010902659098307291, + "learning_rate": 0.0001, + "loss": 4.1552, + "loss/crossentropy": 1.7847145199775696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21203257888555527, + "step": 12990 + }, + { + "epoch": 0.25984, + "grad_norm": 2.109375, + "grad_norm_var": 0.011979166666666667, + "learning_rate": 0.0001, + "loss": 4.325, + "loss/crossentropy": 2.2848687171936035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22337154299020767, + "step": 12992 + }, + { + "epoch": 0.25988, + "grad_norm": 2.046875, + "grad_norm_var": 0.012889607747395834, + "learning_rate": 0.0001, + "loss": 3.975, + "loss/crossentropy": 1.9629716277122498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20660604536533356, + "step": 12994 + }, + { + "epoch": 0.25992, + "grad_norm": 2.0, + "grad_norm_var": 0.010461171468098959, + "learning_rate": 0.0001, + "loss": 4.1521, + "loss/crossentropy": 2.0321004390716553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21755626797676086, + "step": 12996 + }, + { + "epoch": 0.25996, + "grad_norm": 1.890625, + "grad_norm_var": 0.0105712890625, + "learning_rate": 0.0001, + "loss": 4.1208, + "loss/crossentropy": 1.950038492679596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1987697035074234, + "step": 12998 + }, + { + "epoch": 0.26, + "grad_norm": 2.078125, + "grad_norm_var": 0.010544586181640624, + "learning_rate": 0.0001, + "loss": 4.3383, + "loss/crossentropy": 2.2752585411071777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20959648489952087, + "step": 13000 + }, + { + "epoch": 0.26004, + "grad_norm": 2.125, + "grad_norm_var": 0.008138020833333334, + "learning_rate": 0.0001, + "loss": 4.3132, + "loss/crossentropy": 2.162104368209839, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21731911599636078, + "step": 13002 + }, + { + "epoch": 0.26008, + "grad_norm": 1.890625, + "grad_norm_var": 0.009102121988932291, + "learning_rate": 0.0001, + "loss": 4.0138, + "loss/crossentropy": 2.3739081025123596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22912397980690002, + "step": 13004 + }, + { + "epoch": 0.26012, + "grad_norm": 3.0625, + "grad_norm_var": 0.07608413696289062, + "learning_rate": 0.0001, + "loss": 4.3052, + "loss/crossentropy": 2.161367416381836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22413842380046844, + "step": 13006 + }, + { + "epoch": 0.26016, + "grad_norm": 2.0625, + "grad_norm_var": 0.07587661743164062, + "learning_rate": 0.0001, + "loss": 4.1435, + "loss/crossentropy": 1.9589285850524902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21598373353481293, + "step": 13008 + }, + { + "epoch": 0.2602, + "grad_norm": 2.09375, + "grad_norm_var": 0.0731842041015625, + "learning_rate": 0.0001, + "loss": 4.3268, + "loss/crossentropy": 2.3472602367401123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22626091539859772, + "step": 13010 + }, + { + "epoch": 0.26024, + "grad_norm": 1.984375, + "grad_norm_var": 0.07296727498372396, + "learning_rate": 0.0001, + "loss": 3.8455, + "loss/crossentropy": 1.9968576431274414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20777586847543716, + "step": 13012 + }, + { + "epoch": 0.26028, + "grad_norm": 2.140625, + "grad_norm_var": 0.0729400634765625, + "learning_rate": 0.0001, + "loss": 3.8219, + "loss/crossentropy": 1.7597955465316772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19565816968679428, + "step": 13014 + }, + { + "epoch": 0.26032, + "grad_norm": 2.09375, + "grad_norm_var": 0.0735992431640625, + "learning_rate": 0.0001, + "loss": 4.1446, + "loss/crossentropy": 1.8115187287330627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19675451517105103, + "step": 13016 + }, + { + "epoch": 0.26036, + "grad_norm": 1.9453125, + "grad_norm_var": 0.07507909138997396, + "learning_rate": 0.0001, + "loss": 4.2018, + "loss/crossentropy": 1.9728147983551025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19999082386493683, + "step": 13018 + }, + { + "epoch": 0.2604, + "grad_norm": 1.8828125, + "grad_norm_var": 0.07516988118489583, + "learning_rate": 0.0001, + "loss": 4.1434, + "loss/crossentropy": 2.155986785888672, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20160435140132904, + "step": 13020 + }, + { + "epoch": 0.26044, + "grad_norm": 2.015625, + "grad_norm_var": 0.0081298828125, + "learning_rate": 0.0001, + "loss": 4.0647, + "loss/crossentropy": 2.119171440601349, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2231176346540451, + "step": 13022 + }, + { + "epoch": 0.26048, + "grad_norm": 1.8046875, + "grad_norm_var": 0.010884348551432292, + "learning_rate": 0.0001, + "loss": 3.8185, + "loss/crossentropy": 2.0395994186401367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21075501292943954, + "step": 13024 + }, + { + "epoch": 0.26052, + "grad_norm": 2.28125, + "grad_norm_var": 0.014808909098307291, + "learning_rate": 0.0001, + "loss": 4.4988, + "loss/crossentropy": 2.42897891998291, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24882060289382935, + "step": 13026 + }, + { + "epoch": 0.26056, + "grad_norm": 2.25, + "grad_norm_var": 0.018202463785807293, + "learning_rate": 0.0001, + "loss": 4.4995, + "loss/crossentropy": 2.367077350616455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21996428817510605, + "step": 13028 + }, + { + "epoch": 0.2606, + "grad_norm": 1.875, + "grad_norm_var": 0.017207590738932292, + "learning_rate": 0.0001, + "loss": 4.0349, + "loss/crossentropy": 2.071397542953491, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20548687875270844, + "step": 13030 + }, + { + "epoch": 0.26064, + "grad_norm": 2.078125, + "grad_norm_var": 0.016943359375, + "learning_rate": 0.0001, + "loss": 4.0425, + "loss/crossentropy": 1.764617681503296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19541263580322266, + "step": 13032 + }, + { + "epoch": 0.26068, + "grad_norm": 2.015625, + "grad_norm_var": 0.016916656494140626, + "learning_rate": 0.0001, + "loss": 4.2657, + "loss/crossentropy": 2.2966678142547607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2187207117676735, + "step": 13034 + }, + { + "epoch": 0.26072, + "grad_norm": 2.09375, + "grad_norm_var": 0.0159820556640625, + "learning_rate": 0.0001, + "loss": 4.2959, + "loss/crossentropy": 1.923275649547577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20180264115333557, + "step": 13036 + }, + { + "epoch": 0.26076, + "grad_norm": 2.171875, + "grad_norm_var": 0.017207845052083334, + "learning_rate": 0.0001, + "loss": 4.3138, + "loss/crossentropy": 2.246683716773987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22133737802505493, + "step": 13038 + }, + { + "epoch": 0.2608, + "grad_norm": 2.046875, + "grad_norm_var": 0.011755116780598958, + "learning_rate": 0.0001, + "loss": 4.3685, + "loss/crossentropy": 2.271313190460205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23316439241170883, + "step": 13040 + }, + { + "epoch": 0.26084, + "grad_norm": 2.0625, + "grad_norm_var": 0.008652496337890624, + "learning_rate": 0.0001, + "loss": 4.4451, + "loss/crossentropy": 2.2649654150009155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.211005300283432, + "step": 13042 + }, + { + "epoch": 0.26088, + "grad_norm": 1.8828125, + "grad_norm_var": 0.008006795247395834, + "learning_rate": 0.0001, + "loss": 3.9505, + "loss/crossentropy": 1.7872197031974792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17795034497976303, + "step": 13044 + }, + { + "epoch": 0.26092, + "grad_norm": 2.046875, + "grad_norm_var": 0.007875315348307292, + "learning_rate": 0.0001, + "loss": 3.8693, + "loss/crossentropy": 1.793078064918518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18739672750234604, + "step": 13046 + }, + { + "epoch": 0.26096, + "grad_norm": 2.140625, + "grad_norm_var": 0.0111572265625, + "learning_rate": 0.0001, + "loss": 4.4178, + "loss/crossentropy": 2.0676932334899902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2156887650489807, + "step": 13048 + }, + { + "epoch": 0.261, + "grad_norm": 2.046875, + "grad_norm_var": 0.013484700520833334, + "learning_rate": 0.0001, + "loss": 4.2851, + "loss/crossentropy": 2.2497498989105225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21906063705682755, + "step": 13050 + }, + { + "epoch": 0.26104, + "grad_norm": 2.015625, + "grad_norm_var": 0.014994303385416666, + "learning_rate": 0.0001, + "loss": 4.0203, + "loss/crossentropy": 1.901672899723053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2070387825369835, + "step": 13052 + }, + { + "epoch": 0.26108, + "grad_norm": 2.109375, + "grad_norm_var": 0.013887532552083333, + "learning_rate": 0.0001, + "loss": 4.3772, + "loss/crossentropy": 2.2334693670272827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21539244055747986, + "step": 13054 + }, + { + "epoch": 0.26112, + "grad_norm": 2.0625, + "grad_norm_var": 0.014288075764973958, + "learning_rate": 0.0001, + "loss": 4.2916, + "loss/crossentropy": 2.0297399759292603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18854249268770218, + "step": 13056 + }, + { + "epoch": 0.26116, + "grad_norm": 2.09375, + "grad_norm_var": 0.014452107747395833, + "learning_rate": 0.0001, + "loss": 4.1115, + "loss/crossentropy": 2.0964863896369934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21821416169404984, + "step": 13058 + }, + { + "epoch": 0.2612, + "grad_norm": 2.09375, + "grad_norm_var": 0.013702138264973959, + "learning_rate": 0.0001, + "loss": 4.4918, + "loss/crossentropy": 2.1526511907577515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22606930136680603, + "step": 13060 + }, + { + "epoch": 0.26124, + "grad_norm": 2.109375, + "grad_norm_var": 0.0101470947265625, + "learning_rate": 0.0001, + "loss": 4.2079, + "loss/crossentropy": 2.2275509238243103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21959830820560455, + "step": 13062 + }, + { + "epoch": 0.26128, + "grad_norm": 2.046875, + "grad_norm_var": 0.0086181640625, + "learning_rate": 0.0001, + "loss": 4.4246, + "loss/crossentropy": 2.542263627052307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22889885306358337, + "step": 13064 + }, + { + "epoch": 0.26132, + "grad_norm": 1.875, + "grad_norm_var": 0.008585611979166666, + "learning_rate": 0.0001, + "loss": 4.0292, + "loss/crossentropy": 1.7915868163108826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20242593437433243, + "step": 13066 + }, + { + "epoch": 0.26136, + "grad_norm": 2.046875, + "grad_norm_var": 0.006453450520833333, + "learning_rate": 0.0001, + "loss": 4.1415, + "loss/crossentropy": 2.183286130428314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.227259561419487, + "step": 13068 + }, + { + "epoch": 0.2614, + "grad_norm": 2.046875, + "grad_norm_var": 0.0063720703125, + "learning_rate": 0.0001, + "loss": 4.3211, + "loss/crossentropy": 2.2707515954971313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2300432324409485, + "step": 13070 + }, + { + "epoch": 0.26144, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0066487630208333336, + "learning_rate": 0.0001, + "loss": 4.0237, + "loss/crossentropy": 1.822485864162445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19380664825439453, + "step": 13072 + }, + { + "epoch": 0.26148, + "grad_norm": 2.203125, + "grad_norm_var": 0.008349355061848958, + "learning_rate": 0.0001, + "loss": 4.0915, + "loss/crossentropy": 2.089439034461975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22531338036060333, + "step": 13074 + }, + { + "epoch": 0.26152, + "grad_norm": 2.21875, + "grad_norm_var": 0.008318837483723958, + "learning_rate": 0.0001, + "loss": 4.3872, + "loss/crossentropy": 2.0921221375465393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22256288677453995, + "step": 13076 + }, + { + "epoch": 0.26156, + "grad_norm": 2.1875, + "grad_norm_var": 0.009065500895182292, + "learning_rate": 0.0001, + "loss": 4.4644, + "loss/crossentropy": 1.8978914022445679, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2677152305841446, + "step": 13078 + }, + { + "epoch": 0.2616, + "grad_norm": 2.296875, + "grad_norm_var": 0.011643218994140624, + "learning_rate": 0.0001, + "loss": 4.405, + "loss/crossentropy": 2.2540348768234253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24195441603660583, + "step": 13080 + }, + { + "epoch": 0.26164, + "grad_norm": 1.9140625, + "grad_norm_var": 0.01092529296875, + "learning_rate": 0.0001, + "loss": 4.0965, + "loss/crossentropy": 1.9546288847923279, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19464369863271713, + "step": 13082 + }, + { + "epoch": 0.26168, + "grad_norm": 2.0625, + "grad_norm_var": 0.012457021077473958, + "learning_rate": 0.0001, + "loss": 4.0103, + "loss/crossentropy": 2.1527568101882935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22238580137491226, + "step": 13084 + }, + { + "epoch": 0.26172, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0128814697265625, + "learning_rate": 0.0001, + "loss": 4.3957, + "loss/crossentropy": 2.111591637134552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21360620856285095, + "step": 13086 + }, + { + "epoch": 0.26176, + "grad_norm": 1.953125, + "grad_norm_var": 0.01339111328125, + "learning_rate": 0.0001, + "loss": 3.99, + "loss/crossentropy": 2.006071925163269, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19746285676956177, + "step": 13088 + }, + { + "epoch": 0.2618, + "grad_norm": 2.203125, + "grad_norm_var": 0.0128814697265625, + "learning_rate": 0.0001, + "loss": 4.2073, + "loss/crossentropy": 2.276778221130371, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21222709119319916, + "step": 13090 + }, + { + "epoch": 0.26184, + "grad_norm": 2.015625, + "grad_norm_var": 0.013346354166666666, + "learning_rate": 0.0001, + "loss": 4.0679, + "loss/crossentropy": 2.102404534816742, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2049730271100998, + "step": 13092 + }, + { + "epoch": 0.26188, + "grad_norm": 2.015625, + "grad_norm_var": 0.013719685872395833, + "learning_rate": 0.0001, + "loss": 4.439, + "loss/crossentropy": 2.204727053642273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21260947734117508, + "step": 13094 + }, + { + "epoch": 0.26192, + "grad_norm": 1.9609375, + "grad_norm_var": 0.010184478759765626, + "learning_rate": 0.0001, + "loss": 4.2801, + "loss/crossentropy": 2.3762032985687256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22091203182935715, + "step": 13096 + }, + { + "epoch": 0.26196, + "grad_norm": 1.90625, + "grad_norm_var": 0.010286458333333333, + "learning_rate": 0.0001, + "loss": 4.3472, + "loss/crossentropy": 2.0710391998291016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20782632380723953, + "step": 13098 + }, + { + "epoch": 0.262, + "grad_norm": 2.09375, + "grad_norm_var": 0.009319814046223958, + "learning_rate": 0.0001, + "loss": 4.3257, + "loss/crossentropy": 2.193024158477783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21109049022197723, + "step": 13100 + }, + { + "epoch": 0.26204, + "grad_norm": 2.109375, + "grad_norm_var": 0.009308878580729167, + "learning_rate": 0.0001, + "loss": 4.1863, + "loss/crossentropy": 1.9668392539024353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19249404221773148, + "step": 13102 + }, + { + "epoch": 0.26208, + "grad_norm": 2.015625, + "grad_norm_var": 0.00858154296875, + "learning_rate": 0.0001, + "loss": 4.3122, + "loss/crossentropy": 2.2675795555114746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26278799772262573, + "step": 13104 + }, + { + "epoch": 0.26212, + "grad_norm": 2.109375, + "grad_norm_var": 0.007857004801432291, + "learning_rate": 0.0001, + "loss": 3.9962, + "loss/crossentropy": 2.0119330883026123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20821017771959305, + "step": 13106 + }, + { + "epoch": 0.26216, + "grad_norm": 1.984375, + "grad_norm_var": 0.006237538655598959, + "learning_rate": 0.0001, + "loss": 4.329, + "loss/crossentropy": 2.1259734630584717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22454438358545303, + "step": 13108 + }, + { + "epoch": 0.2622, + "grad_norm": 2.03125, + "grad_norm_var": 0.0043413798014322914, + "learning_rate": 0.0001, + "loss": 4.2558, + "loss/crossentropy": 1.8091979622840881, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19353152066469193, + "step": 13110 + }, + { + "epoch": 0.26224, + "grad_norm": 2.0625, + "grad_norm_var": 0.0038655598958333335, + "learning_rate": 0.0001, + "loss": 4.1847, + "loss/crossentropy": 1.9463382363319397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20932665467262268, + "step": 13112 + }, + { + "epoch": 0.26228, + "grad_norm": 2.125, + "grad_norm_var": 0.004366048177083333, + "learning_rate": 0.0001, + "loss": 4.3975, + "loss/crossentropy": 2.475601077079773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23451963067054749, + "step": 13114 + }, + { + "epoch": 0.26232, + "grad_norm": 1.9921875, + "grad_norm_var": 0.004412587483723958, + "learning_rate": 0.0001, + "loss": 4.4397, + "loss/crossentropy": 2.0833849906921387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22380392253398895, + "step": 13116 + }, + { + "epoch": 0.26236, + "grad_norm": 1.8984375, + "grad_norm_var": 0.005301920572916666, + "learning_rate": 0.0001, + "loss": 4.0408, + "loss/crossentropy": 1.9876453876495361, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20823375135660172, + "step": 13118 + }, + { + "epoch": 0.2624, + "grad_norm": 2.15625, + "grad_norm_var": 0.006750233968098958, + "learning_rate": 0.0001, + "loss": 4.5321, + "loss/crossentropy": 2.184763789176941, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24357233941555023, + "step": 13120 + }, + { + "epoch": 0.26244, + "grad_norm": 2.015625, + "grad_norm_var": 0.0061187744140625, + "learning_rate": 0.0001, + "loss": 4.3552, + "loss/crossentropy": 2.217886805534363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21944888681173325, + "step": 13122 + }, + { + "epoch": 0.26248, + "grad_norm": 2.0625, + "grad_norm_var": 0.0052886962890625, + "learning_rate": 0.0001, + "loss": 4.2443, + "loss/crossentropy": 2.3299126625061035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2158309668302536, + "step": 13124 + }, + { + "epoch": 0.26252, + "grad_norm": 2.09375, + "grad_norm_var": 0.0058349609375, + "learning_rate": 0.0001, + "loss": 4.1303, + "loss/crossentropy": 2.0423209071159363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20835570991039276, + "step": 13126 + }, + { + "epoch": 0.26256, + "grad_norm": 1.796875, + "grad_norm_var": 0.012325032552083334, + "learning_rate": 0.0001, + "loss": 4.31, + "loss/crossentropy": 2.227811813354492, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2043633684515953, + "step": 13128 + }, + { + "epoch": 0.2626, + "grad_norm": 2.078125, + "grad_norm_var": 0.011153157552083333, + "learning_rate": 0.0001, + "loss": 4.1312, + "loss/crossentropy": 2.105396568775177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2212589681148529, + "step": 13130 + }, + { + "epoch": 0.26264, + "grad_norm": 2.234375, + "grad_norm_var": 0.012737782796223958, + "learning_rate": 0.0001, + "loss": 4.2244, + "loss/crossentropy": 2.1711814999580383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21528291702270508, + "step": 13132 + }, + { + "epoch": 0.26268, + "grad_norm": 1.9375, + "grad_norm_var": 0.011937459309895834, + "learning_rate": 0.0001, + "loss": 4.1288, + "loss/crossentropy": 2.0086284279823303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1974012851715088, + "step": 13134 + }, + { + "epoch": 0.26272, + "grad_norm": 2.0625, + "grad_norm_var": 0.011205037434895834, + "learning_rate": 0.0001, + "loss": 3.895, + "loss/crossentropy": 1.7025322914123535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1878160759806633, + "step": 13136 + }, + { + "epoch": 0.26276, + "grad_norm": 2.21875, + "grad_norm_var": 0.015950520833333332, + "learning_rate": 0.0001, + "loss": 4.0134, + "loss/crossentropy": 1.821410596370697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.197972871363163, + "step": 13138 + }, + { + "epoch": 0.2628, + "grad_norm": 1.8671875, + "grad_norm_var": 0.018993886311848958, + "learning_rate": 0.0001, + "loss": 4.0875, + "loss/crossentropy": 2.0485053658485413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20401400327682495, + "step": 13140 + }, + { + "epoch": 0.26284, + "grad_norm": 2.03125, + "grad_norm_var": 0.018808746337890626, + "learning_rate": 0.0001, + "loss": 4.4344, + "loss/crossentropy": 2.350088357925415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2638252303004265, + "step": 13142 + }, + { + "epoch": 0.26288, + "grad_norm": 2.078125, + "grad_norm_var": 0.012308502197265625, + "learning_rate": 0.0001, + "loss": 4.5756, + "loss/crossentropy": 2.3423168659210205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22735413163900375, + "step": 13144 + }, + { + "epoch": 0.26292, + "grad_norm": 2.0, + "grad_norm_var": 0.012033843994140625, + "learning_rate": 0.0001, + "loss": 4.3848, + "loss/crossentropy": 2.134036064147949, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20407382398843765, + "step": 13146 + }, + { + "epoch": 0.26296, + "grad_norm": 1.9921875, + "grad_norm_var": 0.009748331705729167, + "learning_rate": 0.0001, + "loss": 4.105, + "loss/crossentropy": 2.0744638442993164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2107524871826172, + "step": 13148 + }, + { + "epoch": 0.263, + "grad_norm": 1.9375, + "grad_norm_var": 0.012604777018229167, + "learning_rate": 0.0001, + "loss": 4.0507, + "loss/crossentropy": 2.1522003412246704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2046833261847496, + "step": 13150 + }, + { + "epoch": 0.26304, + "grad_norm": 2.140625, + "grad_norm_var": 0.013224283854166666, + "learning_rate": 0.0001, + "loss": 4.2754, + "loss/crossentropy": 2.328645348548889, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24340055882930756, + "step": 13152 + }, + { + "epoch": 0.26308, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009161122639973958, + "learning_rate": 0.0001, + "loss": 4.0502, + "loss/crossentropy": 2.0878008008003235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2110893502831459, + "step": 13154 + }, + { + "epoch": 0.26312, + "grad_norm": 1.984375, + "grad_norm_var": 0.0074155171712239586, + "learning_rate": 0.0001, + "loss": 4.3065, + "loss/crossentropy": 1.9381126761436462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18893951922655106, + "step": 13156 + }, + { + "epoch": 0.26316, + "grad_norm": 2.21875, + "grad_norm_var": 0.009364573160807292, + "learning_rate": 0.0001, + "loss": 4.0254, + "loss/crossentropy": 2.153562545776367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1973574310541153, + "step": 13158 + }, + { + "epoch": 0.2632, + "grad_norm": 2.203125, + "grad_norm_var": 0.011370595296223958, + "learning_rate": 0.0001, + "loss": 3.998, + "loss/crossentropy": 2.0583202242851257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19830025732517242, + "step": 13160 + }, + { + "epoch": 0.26324, + "grad_norm": 2.171875, + "grad_norm_var": 0.012117258707682292, + "learning_rate": 0.0001, + "loss": 4.4843, + "loss/crossentropy": 2.1951998472213745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21916336566209793, + "step": 13162 + }, + { + "epoch": 0.26328, + "grad_norm": 2.078125, + "grad_norm_var": 0.011888631184895833, + "learning_rate": 0.0001, + "loss": 4.4944, + "loss/crossentropy": 2.1612678170204163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21040990203619003, + "step": 13164 + }, + { + "epoch": 0.26332, + "grad_norm": 2.078125, + "grad_norm_var": 0.009415690104166667, + "learning_rate": 0.0001, + "loss": 4.2012, + "loss/crossentropy": 1.9372112154960632, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24868668615818024, + "step": 13166 + }, + { + "epoch": 0.26336, + "grad_norm": 2.015625, + "grad_norm_var": 0.009968058268229166, + "learning_rate": 0.0001, + "loss": 4.3605, + "loss/crossentropy": 2.0220513939857483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22029200196266174, + "step": 13168 + }, + { + "epoch": 0.2634, + "grad_norm": 1.921875, + "grad_norm_var": 0.010355631510416666, + "learning_rate": 0.0001, + "loss": 3.7251, + "loss/crossentropy": 1.6749334335327148, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18502884358167648, + "step": 13170 + }, + { + "epoch": 0.26344, + "grad_norm": 1.9375, + "grad_norm_var": 0.010096995035807292, + "learning_rate": 0.0001, + "loss": 4.1394, + "loss/crossentropy": 2.1463050842285156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21098337322473526, + "step": 13172 + }, + { + "epoch": 0.26348, + "grad_norm": 2.125, + "grad_norm_var": 0.009089914957682292, + "learning_rate": 0.0001, + "loss": 4.3261, + "loss/crossentropy": 2.20473051071167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23006527870893478, + "step": 13174 + }, + { + "epoch": 0.26352, + "grad_norm": 2.3125, + "grad_norm_var": 0.012068684895833333, + "learning_rate": 0.0001, + "loss": 4.3289, + "loss/crossentropy": 2.097456157207489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2077179104089737, + "step": 13176 + }, + { + "epoch": 0.26356, + "grad_norm": 2.015625, + "grad_norm_var": 0.0124176025390625, + "learning_rate": 0.0001, + "loss": 4.402, + "loss/crossentropy": 2.136400580406189, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2152952179312706, + "step": 13178 + }, + { + "epoch": 0.2636, + "grad_norm": 1.921875, + "grad_norm_var": 0.013451131184895833, + "learning_rate": 0.0001, + "loss": 4.23, + "loss/crossentropy": 2.0791231393814087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19688992202281952, + "step": 13180 + }, + { + "epoch": 0.26364, + "grad_norm": 2.03125, + "grad_norm_var": 0.0126617431640625, + "learning_rate": 0.0001, + "loss": 4.1233, + "loss/crossentropy": 2.135149598121643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22163429111242294, + "step": 13182 + }, + { + "epoch": 0.26368, + "grad_norm": 2.171875, + "grad_norm_var": 0.013936360677083334, + "learning_rate": 0.0001, + "loss": 4.4215, + "loss/crossentropy": 2.3207671642303467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24321655184030533, + "step": 13184 + }, + { + "epoch": 0.26372, + "grad_norm": 2.03125, + "grad_norm_var": 0.013069407145182291, + "learning_rate": 0.0001, + "loss": 4.0995, + "loss/crossentropy": 2.0136974453926086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19274070858955383, + "step": 13186 + }, + { + "epoch": 0.26376, + "grad_norm": 2.203125, + "grad_norm_var": 0.013610585530598959, + "learning_rate": 0.0001, + "loss": 4.5094, + "loss/crossentropy": 2.3542726039886475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24547156691551208, + "step": 13188 + }, + { + "epoch": 0.2638, + "grad_norm": 1.8984375, + "grad_norm_var": 0.016007486979166666, + "learning_rate": 0.0001, + "loss": 4.068, + "loss/crossentropy": 1.905708134174347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18261709809303284, + "step": 13190 + }, + { + "epoch": 0.26384, + "grad_norm": 1.921875, + "grad_norm_var": 0.012147776285807292, + "learning_rate": 0.0001, + "loss": 4.1426, + "loss/crossentropy": 2.137321710586548, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2052912563085556, + "step": 13192 + }, + { + "epoch": 0.26388, + "grad_norm": 2.0625, + "grad_norm_var": 0.010827382405598959, + "learning_rate": 0.0001, + "loss": 4.2252, + "loss/crossentropy": 2.083053410053253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2038491740822792, + "step": 13194 + }, + { + "epoch": 0.26392, + "grad_norm": 2.0, + "grad_norm_var": 0.011346181233723959, + "learning_rate": 0.0001, + "loss": 3.8349, + "loss/crossentropy": 1.8006438612937927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1910744607448578, + "step": 13196 + }, + { + "epoch": 0.26396, + "grad_norm": 2.03125, + "grad_norm_var": 0.012902577718098959, + "learning_rate": 0.0001, + "loss": 4.4104, + "loss/crossentropy": 2.323628544807434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2280128449201584, + "step": 13198 + }, + { + "epoch": 0.264, + "grad_norm": 1.921875, + "grad_norm_var": 0.010137685139973958, + "learning_rate": 0.0001, + "loss": 4.1231, + "loss/crossentropy": 1.9763594269752502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19776062667369843, + "step": 13200 + }, + { + "epoch": 0.26404, + "grad_norm": 2.109375, + "grad_norm_var": 0.010601552327473958, + "learning_rate": 0.0001, + "loss": 4.0175, + "loss/crossentropy": 1.8776894807815552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19732315093278885, + "step": 13202 + }, + { + "epoch": 0.26408, + "grad_norm": 1.96875, + "grad_norm_var": 0.007478841145833333, + "learning_rate": 0.0001, + "loss": 4.0713, + "loss/crossentropy": 1.826314091682434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19153253734111786, + "step": 13204 + }, + { + "epoch": 0.26412, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007673136393229167, + "learning_rate": 0.0001, + "loss": 4.1503, + "loss/crossentropy": 2.435014486312866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22149913012981415, + "step": 13206 + }, + { + "epoch": 0.26416, + "grad_norm": 2.4375, + "grad_norm_var": 0.01866633097330729, + "learning_rate": 0.0001, + "loss": 4.0829, + "loss/crossentropy": 2.055279493331909, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21625792980194092, + "step": 13208 + }, + { + "epoch": 0.2642, + "grad_norm": 1.984375, + "grad_norm_var": 0.018641916910807292, + "learning_rate": 0.0001, + "loss": 4.216, + "loss/crossentropy": 2.4019049406051636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22626115381717682, + "step": 13210 + }, + { + "epoch": 0.26424, + "grad_norm": 1.9921875, + "grad_norm_var": 0.017508697509765626, + "learning_rate": 0.0001, + "loss": 3.9822, + "loss/crossentropy": 2.336918354034424, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22773776203393936, + "step": 13212 + }, + { + "epoch": 0.26428, + "grad_norm": 2.03125, + "grad_norm_var": 0.01608454386393229, + "learning_rate": 0.0001, + "loss": 4.3929, + "loss/crossentropy": 2.0398528575897217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20827088505029678, + "step": 13214 + }, + { + "epoch": 0.26432, + "grad_norm": 2.03125, + "grad_norm_var": 0.015419260660807291, + "learning_rate": 0.0001, + "loss": 4.026, + "loss/crossentropy": 2.166857123374939, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2136707603931427, + "step": 13216 + }, + { + "epoch": 0.26436, + "grad_norm": 1.9921875, + "grad_norm_var": 0.015095011393229166, + "learning_rate": 0.0001, + "loss": 4.2282, + "loss/crossentropy": 1.8454258441925049, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20678912848234177, + "step": 13218 + }, + { + "epoch": 0.2644, + "grad_norm": 2.140625, + "grad_norm_var": 0.014642079671223959, + "learning_rate": 0.0001, + "loss": 4.2966, + "loss/crossentropy": 2.249597668647766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22785719484090805, + "step": 13220 + }, + { + "epoch": 0.26444, + "grad_norm": 1.90625, + "grad_norm_var": 0.014134724934895834, + "learning_rate": 0.0001, + "loss": 3.875, + "loss/crossentropy": 1.7729946374893188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2014644593000412, + "step": 13222 + }, + { + "epoch": 0.26448, + "grad_norm": 1.9296875, + "grad_norm_var": 0.004976145426432292, + "learning_rate": 0.0001, + "loss": 3.9144, + "loss/crossentropy": 1.7968116998672485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1889382004737854, + "step": 13224 + }, + { + "epoch": 0.26452, + "grad_norm": 2.015625, + "grad_norm_var": 0.006510162353515625, + "learning_rate": 0.0001, + "loss": 4.2015, + "loss/crossentropy": 2.0765512585639954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20551903545856476, + "step": 13226 + }, + { + "epoch": 0.26456, + "grad_norm": 2.203125, + "grad_norm_var": 0.007933553059895833, + "learning_rate": 0.0001, + "loss": 4.2603, + "loss/crossentropy": 2.2007554173469543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2250915989279747, + "step": 13228 + }, + { + "epoch": 0.2646, + "grad_norm": 1.9765625, + "grad_norm_var": 0.008184560139973958, + "learning_rate": 0.0001, + "loss": 4.2014, + "loss/crossentropy": 2.319425046443939, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22913866490125656, + "step": 13230 + }, + { + "epoch": 0.26464, + "grad_norm": 1.96875, + "grad_norm_var": 0.008365631103515625, + "learning_rate": 0.0001, + "loss": 4.1395, + "loss/crossentropy": 1.9845139980316162, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20769847929477692, + "step": 13232 + }, + { + "epoch": 0.26468, + "grad_norm": 1.96875, + "grad_norm_var": 0.008674112955729167, + "learning_rate": 0.0001, + "loss": 4.0985, + "loss/crossentropy": 1.94157075881958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2314443141222, + "step": 13234 + }, + { + "epoch": 0.26472, + "grad_norm": 2.0, + "grad_norm_var": 0.007470703125, + "learning_rate": 0.0001, + "loss": 4.3428, + "loss/crossentropy": 2.0699294209480286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22432812303304672, + "step": 13236 + }, + { + "epoch": 0.26476, + "grad_norm": 1.9453125, + "grad_norm_var": 0.006841786702473958, + "learning_rate": 0.0001, + "loss": 3.983, + "loss/crossentropy": 1.7656533122062683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20837824791669846, + "step": 13238 + }, + { + "epoch": 0.2648, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0060618082682291664, + "learning_rate": 0.0001, + "loss": 4.1799, + "loss/crossentropy": 1.9473227858543396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20151428878307343, + "step": 13240 + }, + { + "epoch": 0.26484, + "grad_norm": 1.921875, + "grad_norm_var": 0.005387369791666667, + "learning_rate": 0.0001, + "loss": 4.284, + "loss/crossentropy": 2.060440719127655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2041771411895752, + "step": 13242 + }, + { + "epoch": 0.26488, + "grad_norm": 1.8984375, + "grad_norm_var": 0.003922526041666667, + "learning_rate": 0.0001, + "loss": 3.7503, + "loss/crossentropy": 2.0440456867218018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2041776031255722, + "step": 13244 + }, + { + "epoch": 0.26492, + "grad_norm": 2.109375, + "grad_norm_var": 0.004622395833333333, + "learning_rate": 0.0001, + "loss": 4.1921, + "loss/crossentropy": 1.9192551374435425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18886462599039078, + "step": 13246 + }, + { + "epoch": 0.26496, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0072100321451822914, + "learning_rate": 0.0001, + "loss": 4.1673, + "loss/crossentropy": 1.7112281918525696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18532422930002213, + "step": 13248 + }, + { + "epoch": 0.265, + "grad_norm": 2.0, + "grad_norm_var": 0.007637532552083334, + "learning_rate": 0.0001, + "loss": 4.0561, + "loss/crossentropy": 1.9225355386734009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19994951784610748, + "step": 13250 + }, + { + "epoch": 0.26504, + "grad_norm": 2.0, + "grad_norm_var": 0.007950846354166667, + "learning_rate": 0.0001, + "loss": 3.6825, + "loss/crossentropy": 2.0243517756462097, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19785288721323013, + "step": 13252 + }, + { + "epoch": 0.26508, + "grad_norm": 1.984375, + "grad_norm_var": 0.007413482666015625, + "learning_rate": 0.0001, + "loss": 4.2929, + "loss/crossentropy": 2.255920171737671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21220286190509796, + "step": 13254 + }, + { + "epoch": 0.26512, + "grad_norm": 2.0625, + "grad_norm_var": 0.008475748697916667, + "learning_rate": 0.0001, + "loss": 4.4145, + "loss/crossentropy": 2.3024203777313232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22255335003137589, + "step": 13256 + }, + { + "epoch": 0.26516, + "grad_norm": 2.03125, + "grad_norm_var": 0.00733642578125, + "learning_rate": 0.0001, + "loss": 4.1386, + "loss/crossentropy": 2.0467708110809326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20193494111299515, + "step": 13258 + }, + { + "epoch": 0.2652, + "grad_norm": 2.03125, + "grad_norm_var": 0.006064605712890625, + "learning_rate": 0.0001, + "loss": 4.2188, + "loss/crossentropy": 2.1543976068496704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2257647141814232, + "step": 13260 + }, + { + "epoch": 0.26524, + "grad_norm": 1.9609375, + "grad_norm_var": 0.006373850504557291, + "learning_rate": 0.0001, + "loss": 4.2683, + "loss/crossentropy": 2.077743351459503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21964918076992035, + "step": 13262 + }, + { + "epoch": 0.26528, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0044921875, + "learning_rate": 0.0001, + "loss": 3.963, + "loss/crossentropy": 1.6356236338615417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1847509667277336, + "step": 13264 + }, + { + "epoch": 0.26532, + "grad_norm": 1.953125, + "grad_norm_var": 0.006925201416015625, + "learning_rate": 0.0001, + "loss": 4.3559, + "loss/crossentropy": 2.0876659750938416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22080402821302414, + "step": 13266 + }, + { + "epoch": 0.26536, + "grad_norm": 2.109375, + "grad_norm_var": 0.005747222900390625, + "learning_rate": 0.0001, + "loss": 4.0452, + "loss/crossentropy": 1.7057855129241943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18772340565919876, + "step": 13268 + }, + { + "epoch": 0.2654, + "grad_norm": 1.890625, + "grad_norm_var": 0.0078277587890625, + "learning_rate": 0.0001, + "loss": 4.0032, + "loss/crossentropy": 2.0550594329833984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18500665575265884, + "step": 13270 + }, + { + "epoch": 0.26544, + "grad_norm": 2.0, + "grad_norm_var": 0.011131795247395833, + "learning_rate": 0.0001, + "loss": 4.2837, + "loss/crossentropy": 2.027509331703186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21377252787351608, + "step": 13272 + }, + { + "epoch": 0.26548, + "grad_norm": 1.921875, + "grad_norm_var": 0.012393951416015625, + "learning_rate": 0.0001, + "loss": 3.9739, + "loss/crossentropy": 2.0517578125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21172185242176056, + "step": 13274 + }, + { + "epoch": 0.26552, + "grad_norm": 2.09375, + "grad_norm_var": 0.01395263671875, + "learning_rate": 0.0001, + "loss": 4.1628, + "loss/crossentropy": 2.0550750494003296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22374523431062698, + "step": 13276 + }, + { + "epoch": 0.26556, + "grad_norm": 2.09375, + "grad_norm_var": 0.013399251302083333, + "learning_rate": 0.0001, + "loss": 4.1829, + "loss/crossentropy": 1.8055049777030945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17029780894517899, + "step": 13278 + }, + { + "epoch": 0.2656, + "grad_norm": 1.9765625, + "grad_norm_var": 0.013114166259765626, + "learning_rate": 0.0001, + "loss": 4.3999, + "loss/crossentropy": 1.9650321006774902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20327500253915787, + "step": 13280 + }, + { + "epoch": 0.26564, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010209147135416667, + "learning_rate": 0.0001, + "loss": 4.1395, + "loss/crossentropy": 2.164494276046753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21579623967409134, + "step": 13282 + }, + { + "epoch": 0.26568, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009810129801432291, + "learning_rate": 0.0001, + "loss": 4.0676, + "loss/crossentropy": 2.0488376021385193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20213299989700317, + "step": 13284 + }, + { + "epoch": 0.26572, + "grad_norm": 2.0, + "grad_norm_var": 0.010308583577473959, + "learning_rate": 0.0001, + "loss": 3.7442, + "loss/crossentropy": 1.948447048664093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19875720143318176, + "step": 13286 + }, + { + "epoch": 0.26576, + "grad_norm": 1.875, + "grad_norm_var": 0.008487701416015625, + "learning_rate": 0.0001, + "loss": 4.2312, + "loss/crossentropy": 2.115865170955658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19675103574991226, + "step": 13288 + }, + { + "epoch": 0.2658, + "grad_norm": 2.140625, + "grad_norm_var": 0.0105377197265625, + "learning_rate": 0.0001, + "loss": 4.2289, + "loss/crossentropy": 1.9720736145973206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20975899696350098, + "step": 13290 + }, + { + "epoch": 0.26584, + "grad_norm": 2.015625, + "grad_norm_var": 0.009154256184895833, + "learning_rate": 0.0001, + "loss": 3.9258, + "loss/crossentropy": 1.9105132222175598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20582708716392517, + "step": 13292 + }, + { + "epoch": 0.26588, + "grad_norm": 2.40625, + "grad_norm_var": 0.018369293212890624, + "learning_rate": 0.0001, + "loss": 4.2629, + "loss/crossentropy": 2.089789867401123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21264012902975082, + "step": 13294 + }, + { + "epoch": 0.26592, + "grad_norm": 2.03125, + "grad_norm_var": 0.018358357747395835, + "learning_rate": 0.0001, + "loss": 4.41, + "loss/crossentropy": 2.271156430244446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21146492660045624, + "step": 13296 + }, + { + "epoch": 0.26596, + "grad_norm": 1.90625, + "grad_norm_var": 0.0196929931640625, + "learning_rate": 0.0001, + "loss": 3.8554, + "loss/crossentropy": 1.930766224861145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1959189623594284, + "step": 13298 + }, + { + "epoch": 0.266, + "grad_norm": 2.25, + "grad_norm_var": 0.023374176025390624, + "learning_rate": 0.0001, + "loss": 4.3551, + "loss/crossentropy": 1.9945995807647705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20959854125976562, + "step": 13300 + }, + { + "epoch": 0.26604, + "grad_norm": 2.125, + "grad_norm_var": 0.020637003580729167, + "learning_rate": 0.0001, + "loss": 4.5445, + "loss/crossentropy": 2.2128443717956543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21266601234674454, + "step": 13302 + }, + { + "epoch": 0.26608, + "grad_norm": 2.03125, + "grad_norm_var": 0.018049112955729165, + "learning_rate": 0.0001, + "loss": 4.2578, + "loss/crossentropy": 2.076206088066101, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2037106677889824, + "step": 13304 + }, + { + "epoch": 0.26612, + "grad_norm": 2.171875, + "grad_norm_var": 0.018724568684895835, + "learning_rate": 0.0001, + "loss": 4.0512, + "loss/crossentropy": 2.077200174331665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.228537917137146, + "step": 13306 + }, + { + "epoch": 0.26616, + "grad_norm": 2.0, + "grad_norm_var": 0.01874567667643229, + "learning_rate": 0.0001, + "loss": 3.9641, + "loss/crossentropy": 2.0368794202804565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2150697112083435, + "step": 13308 + }, + { + "epoch": 0.2662, + "grad_norm": 1.9375, + "grad_norm_var": 0.016556803385416666, + "learning_rate": 0.0001, + "loss": 3.8516, + "loss/crossentropy": 1.9108307361602783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1903306469321251, + "step": 13310 + }, + { + "epoch": 0.26624, + "grad_norm": 2.046875, + "grad_norm_var": 0.016218058268229165, + "learning_rate": 0.0001, + "loss": 3.9928, + "loss/crossentropy": 1.8118465542793274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18557358533143997, + "step": 13312 + }, + { + "epoch": 0.26628, + "grad_norm": 1.8671875, + "grad_norm_var": 0.01736424763997396, + "learning_rate": 0.0001, + "loss": 3.897, + "loss/crossentropy": 1.7392275929450989, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19214990735054016, + "step": 13314 + }, + { + "epoch": 0.26632, + "grad_norm": 3.359375, + "grad_norm_var": 0.12773412068684895, + "learning_rate": 0.0001, + "loss": 3.9728, + "loss/crossentropy": 2.1361005306243896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22042304277420044, + "step": 13316 + }, + { + "epoch": 0.26636, + "grad_norm": 1.9375, + "grad_norm_var": 0.12786026000976564, + "learning_rate": 0.0001, + "loss": 4.0055, + "loss/crossentropy": 2.0087279677391052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20278790593147278, + "step": 13318 + }, + { + "epoch": 0.2664, + "grad_norm": 2.09375, + "grad_norm_var": 0.12776692708333334, + "learning_rate": 0.0001, + "loss": 4.0781, + "loss/crossentropy": 2.307933807373047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24929606914520264, + "step": 13320 + }, + { + "epoch": 0.26644, + "grad_norm": 2.078125, + "grad_norm_var": 0.12690022786458333, + "learning_rate": 0.0001, + "loss": 4.3451, + "loss/crossentropy": 2.5539438724517822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2219071164727211, + "step": 13322 + }, + { + "epoch": 0.26648, + "grad_norm": 1.953125, + "grad_norm_var": 0.12910334269205728, + "learning_rate": 0.0001, + "loss": 3.9984, + "loss/crossentropy": 1.9069242477416992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19276316463947296, + "step": 13324 + }, + { + "epoch": 0.26652, + "grad_norm": 2.140625, + "grad_norm_var": 0.12194010416666666, + "learning_rate": 0.0001, + "loss": 4.535, + "loss/crossentropy": 2.0715484619140625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20079267024993896, + "step": 13326 + }, + { + "epoch": 0.26656, + "grad_norm": 2.15625, + "grad_norm_var": 0.12164713541666666, + "learning_rate": 0.0001, + "loss": 4.46, + "loss/crossentropy": 2.1134172677993774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20564979314804077, + "step": 13328 + }, + { + "epoch": 0.2666, + "grad_norm": 2.046875, + "grad_norm_var": 0.11581929524739583, + "learning_rate": 0.0001, + "loss": 4.3992, + "loss/crossentropy": 2.103445053100586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19361437857151031, + "step": 13330 + }, + { + "epoch": 0.26664, + "grad_norm": 1.8203125, + "grad_norm_var": 0.014499664306640625, + "learning_rate": 0.0001, + "loss": 4.0169, + "loss/crossentropy": 2.1882529258728027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21402588486671448, + "step": 13332 + }, + { + "epoch": 0.26668, + "grad_norm": 2.046875, + "grad_norm_var": 0.015498606363932292, + "learning_rate": 0.0001, + "loss": 4.3247, + "loss/crossentropy": 2.2531981468200684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2019234597682953, + "step": 13334 + }, + { + "epoch": 0.26672, + "grad_norm": 1.9921875, + "grad_norm_var": 0.021954091389973958, + "learning_rate": 0.0001, + "loss": 3.8596, + "loss/crossentropy": 1.8077877759933472, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19121356308460236, + "step": 13336 + }, + { + "epoch": 0.26676, + "grad_norm": 2.046875, + "grad_norm_var": 0.02600072224934896, + "learning_rate": 0.0001, + "loss": 4.3195, + "loss/crossentropy": 1.7387139797210693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20326050370931625, + "step": 13338 + }, + { + "epoch": 0.2668, + "grad_norm": 1.984375, + "grad_norm_var": 0.022712198893229167, + "learning_rate": 0.0001, + "loss": 4.065, + "loss/crossentropy": 2.2061930894851685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22271326184272766, + "step": 13340 + }, + { + "epoch": 0.26684, + "grad_norm": 2.109375, + "grad_norm_var": 0.02191162109375, + "learning_rate": 0.0001, + "loss": 4.4018, + "loss/crossentropy": 2.096329092979431, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2201632410287857, + "step": 13342 + }, + { + "epoch": 0.26688, + "grad_norm": 1.9375, + "grad_norm_var": 0.0232574462890625, + "learning_rate": 0.0001, + "loss": 4.0783, + "loss/crossentropy": 2.2015284299850464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22004209458827972, + "step": 13344 + }, + { + "epoch": 0.26692, + "grad_norm": 2.125, + "grad_norm_var": 0.023957316080729166, + "learning_rate": 0.0001, + "loss": 4.5227, + "loss/crossentropy": 2.256517231464386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21762817353010178, + "step": 13346 + }, + { + "epoch": 0.26696, + "grad_norm": 1.828125, + "grad_norm_var": 0.022874959309895835, + "learning_rate": 0.0001, + "loss": 3.8578, + "loss/crossentropy": 1.8551252484321594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18950944393873215, + "step": 13348 + }, + { + "epoch": 0.267, + "grad_norm": 2.0, + "grad_norm_var": 0.020967610677083335, + "learning_rate": 0.0001, + "loss": 4.3051, + "loss/crossentropy": 1.8834841847419739, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17950639128684998, + "step": 13350 + }, + { + "epoch": 0.26704, + "grad_norm": 2.109375, + "grad_norm_var": 0.012872060139973959, + "learning_rate": 0.0001, + "loss": 4.0234, + "loss/crossentropy": 2.0059815645217896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2279331535100937, + "step": 13352 + }, + { + "epoch": 0.26708, + "grad_norm": 2.015625, + "grad_norm_var": 0.00594482421875, + "learning_rate": 0.0001, + "loss": 4.0102, + "loss/crossentropy": 2.2327537536621094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22790935635566711, + "step": 13354 + }, + { + "epoch": 0.26712, + "grad_norm": 2.109375, + "grad_norm_var": 0.0066650390625, + "learning_rate": 0.0001, + "loss": 4.0535, + "loss/crossentropy": 1.7623894214630127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17889687418937683, + "step": 13356 + }, + { + "epoch": 0.26716, + "grad_norm": 1.984375, + "grad_norm_var": 0.006205240885416667, + "learning_rate": 0.0001, + "loss": 4.1595, + "loss/crossentropy": 1.9240076541900635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18552126735448837, + "step": 13358 + }, + { + "epoch": 0.2672, + "grad_norm": 1.8515625, + "grad_norm_var": 0.007387034098307292, + "learning_rate": 0.0001, + "loss": 3.9162, + "loss/crossentropy": 2.301589012145996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20251981914043427, + "step": 13360 + }, + { + "epoch": 0.26724, + "grad_norm": 2.078125, + "grad_norm_var": 0.007671864827473959, + "learning_rate": 0.0001, + "loss": 4.1575, + "loss/crossentropy": 2.144057512283325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22049692273139954, + "step": 13362 + }, + { + "epoch": 0.26728, + "grad_norm": 2.03125, + "grad_norm_var": 0.006209309895833333, + "learning_rate": 0.0001, + "loss": 4.2381, + "loss/crossentropy": 1.9329981207847595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2015545517206192, + "step": 13364 + }, + { + "epoch": 0.26732, + "grad_norm": 2.046875, + "grad_norm_var": 0.0062896728515625, + "learning_rate": 0.0001, + "loss": 4.2544, + "loss/crossentropy": 2.057462990283966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19826926290988922, + "step": 13366 + }, + { + "epoch": 0.26736, + "grad_norm": 2.03125, + "grad_norm_var": 0.006670888264973958, + "learning_rate": 0.0001, + "loss": 4.085, + "loss/crossentropy": 2.097291588783264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18807460367679596, + "step": 13368 + }, + { + "epoch": 0.2674, + "grad_norm": 2.015625, + "grad_norm_var": 0.007281239827473958, + "learning_rate": 0.0001, + "loss": 4.1204, + "loss/crossentropy": 2.154082179069519, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22133710980415344, + "step": 13370 + }, + { + "epoch": 0.26744, + "grad_norm": 1.9453125, + "grad_norm_var": 0.017561848958333334, + "learning_rate": 0.0001, + "loss": 4.1412, + "loss/crossentropy": 2.208779454231262, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2139016017317772, + "step": 13372 + }, + { + "epoch": 0.26748, + "grad_norm": 2.0, + "grad_norm_var": 0.018355305989583334, + "learning_rate": 0.0001, + "loss": 3.8929, + "loss/crossentropy": 2.251901865005493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22264431416988373, + "step": 13374 + }, + { + "epoch": 0.26752, + "grad_norm": 2.15625, + "grad_norm_var": 0.016857655843098958, + "learning_rate": 0.0001, + "loss": 4.3243, + "loss/crossentropy": 2.4881285429000854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.250653475522995, + "step": 13376 + }, + { + "epoch": 0.26756, + "grad_norm": 2.140625, + "grad_norm_var": 0.017427571614583335, + "learning_rate": 0.0001, + "loss": 4.2806, + "loss/crossentropy": 2.104843556880951, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19004888087511063, + "step": 13378 + }, + { + "epoch": 0.2676, + "grad_norm": 2.359375, + "grad_norm_var": 0.022899373372395834, + "learning_rate": 0.0001, + "loss": 4.0708, + "loss/crossentropy": 1.913047194480896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20572267472743988, + "step": 13380 + }, + { + "epoch": 0.26764, + "grad_norm": 1.9765625, + "grad_norm_var": 0.02463353474934896, + "learning_rate": 0.0001, + "loss": 4.3632, + "loss/crossentropy": 2.1043838262557983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21029697358608246, + "step": 13382 + }, + { + "epoch": 0.26768, + "grad_norm": 2.1875, + "grad_norm_var": 0.02617162068684896, + "learning_rate": 0.0001, + "loss": 4.2429, + "loss/crossentropy": 2.055271625518799, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2113131284713745, + "step": 13384 + }, + { + "epoch": 0.26772, + "grad_norm": 2.71875, + "grad_norm_var": 0.0488433837890625, + "learning_rate": 0.0001, + "loss": 4.4956, + "loss/crossentropy": 1.8407886624336243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2729990780353546, + "step": 13386 + }, + { + "epoch": 0.26776, + "grad_norm": 2.0625, + "grad_norm_var": 0.04177017211914062, + "learning_rate": 0.0001, + "loss": 4.0097, + "loss/crossentropy": 2.0368717908859253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21415862441062927, + "step": 13388 + }, + { + "epoch": 0.2678, + "grad_norm": 1.953125, + "grad_norm_var": 0.040897369384765625, + "learning_rate": 0.0001, + "loss": 4.1836, + "loss/crossentropy": 1.8235292434692383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2062123641371727, + "step": 13390 + }, + { + "epoch": 0.26784, + "grad_norm": 1.9375, + "grad_norm_var": 0.04222183227539063, + "learning_rate": 0.0001, + "loss": 3.8392, + "loss/crossentropy": 1.733048439025879, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20651907473802567, + "step": 13392 + }, + { + "epoch": 0.26788, + "grad_norm": 1.9765625, + "grad_norm_var": 0.043454742431640624, + "learning_rate": 0.0001, + "loss": 3.9004, + "loss/crossentropy": 2.1339367628097534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21730080246925354, + "step": 13394 + }, + { + "epoch": 0.26792, + "grad_norm": 1.890625, + "grad_norm_var": 0.0452789306640625, + "learning_rate": 0.0001, + "loss": 3.8421, + "loss/crossentropy": 1.885707974433899, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20833835750818253, + "step": 13396 + }, + { + "epoch": 0.26796, + "grad_norm": 1.890625, + "grad_norm_var": 0.04429423014322917, + "learning_rate": 0.0001, + "loss": 3.9714, + "loss/crossentropy": 1.9896257519721985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1938292756676674, + "step": 13398 + }, + { + "epoch": 0.268, + "grad_norm": 2.25, + "grad_norm_var": 0.04457575480143229, + "learning_rate": 0.0001, + "loss": 4.2962, + "loss/crossentropy": 2.0359573364257812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21572312712669373, + "step": 13400 + }, + { + "epoch": 0.26804, + "grad_norm": 1.9296875, + "grad_norm_var": 0.014095052083333334, + "learning_rate": 0.0001, + "loss": 4.3483, + "loss/crossentropy": 2.241006851196289, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2361859679222107, + "step": 13402 + }, + { + "epoch": 0.26808, + "grad_norm": 1.96875, + "grad_norm_var": 0.013852691650390625, + "learning_rate": 0.0001, + "loss": 4.0518, + "loss/crossentropy": 2.0988917350769043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20220966637134552, + "step": 13404 + }, + { + "epoch": 0.26812, + "grad_norm": 2.078125, + "grad_norm_var": 0.014802805582682292, + "learning_rate": 0.0001, + "loss": 4.3664, + "loss/crossentropy": 2.0803651213645935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22251859307289124, + "step": 13406 + }, + { + "epoch": 0.26816, + "grad_norm": 2.046875, + "grad_norm_var": 0.015827433268229166, + "learning_rate": 0.0001, + "loss": 4.0473, + "loss/crossentropy": 1.9151242971420288, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19792036712169647, + "step": 13408 + }, + { + "epoch": 0.2682, + "grad_norm": 2.96875, + "grad_norm_var": 0.07619196573893229, + "learning_rate": 0.0001, + "loss": 4.0321, + "loss/crossentropy": 1.908549964427948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21568355709314346, + "step": 13410 + }, + { + "epoch": 0.26824, + "grad_norm": 2.109375, + "grad_norm_var": 0.06887919108072917, + "learning_rate": 0.0001, + "loss": 4.3844, + "loss/crossentropy": 2.3226535320281982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2332221046090126, + "step": 13412 + }, + { + "epoch": 0.26828, + "grad_norm": 1.9765625, + "grad_norm_var": 0.06696548461914062, + "learning_rate": 0.0001, + "loss": 4.24, + "loss/crossentropy": 1.9767839312553406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2038547545671463, + "step": 13414 + }, + { + "epoch": 0.26832, + "grad_norm": 1.859375, + "grad_norm_var": 0.06800028483072916, + "learning_rate": 0.0001, + "loss": 3.9592, + "loss/crossentropy": 2.2013003826141357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2076154574751854, + "step": 13416 + }, + { + "epoch": 0.26836, + "grad_norm": 1.9921875, + "grad_norm_var": 0.06597671508789063, + "learning_rate": 0.0001, + "loss": 4.1128, + "loss/crossentropy": 2.0941338539123535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20538556575775146, + "step": 13418 + }, + { + "epoch": 0.2684, + "grad_norm": 2.125, + "grad_norm_var": 0.06494954427083334, + "learning_rate": 0.0001, + "loss": 4.3656, + "loss/crossentropy": 2.0206560492515564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20901528000831604, + "step": 13420 + }, + { + "epoch": 0.26844, + "grad_norm": 2.171875, + "grad_norm_var": 0.06690648396809896, + "learning_rate": 0.0001, + "loss": 4.2606, + "loss/crossentropy": 2.1317169070243835, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21660470962524414, + "step": 13422 + }, + { + "epoch": 0.26848, + "grad_norm": 1.8203125, + "grad_norm_var": 0.07109273274739583, + "learning_rate": 0.0001, + "loss": 3.6026, + "loss/crossentropy": 1.631429135799408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1729787141084671, + "step": 13424 + }, + { + "epoch": 0.26852, + "grad_norm": 1.8203125, + "grad_norm_var": 0.012491607666015625, + "learning_rate": 0.0001, + "loss": 3.923, + "loss/crossentropy": 1.8428707122802734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18231894075870514, + "step": 13426 + }, + { + "epoch": 0.26856, + "grad_norm": 1.984375, + "grad_norm_var": 0.011104075113932292, + "learning_rate": 0.0001, + "loss": 4.2351, + "loss/crossentropy": 1.860277235507965, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18769516795873642, + "step": 13428 + }, + { + "epoch": 0.2686, + "grad_norm": 1.890625, + "grad_norm_var": 0.012542470296223959, + "learning_rate": 0.0001, + "loss": 4.0199, + "loss/crossentropy": 2.02046799659729, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2160683423280716, + "step": 13430 + }, + { + "epoch": 0.26864, + "grad_norm": 2.359375, + "grad_norm_var": 0.021491495768229167, + "learning_rate": 0.0001, + "loss": 4.5105, + "loss/crossentropy": 2.094564437866211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22155898064374924, + "step": 13432 + }, + { + "epoch": 0.26868, + "grad_norm": 2.109375, + "grad_norm_var": 0.02215550740559896, + "learning_rate": 0.0001, + "loss": 4.0382, + "loss/crossentropy": 2.013366401195526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2126912623643875, + "step": 13434 + }, + { + "epoch": 0.26872, + "grad_norm": 2.015625, + "grad_norm_var": 0.02212702433268229, + "learning_rate": 0.0001, + "loss": 4.5826, + "loss/crossentropy": 2.360998272895813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2355523630976677, + "step": 13436 + }, + { + "epoch": 0.26876, + "grad_norm": 1.7890625, + "grad_norm_var": 0.02289606730143229, + "learning_rate": 0.0001, + "loss": 3.7129, + "loss/crossentropy": 1.754651427268982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18666986376047134, + "step": 13438 + }, + { + "epoch": 0.2688, + "grad_norm": 2.03125, + "grad_norm_var": 0.01812922159830729, + "learning_rate": 0.0001, + "loss": 4.1024, + "loss/crossentropy": 1.9176424741744995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19241741299629211, + "step": 13440 + }, + { + "epoch": 0.26884, + "grad_norm": 2.078125, + "grad_norm_var": 0.0154449462890625, + "learning_rate": 0.0001, + "loss": 4.2548, + "loss/crossentropy": 2.324304223060608, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22302107512950897, + "step": 13442 + }, + { + "epoch": 0.26888, + "grad_norm": 2.5, + "grad_norm_var": 0.028758748372395834, + "learning_rate": 0.0001, + "loss": 4.0693, + "loss/crossentropy": 1.8527624011039734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18644367158412933, + "step": 13444 + }, + { + "epoch": 0.26892, + "grad_norm": 2.1875, + "grad_norm_var": 0.027570597330729165, + "learning_rate": 0.0001, + "loss": 4.4437, + "loss/crossentropy": 2.0999260544776917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21172572672367096, + "step": 13446 + }, + { + "epoch": 0.26896, + "grad_norm": 1.90625, + "grad_norm_var": 0.026627349853515624, + "learning_rate": 0.0001, + "loss": 3.8125, + "loss/crossentropy": 1.8577081561088562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17302225530147552, + "step": 13448 + }, + { + "epoch": 0.269, + "grad_norm": 1.921875, + "grad_norm_var": 0.026956939697265626, + "learning_rate": 0.0001, + "loss": 4.2182, + "loss/crossentropy": 2.11561119556427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20249811559915543, + "step": 13450 + }, + { + "epoch": 0.26904, + "grad_norm": 1.921875, + "grad_norm_var": 0.026656087239583334, + "learning_rate": 0.0001, + "loss": 4.2854, + "loss/crossentropy": 2.3036913871765137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20101947337388992, + "step": 13452 + }, + { + "epoch": 0.26908, + "grad_norm": 2.078125, + "grad_norm_var": 0.02341283162434896, + "learning_rate": 0.0001, + "loss": 3.9013, + "loss/crossentropy": 2.0498175621032715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19935546070337296, + "step": 13454 + }, + { + "epoch": 0.26912, + "grad_norm": 2.09375, + "grad_norm_var": 0.08842137654622396, + "learning_rate": 0.0001, + "loss": 4.2601, + "loss/crossentropy": 2.27053964138031, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31231266260147095, + "step": 13456 + }, + { + "epoch": 0.26916, + "grad_norm": 1.921875, + "grad_norm_var": 0.0908953348795573, + "learning_rate": 0.0001, + "loss": 4.0035, + "loss/crossentropy": 2.191486120223999, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2058379054069519, + "step": 13458 + }, + { + "epoch": 0.2692, + "grad_norm": 2.015625, + "grad_norm_var": 0.07780939737955729, + "learning_rate": 0.0001, + "loss": 4.1193, + "loss/crossentropy": 2.008154332637787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20862725377082825, + "step": 13460 + }, + { + "epoch": 0.26924, + "grad_norm": 2.109375, + "grad_norm_var": 0.07779947916666667, + "learning_rate": 0.0001, + "loss": 4.3191, + "loss/crossentropy": 2.3374814987182617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24534277617931366, + "step": 13462 + }, + { + "epoch": 0.26928, + "grad_norm": 2.0625, + "grad_norm_var": 0.07355855305989584, + "learning_rate": 0.0001, + "loss": 4.0558, + "loss/crossentropy": 2.2654502391815186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21406329423189163, + "step": 13464 + }, + { + "epoch": 0.26932, + "grad_norm": 2.21875, + "grad_norm_var": 0.07412821451822917, + "learning_rate": 0.0001, + "loss": 4.2078, + "loss/crossentropy": 2.1021856665611267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19653601199388504, + "step": 13466 + }, + { + "epoch": 0.26936, + "grad_norm": 2.140625, + "grad_norm_var": 0.0727068583170573, + "learning_rate": 0.0001, + "loss": 4.4466, + "loss/crossentropy": 1.9482329487800598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20822357386350632, + "step": 13468 + }, + { + "epoch": 0.2694, + "grad_norm": 1.8125, + "grad_norm_var": 0.07814915974934895, + "learning_rate": 0.0001, + "loss": 4.1036, + "loss/crossentropy": 2.122725486755371, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2132444903254509, + "step": 13470 + }, + { + "epoch": 0.26944, + "grad_norm": 2.0, + "grad_norm_var": 0.015044911702473959, + "learning_rate": 0.0001, + "loss": 4.0337, + "loss/crossentropy": 1.760430932044983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1835266426205635, + "step": 13472 + }, + { + "epoch": 0.26948, + "grad_norm": 2.046875, + "grad_norm_var": 0.014847564697265624, + "learning_rate": 0.0001, + "loss": 4.0338, + "loss/crossentropy": 1.610491931438446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19731693714857101, + "step": 13474 + }, + { + "epoch": 0.26952, + "grad_norm": 2.03125, + "grad_norm_var": 0.015313466389973959, + "learning_rate": 0.0001, + "loss": 4.213, + "loss/crossentropy": 2.1966941356658936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22074176371097565, + "step": 13476 + }, + { + "epoch": 0.26956, + "grad_norm": 1.8515625, + "grad_norm_var": 0.029904937744140624, + "learning_rate": 0.0001, + "loss": 4.1291, + "loss/crossentropy": 2.045714259147644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2193753719329834, + "step": 13478 + }, + { + "epoch": 0.2696, + "grad_norm": 2.0, + "grad_norm_var": 0.029886881510416668, + "learning_rate": 0.0001, + "loss": 4.0127, + "loss/crossentropy": 1.860603928565979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1900448128581047, + "step": 13480 + }, + { + "epoch": 0.26964, + "grad_norm": 2.125, + "grad_norm_var": 0.028449503580729167, + "learning_rate": 0.0001, + "loss": 4.0643, + "loss/crossentropy": 2.171591639518738, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21908972412347794, + "step": 13482 + }, + { + "epoch": 0.26968, + "grad_norm": 2.296875, + "grad_norm_var": 0.030265299479166667, + "learning_rate": 0.0001, + "loss": 4.1351, + "loss/crossentropy": 2.271330237388611, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21823390573263168, + "step": 13484 + }, + { + "epoch": 0.26972, + "grad_norm": 1.953125, + "grad_norm_var": 0.026875813802083332, + "learning_rate": 0.0001, + "loss": 4.192, + "loss/crossentropy": 2.155557870864868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20512574911117554, + "step": 13486 + }, + { + "epoch": 0.26976, + "grad_norm": 1.9609375, + "grad_norm_var": 0.027164459228515625, + "learning_rate": 0.0001, + "loss": 4.2561, + "loss/crossentropy": 2.3827039003372192, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22485698014497757, + "step": 13488 + }, + { + "epoch": 0.2698, + "grad_norm": 1.9375, + "grad_norm_var": 0.028636678059895834, + "learning_rate": 0.0001, + "loss": 3.9937, + "loss/crossentropy": 2.225351929664612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22235903143882751, + "step": 13490 + }, + { + "epoch": 0.26984, + "grad_norm": 2.015625, + "grad_norm_var": 0.027717081705729167, + "learning_rate": 0.0001, + "loss": 4.0144, + "loss/crossentropy": 2.1125447750091553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20206717401742935, + "step": 13492 + }, + { + "epoch": 0.26988, + "grad_norm": 1.9296875, + "grad_norm_var": 0.013639068603515625, + "learning_rate": 0.0001, + "loss": 4.0318, + "loss/crossentropy": 2.30017626285553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22321031987667084, + "step": 13494 + }, + { + "epoch": 0.26992, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0145904541015625, + "learning_rate": 0.0001, + "loss": 4.3979, + "loss/crossentropy": 2.416603446006775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2413351908326149, + "step": 13496 + }, + { + "epoch": 0.26996, + "grad_norm": 2.046875, + "grad_norm_var": 0.013185373942057292, + "learning_rate": 0.0001, + "loss": 4.0436, + "loss/crossentropy": 2.06734299659729, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23090296238660812, + "step": 13498 + }, + { + "epoch": 0.27, + "grad_norm": 1.96875, + "grad_norm_var": 0.0050961812337239586, + "learning_rate": 0.0001, + "loss": 4.0, + "loss/crossentropy": 1.8088473677635193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1988995596766472, + "step": 13500 + }, + { + "epoch": 0.27004, + "grad_norm": 2.015625, + "grad_norm_var": 0.005092112223307291, + "learning_rate": 0.0001, + "loss": 4.0205, + "loss/crossentropy": 1.7075524926185608, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18463176488876343, + "step": 13502 + }, + { + "epoch": 0.27008, + "grad_norm": 2.21875, + "grad_norm_var": 0.0083160400390625, + "learning_rate": 0.0001, + "loss": 4.2291, + "loss/crossentropy": 2.1184898018836975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20941460877656937, + "step": 13504 + }, + { + "epoch": 0.27012, + "grad_norm": 1.890625, + "grad_norm_var": 0.007736968994140625, + "learning_rate": 0.0001, + "loss": 4.0139, + "loss/crossentropy": 1.8351567387580872, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18913155794143677, + "step": 13506 + }, + { + "epoch": 0.27016, + "grad_norm": 1.8515625, + "grad_norm_var": 0.00914306640625, + "learning_rate": 0.0001, + "loss": 3.935, + "loss/crossentropy": 2.090391516685486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21020452678203583, + "step": 13508 + }, + { + "epoch": 0.2702, + "grad_norm": 2.046875, + "grad_norm_var": 0.0078521728515625, + "learning_rate": 0.0001, + "loss": 4.1805, + "loss/crossentropy": 2.0680218935012817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20998934656381607, + "step": 13510 + }, + { + "epoch": 0.27024, + "grad_norm": 1.90625, + "grad_norm_var": 0.007236480712890625, + "learning_rate": 0.0001, + "loss": 4.0074, + "loss/crossentropy": 2.258071780204773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22634688764810562, + "step": 13512 + }, + { + "epoch": 0.27028, + "grad_norm": 2.15625, + "grad_norm_var": 0.03350397745768229, + "learning_rate": 0.0001, + "loss": 4.775, + "loss/crossentropy": 1.931319773197174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29265259206295013, + "step": 13514 + }, + { + "epoch": 0.27032, + "grad_norm": 1.9296875, + "grad_norm_var": 0.03398844401041667, + "learning_rate": 0.0001, + "loss": 4.3408, + "loss/crossentropy": 2.45454478263855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22465243190526962, + "step": 13516 + }, + { + "epoch": 0.27036, + "grad_norm": 2.046875, + "grad_norm_var": 0.034063466389973956, + "learning_rate": 0.0001, + "loss": 4.2279, + "loss/crossentropy": 2.005887746810913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19818827509880066, + "step": 13518 + }, + { + "epoch": 0.2704, + "grad_norm": 2.03125, + "grad_norm_var": 0.031107330322265626, + "learning_rate": 0.0001, + "loss": 4.318, + "loss/crossentropy": 2.2329577207565308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21015001833438873, + "step": 13520 + }, + { + "epoch": 0.27044, + "grad_norm": 1.8359375, + "grad_norm_var": 0.032373046875, + "learning_rate": 0.0001, + "loss": 4.1026, + "loss/crossentropy": 2.1716688871383667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20930524915456772, + "step": 13522 + }, + { + "epoch": 0.27048, + "grad_norm": 1.9609375, + "grad_norm_var": 0.030651601155598958, + "learning_rate": 0.0001, + "loss": 3.957, + "loss/crossentropy": 1.659675419330597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1963459849357605, + "step": 13524 + }, + { + "epoch": 0.27052, + "grad_norm": 1.9140625, + "grad_norm_var": 0.035359700520833336, + "learning_rate": 0.0001, + "loss": 4.2214, + "loss/crossentropy": 1.9285706877708435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1708521470427513, + "step": 13526 + }, + { + "epoch": 0.27056, + "grad_norm": 2.046875, + "grad_norm_var": 0.03286031087239583, + "learning_rate": 0.0001, + "loss": 4.1771, + "loss/crossentropy": 2.0428953170776367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20893662422895432, + "step": 13528 + }, + { + "epoch": 0.2706, + "grad_norm": 2.078125, + "grad_norm_var": 0.011655426025390625, + "learning_rate": 0.0001, + "loss": 3.9885, + "loss/crossentropy": 1.979922592639923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19091727584600449, + "step": 13530 + }, + { + "epoch": 0.27064, + "grad_norm": 1.984375, + "grad_norm_var": 0.015922037760416667, + "learning_rate": 0.0001, + "loss": 4.3176, + "loss/crossentropy": 2.1879321336746216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24813223630189896, + "step": 13532 + }, + { + "epoch": 0.27068, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01631647745768229, + "learning_rate": 0.0001, + "loss": 4.1852, + "loss/crossentropy": 2.0450875759124756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21059003472328186, + "step": 13534 + }, + { + "epoch": 0.27072, + "grad_norm": 1.984375, + "grad_norm_var": 0.01673151652018229, + "learning_rate": 0.0001, + "loss": 4.3549, + "loss/crossentropy": 2.050394892692566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21235206723213196, + "step": 13536 + }, + { + "epoch": 0.27076, + "grad_norm": 2.0, + "grad_norm_var": 0.014793904622395833, + "learning_rate": 0.0001, + "loss": 4.0128, + "loss/crossentropy": 2.0687233805656433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22125782817602158, + "step": 13538 + }, + { + "epoch": 0.2708, + "grad_norm": 2.046875, + "grad_norm_var": 0.014647420247395833, + "learning_rate": 0.0001, + "loss": 4.1247, + "loss/crossentropy": 1.8587198853492737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19718395173549652, + "step": 13540 + }, + { + "epoch": 0.27084, + "grad_norm": 2.125, + "grad_norm_var": 0.010223134358723959, + "learning_rate": 0.0001, + "loss": 4.2992, + "loss/crossentropy": 2.2416387796401978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23797060549259186, + "step": 13542 + }, + { + "epoch": 0.27088, + "grad_norm": 4.8125, + "grad_norm_var": 0.495751953125, + "learning_rate": 0.0001, + "loss": 4.1875, + "loss/crossentropy": 2.251328468322754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23915010690689087, + "step": 13544 + }, + { + "epoch": 0.27092, + "grad_norm": 2.125, + "grad_norm_var": 0.4896074930826823, + "learning_rate": 0.0001, + "loss": 4.0601, + "loss/crossentropy": 1.990403652191162, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21710015833377838, + "step": 13546 + }, + { + "epoch": 0.27096, + "grad_norm": 2.09375, + "grad_norm_var": 0.4871070861816406, + "learning_rate": 0.0001, + "loss": 4.38, + "loss/crossentropy": 2.176861047744751, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2153458073735237, + "step": 13548 + }, + { + "epoch": 0.271, + "grad_norm": 1.953125, + "grad_norm_var": 0.4867286682128906, + "learning_rate": 0.0001, + "loss": 4.0311, + "loss/crossentropy": 1.581967830657959, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1787950098514557, + "step": 13550 + }, + { + "epoch": 0.27104, + "grad_norm": 1.90625, + "grad_norm_var": 0.4920183817545573, + "learning_rate": 0.0001, + "loss": 3.9803, + "loss/crossentropy": 1.827072560787201, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18847094476222992, + "step": 13552 + }, + { + "epoch": 0.27108, + "grad_norm": 2.03125, + "grad_norm_var": 0.49279683430989585, + "learning_rate": 0.0001, + "loss": 3.9381, + "loss/crossentropy": 1.5748514533042908, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18339257687330246, + "step": 13554 + }, + { + "epoch": 0.27112, + "grad_norm": 2.125, + "grad_norm_var": 0.4897989908854167, + "learning_rate": 0.0001, + "loss": 4.2545, + "loss/crossentropy": 2.3187586069107056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2535529136657715, + "step": 13556 + }, + { + "epoch": 0.27116, + "grad_norm": 2.15625, + "grad_norm_var": 0.4885660807291667, + "learning_rate": 0.0001, + "loss": 4.4848, + "loss/crossentropy": 2.3675668239593506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23080945760011673, + "step": 13558 + }, + { + "epoch": 0.2712, + "grad_norm": 2.0625, + "grad_norm_var": 0.0066912333170572914, + "learning_rate": 0.0001, + "loss": 4.2032, + "loss/crossentropy": 2.3976542949676514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2418106645345688, + "step": 13560 + }, + { + "epoch": 0.27124, + "grad_norm": 1.984375, + "grad_norm_var": 0.006221262613932291, + "learning_rate": 0.0001, + "loss": 4.1877, + "loss/crossentropy": 2.1341328024864197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21845046430826187, + "step": 13562 + }, + { + "epoch": 0.27128, + "grad_norm": 2.046875, + "grad_norm_var": 0.006154123942057292, + "learning_rate": 0.0001, + "loss": 4.3067, + "loss/crossentropy": 2.0519689321517944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21991144120693207, + "step": 13564 + }, + { + "epoch": 0.27132, + "grad_norm": 2.0, + "grad_norm_var": 0.005812327067057292, + "learning_rate": 0.0001, + "loss": 4.106, + "loss/crossentropy": 1.9690070748329163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20945511013269424, + "step": 13566 + }, + { + "epoch": 0.27136, + "grad_norm": 2.09375, + "grad_norm_var": 0.004609934488932292, + "learning_rate": 0.0001, + "loss": 4.0929, + "loss/crossentropy": 1.9553492069244385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2299523577094078, + "step": 13568 + }, + { + "epoch": 0.2714, + "grad_norm": 2.25, + "grad_norm_var": 0.0055501302083333336, + "learning_rate": 0.0001, + "loss": 4.158, + "loss/crossentropy": 1.8214278817176819, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1801338642835617, + "step": 13570 + }, + { + "epoch": 0.27144, + "grad_norm": 1.7578125, + "grad_norm_var": 0.011822255452473958, + "learning_rate": 0.0001, + "loss": 3.8658, + "loss/crossentropy": 1.7834638953208923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18421506136655807, + "step": 13572 + }, + { + "epoch": 0.27148, + "grad_norm": 1.984375, + "grad_norm_var": 0.013097890218098958, + "learning_rate": 0.0001, + "loss": 3.8404, + "loss/crossentropy": 1.7703429460525513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18263264745473862, + "step": 13574 + }, + { + "epoch": 0.27152, + "grad_norm": 2.21875, + "grad_norm_var": 0.0155181884765625, + "learning_rate": 0.0001, + "loss": 4.1489, + "loss/crossentropy": 2.228934168815613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21034922450780869, + "step": 13576 + }, + { + "epoch": 0.27156, + "grad_norm": 2.046875, + "grad_norm_var": 0.015379842122395833, + "learning_rate": 0.0001, + "loss": 4.4665, + "loss/crossentropy": 2.160655975341797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22300932556390762, + "step": 13578 + }, + { + "epoch": 0.2716, + "grad_norm": 2.078125, + "grad_norm_var": 0.0150634765625, + "learning_rate": 0.0001, + "loss": 4.2042, + "loss/crossentropy": 2.1582624912261963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21843481063842773, + "step": 13580 + }, + { + "epoch": 0.27164, + "grad_norm": 2.03125, + "grad_norm_var": 0.0150146484375, + "learning_rate": 0.0001, + "loss": 4.2513, + "loss/crossentropy": 2.0660162568092346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1985679790377617, + "step": 13582 + }, + { + "epoch": 0.27168, + "grad_norm": 2.0625, + "grad_norm_var": 0.015433502197265626, + "learning_rate": 0.0001, + "loss": 3.873, + "loss/crossentropy": 1.839695692062378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1772506907582283, + "step": 13584 + }, + { + "epoch": 0.27172, + "grad_norm": 2.015625, + "grad_norm_var": 0.012345123291015624, + "learning_rate": 0.0001, + "loss": 3.9649, + "loss/crossentropy": 2.0837132930755615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19461007416248322, + "step": 13586 + }, + { + "epoch": 0.27176, + "grad_norm": 2.09375, + "grad_norm_var": 0.00750732421875, + "learning_rate": 0.0001, + "loss": 4.1986, + "loss/crossentropy": 1.910473346710205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20272061973810196, + "step": 13588 + }, + { + "epoch": 0.2718, + "grad_norm": 2.921875, + "grad_norm_var": 0.054402669270833336, + "learning_rate": 0.0001, + "loss": 4.2259, + "loss/crossentropy": 2.2427467107772827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21372541785240173, + "step": 13590 + }, + { + "epoch": 0.27184, + "grad_norm": 2.0625, + "grad_norm_var": 0.055272420247395836, + "learning_rate": 0.0001, + "loss": 4.1123, + "loss/crossentropy": 2.165425181388855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2552504763007164, + "step": 13592 + }, + { + "epoch": 0.27188, + "grad_norm": 1.96875, + "grad_norm_var": 0.05620930989583333, + "learning_rate": 0.0001, + "loss": 4.0993, + "loss/crossentropy": 1.9684009552001953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19894393533468246, + "step": 13594 + }, + { + "epoch": 0.27192, + "grad_norm": 2.046875, + "grad_norm_var": 0.055863444010416666, + "learning_rate": 0.0001, + "loss": 4.2421, + "loss/crossentropy": 2.0572606325149536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20954644680023193, + "step": 13596 + }, + { + "epoch": 0.27196, + "grad_norm": 1.78125, + "grad_norm_var": 0.0611724853515625, + "learning_rate": 0.0001, + "loss": 3.5511, + "loss/crossentropy": 1.8215171694755554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1905388981103897, + "step": 13598 + }, + { + "epoch": 0.272, + "grad_norm": 2.28125, + "grad_norm_var": 0.06299209594726562, + "learning_rate": 0.0001, + "loss": 4.2407, + "loss/crossentropy": 2.114215850830078, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21045731008052826, + "step": 13600 + }, + { + "epoch": 0.27204, + "grad_norm": 1.9921875, + "grad_norm_var": 0.06243057250976562, + "learning_rate": 0.0001, + "loss": 4.1409, + "loss/crossentropy": 2.301407814025879, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2148626372218132, + "step": 13602 + }, + { + "epoch": 0.27208, + "grad_norm": 1.9453125, + "grad_norm_var": 0.06350809733072917, + "learning_rate": 0.0001, + "loss": 4.4367, + "loss/crossentropy": 2.0508424639701843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21292225271463394, + "step": 13604 + }, + { + "epoch": 0.27212, + "grad_norm": 2.1875, + "grad_norm_var": 0.014826456705729166, + "learning_rate": 0.0001, + "loss": 4.0024, + "loss/crossentropy": 2.095071792602539, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23258864879608154, + "step": 13606 + }, + { + "epoch": 0.27216, + "grad_norm": 1.9765625, + "grad_norm_var": 0.013802083333333333, + "learning_rate": 0.0001, + "loss": 4.1722, + "loss/crossentropy": 2.3613405227661133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20384693890810013, + "step": 13608 + }, + { + "epoch": 0.2722, + "grad_norm": 2.0625, + "grad_norm_var": 0.0142578125, + "learning_rate": 0.0001, + "loss": 4.1397, + "loss/crossentropy": 2.1618664264678955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21691302955150604, + "step": 13610 + }, + { + "epoch": 0.27224, + "grad_norm": 2.03125, + "grad_norm_var": 0.015620676676432292, + "learning_rate": 0.0001, + "loss": 3.8335, + "loss/crossentropy": 1.6352717280387878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18613358587026596, + "step": 13612 + }, + { + "epoch": 0.27228, + "grad_norm": 2.15625, + "grad_norm_var": 0.012851715087890625, + "learning_rate": 0.0001, + "loss": 4.2405, + "loss/crossentropy": 2.304627537727356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21715758740901947, + "step": 13614 + }, + { + "epoch": 0.27232, + "grad_norm": 2.078125, + "grad_norm_var": 0.009357706705729166, + "learning_rate": 0.0001, + "loss": 4.1845, + "loss/crossentropy": 2.232232451438904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20997462421655655, + "step": 13616 + }, + { + "epoch": 0.27236, + "grad_norm": 2.109375, + "grad_norm_var": 0.011503092447916667, + "learning_rate": 0.0001, + "loss": 4.1273, + "loss/crossentropy": 2.0350120663642883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20753738284111023, + "step": 13618 + }, + { + "epoch": 0.2724, + "grad_norm": 1.8828125, + "grad_norm_var": 0.015746053059895834, + "learning_rate": 0.0001, + "loss": 4.1955, + "loss/crossentropy": 1.892416536808014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1920388638973236, + "step": 13620 + }, + { + "epoch": 0.27244, + "grad_norm": 2.015625, + "grad_norm_var": 0.0128814697265625, + "learning_rate": 0.0001, + "loss": 4.3283, + "loss/crossentropy": 2.1694198846817017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.217724971473217, + "step": 13622 + }, + { + "epoch": 0.27248, + "grad_norm": 1.96875, + "grad_norm_var": 0.013444010416666667, + "learning_rate": 0.0001, + "loss": 3.9786, + "loss/crossentropy": 2.1418001651763916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2016705498099327, + "step": 13624 + }, + { + "epoch": 0.27252, + "grad_norm": 2.171875, + "grad_norm_var": 0.014240519205729166, + "learning_rate": 0.0001, + "loss": 4.3992, + "loss/crossentropy": 2.32223117351532, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21616849303245544, + "step": 13626 + }, + { + "epoch": 0.27256, + "grad_norm": 2.03125, + "grad_norm_var": 0.013044230143229167, + "learning_rate": 0.0001, + "loss": 4.1847, + "loss/crossentropy": 1.9991823434829712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2077011615037918, + "step": 13628 + }, + { + "epoch": 0.2726, + "grad_norm": 1.8984375, + "grad_norm_var": 0.012434641520182291, + "learning_rate": 0.0001, + "loss": 4.1331, + "loss/crossentropy": 2.1381043195724487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20748750865459442, + "step": 13630 + }, + { + "epoch": 0.27264, + "grad_norm": 2.015625, + "grad_norm_var": 0.012303670247395834, + "learning_rate": 0.0001, + "loss": 4.2215, + "loss/crossentropy": 2.018588602542877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20897169411182404, + "step": 13632 + }, + { + "epoch": 0.27268, + "grad_norm": 2.15625, + "grad_norm_var": 0.01114501953125, + "learning_rate": 0.0001, + "loss": 4.3528, + "loss/crossentropy": 2.1859233379364014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21695572137832642, + "step": 13634 + }, + { + "epoch": 0.27272, + "grad_norm": 2.015625, + "grad_norm_var": 0.0060198465983072914, + "learning_rate": 0.0001, + "loss": 4.2106, + "loss/crossentropy": 2.17002010345459, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21605493128299713, + "step": 13636 + }, + { + "epoch": 0.27276, + "grad_norm": 2.046875, + "grad_norm_var": 0.006058502197265625, + "learning_rate": 0.0001, + "loss": 4.261, + "loss/crossentropy": 2.186649441719055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2172158733010292, + "step": 13638 + }, + { + "epoch": 0.2728, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007306925455729167, + "learning_rate": 0.0001, + "loss": 3.8394, + "loss/crossentropy": 1.7324808835983276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17334696650505066, + "step": 13640 + }, + { + "epoch": 0.27284, + "grad_norm": 2.03125, + "grad_norm_var": 0.0049550374348958336, + "learning_rate": 0.0001, + "loss": 3.9805, + "loss/crossentropy": 1.7768054008483887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1750154346227646, + "step": 13642 + }, + { + "epoch": 0.27288, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0047515869140625, + "learning_rate": 0.0001, + "loss": 4.2292, + "loss/crossentropy": 1.9629738330841064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19489944726228714, + "step": 13644 + }, + { + "epoch": 0.27292, + "grad_norm": 2.109375, + "grad_norm_var": 0.006882476806640625, + "learning_rate": 0.0001, + "loss": 4.4096, + "loss/crossentropy": 2.144750416278839, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19367601722478867, + "step": 13646 + }, + { + "epoch": 0.27296, + "grad_norm": 2.03125, + "grad_norm_var": 0.0067291259765625, + "learning_rate": 0.0001, + "loss": 4.1301, + "loss/crossentropy": 1.9281827211380005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19894402474164963, + "step": 13648 + }, + { + "epoch": 0.273, + "grad_norm": 1.8984375, + "grad_norm_var": 0.007132720947265625, + "learning_rate": 0.0001, + "loss": 4.1998, + "loss/crossentropy": 1.8092535138130188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18536384403705597, + "step": 13650 + }, + { + "epoch": 0.27304, + "grad_norm": 2.046875, + "grad_norm_var": 0.007726796468098958, + "learning_rate": 0.0001, + "loss": 4.1697, + "loss/crossentropy": 2.5707184076309204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23067744076251984, + "step": 13652 + }, + { + "epoch": 0.27308, + "grad_norm": 2.03125, + "grad_norm_var": 0.008434804280598958, + "learning_rate": 0.0001, + "loss": 4.0189, + "loss/crossentropy": 2.108555316925049, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2052021473646164, + "step": 13654 + }, + { + "epoch": 0.27312, + "grad_norm": 1.984375, + "grad_norm_var": 0.007228342692057291, + "learning_rate": 0.0001, + "loss": 4.3412, + "loss/crossentropy": 1.836738109588623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20734501630067825, + "step": 13656 + }, + { + "epoch": 0.27316, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007990519205729166, + "learning_rate": 0.0001, + "loss": 4.0783, + "loss/crossentropy": 2.223781406879425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20618071407079697, + "step": 13658 + }, + { + "epoch": 0.2732, + "grad_norm": 2.0, + "grad_norm_var": 0.007207997639973958, + "learning_rate": 0.0001, + "loss": 4.1722, + "loss/crossentropy": 2.028991222381592, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20579702407121658, + "step": 13660 + }, + { + "epoch": 0.27324, + "grad_norm": 2.015625, + "grad_norm_var": 0.006915028889973958, + "learning_rate": 0.0001, + "loss": 4.0294, + "loss/crossentropy": 2.1003119349479675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21900994330644608, + "step": 13662 + }, + { + "epoch": 0.27328, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007289377848307291, + "learning_rate": 0.0001, + "loss": 4.1695, + "loss/crossentropy": 2.1813069581985474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2117675319314003, + "step": 13664 + }, + { + "epoch": 0.27332, + "grad_norm": 2.140625, + "grad_norm_var": 0.0105621337890625, + "learning_rate": 0.0001, + "loss": 3.8666, + "loss/crossentropy": 2.2203832864761353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20088640600442886, + "step": 13666 + }, + { + "epoch": 0.27336, + "grad_norm": 1.90625, + "grad_norm_var": 0.0103759765625, + "learning_rate": 0.0001, + "loss": 4.1084, + "loss/crossentropy": 1.8810867071151733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20646882802248, + "step": 13668 + }, + { + "epoch": 0.2734, + "grad_norm": 1.9609375, + "grad_norm_var": 0.010060373942057292, + "learning_rate": 0.0001, + "loss": 4.0746, + "loss/crossentropy": 2.2147200107574463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2038613259792328, + "step": 13670 + }, + { + "epoch": 0.27344, + "grad_norm": 2.078125, + "grad_norm_var": 0.008408355712890624, + "learning_rate": 0.0001, + "loss": 4.2151, + "loss/crossentropy": 2.3654199838638306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22163932025432587, + "step": 13672 + }, + { + "epoch": 0.27348, + "grad_norm": 1.9375, + "grad_norm_var": 0.0086669921875, + "learning_rate": 0.0001, + "loss": 4.1584, + "loss/crossentropy": 2.422420859336853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23172692954540253, + "step": 13674 + }, + { + "epoch": 0.27352, + "grad_norm": 2.015625, + "grad_norm_var": 0.008634440104166667, + "learning_rate": 0.0001, + "loss": 4.391, + "loss/crossentropy": 2.0776702165603638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2030162662267685, + "step": 13676 + }, + { + "epoch": 0.27356, + "grad_norm": 2.125, + "grad_norm_var": 0.009611002604166667, + "learning_rate": 0.0001, + "loss": 3.9108, + "loss/crossentropy": 1.7956212162971497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1932467818260193, + "step": 13678 + }, + { + "epoch": 0.2736, + "grad_norm": 2.046875, + "grad_norm_var": 0.010573069254557291, + "learning_rate": 0.0001, + "loss": 4.1768, + "loss/crossentropy": 2.176727533340454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20450271666049957, + "step": 13680 + }, + { + "epoch": 0.27364, + "grad_norm": 1.90625, + "grad_norm_var": 0.0062255859375, + "learning_rate": 0.0001, + "loss": 4.061, + "loss/crossentropy": 2.0534290075302124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21150045841932297, + "step": 13682 + }, + { + "epoch": 0.27368, + "grad_norm": 2.015625, + "grad_norm_var": 0.0061920166015625, + "learning_rate": 0.0001, + "loss": 4.1682, + "loss/crossentropy": 1.726151466369629, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19256682693958282, + "step": 13684 + }, + { + "epoch": 0.27372, + "grad_norm": 2.09375, + "grad_norm_var": 0.005956013997395833, + "learning_rate": 0.0001, + "loss": 4.2666, + "loss/crossentropy": 2.2024354934692383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2222209945321083, + "step": 13686 + }, + { + "epoch": 0.27376, + "grad_norm": 2.171875, + "grad_norm_var": 0.007347615559895834, + "learning_rate": 0.0001, + "loss": 4.3252, + "loss/crossentropy": 2.095518469810486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2092512547969818, + "step": 13688 + }, + { + "epoch": 0.2738, + "grad_norm": 2.078125, + "grad_norm_var": 0.0074460347493489586, + "learning_rate": 0.0001, + "loss": 4.0389, + "loss/crossentropy": 1.9978103637695312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20576970279216766, + "step": 13690 + }, + { + "epoch": 0.27384, + "grad_norm": 1.875, + "grad_norm_var": 0.008599599202473959, + "learning_rate": 0.0001, + "loss": 4.1023, + "loss/crossentropy": 2.233831286430359, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20514021068811417, + "step": 13692 + }, + { + "epoch": 0.27388, + "grad_norm": 1.921875, + "grad_norm_var": 0.010251617431640625, + "learning_rate": 0.0001, + "loss": 4.0479, + "loss/crossentropy": 1.6729156970977783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18054980039596558, + "step": 13694 + }, + { + "epoch": 0.27392, + "grad_norm": 2.140625, + "grad_norm_var": 0.010910797119140624, + "learning_rate": 0.0001, + "loss": 4.1911, + "loss/crossentropy": 2.0113388895988464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21645111590623856, + "step": 13696 + }, + { + "epoch": 0.27396, + "grad_norm": 2.140625, + "grad_norm_var": 0.010469563802083333, + "learning_rate": 0.0001, + "loss": 4.3487, + "loss/crossentropy": 1.9311461448669434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23587358742952347, + "step": 13698 + }, + { + "epoch": 0.274, + "grad_norm": 1.9453125, + "grad_norm_var": 0.01889012654622396, + "learning_rate": 0.0001, + "loss": 4.1099, + "loss/crossentropy": 2.1205984354019165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20603877305984497, + "step": 13700 + }, + { + "epoch": 0.27404, + "grad_norm": 1.9296875, + "grad_norm_var": 0.01889012654622396, + "learning_rate": 0.0001, + "loss": 4.2947, + "loss/crossentropy": 2.1700201630592346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21855031698942184, + "step": 13702 + }, + { + "epoch": 0.27408, + "grad_norm": 1.8828125, + "grad_norm_var": 0.022484334309895833, + "learning_rate": 0.0001, + "loss": 3.7717, + "loss/crossentropy": 1.5643411874771118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.161862351000309, + "step": 13704 + }, + { + "epoch": 0.27412, + "grad_norm": 1.8671875, + "grad_norm_var": 0.024881998697916668, + "learning_rate": 0.0001, + "loss": 3.7292, + "loss/crossentropy": 2.032666802406311, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20749235153198242, + "step": 13706 + }, + { + "epoch": 0.27416, + "grad_norm": 1.890625, + "grad_norm_var": 0.024738566080729166, + "learning_rate": 0.0001, + "loss": 4.0899, + "loss/crossentropy": 1.9313859939575195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19065556675195694, + "step": 13708 + }, + { + "epoch": 0.2742, + "grad_norm": 2.0, + "grad_norm_var": 0.02225519816080729, + "learning_rate": 0.0001, + "loss": 4.3295, + "loss/crossentropy": 2.1200226545333862, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1967960000038147, + "step": 13710 + }, + { + "epoch": 0.27424, + "grad_norm": 1.953125, + "grad_norm_var": 0.020765940348307293, + "learning_rate": 0.0001, + "loss": 4.0655, + "loss/crossentropy": 2.144526958465576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20019693672657013, + "step": 13712 + }, + { + "epoch": 0.27428, + "grad_norm": 1.9609375, + "grad_norm_var": 0.01809666951497396, + "learning_rate": 0.0001, + "loss": 4.2039, + "loss/crossentropy": 1.9802407622337341, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20123689621686935, + "step": 13714 + }, + { + "epoch": 0.27432, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0062334696451822914, + "learning_rate": 0.0001, + "loss": 3.9415, + "loss/crossentropy": 2.096527099609375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20282836258411407, + "step": 13716 + }, + { + "epoch": 0.27436, + "grad_norm": 2.046875, + "grad_norm_var": 0.005558013916015625, + "learning_rate": 0.0001, + "loss": 4.1829, + "loss/crossentropy": 2.1037757992744446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2039530873298645, + "step": 13718 + }, + { + "epoch": 0.2744, + "grad_norm": 1.9609375, + "grad_norm_var": 0.004752349853515625, + "learning_rate": 0.0001, + "loss": 3.8416, + "loss/crossentropy": 1.5415751934051514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18478797376155853, + "step": 13720 + }, + { + "epoch": 0.27444, + "grad_norm": 2.015625, + "grad_norm_var": 0.001859283447265625, + "learning_rate": 0.0001, + "loss": 4.071, + "loss/crossentropy": 2.0357913970947266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20176240801811218, + "step": 13722 + }, + { + "epoch": 0.27448, + "grad_norm": 1.8125, + "grad_norm_var": 0.0034543355305989582, + "learning_rate": 0.0001, + "loss": 3.8472, + "loss/crossentropy": 1.9013367891311646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20012690126895905, + "step": 13724 + }, + { + "epoch": 0.27452, + "grad_norm": 1.9921875, + "grad_norm_var": 0.00343017578125, + "learning_rate": 0.0001, + "loss": 3.8072, + "loss/crossentropy": 1.5798682570457458, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18169572949409485, + "step": 13726 + }, + { + "epoch": 0.27456, + "grad_norm": 1.984375, + "grad_norm_var": 0.0034624735514322915, + "learning_rate": 0.0001, + "loss": 4.163, + "loss/crossentropy": 2.196288228034973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22311393916606903, + "step": 13728 + }, + { + "epoch": 0.2746, + "grad_norm": 1.96875, + "grad_norm_var": 0.003928375244140625, + "learning_rate": 0.0001, + "loss": 4.0172, + "loss/crossentropy": 1.714774489402771, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17624646425247192, + "step": 13730 + }, + { + "epoch": 0.27464, + "grad_norm": 2.15625, + "grad_norm_var": 0.0053484598795572914, + "learning_rate": 0.0001, + "loss": 4.3264, + "loss/crossentropy": 1.9853646159172058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18827050924301147, + "step": 13732 + }, + { + "epoch": 0.27468, + "grad_norm": 1.7890625, + "grad_norm_var": 0.0074045817057291664, + "learning_rate": 0.0001, + "loss": 4.0074, + "loss/crossentropy": 1.7830212116241455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1801598072052002, + "step": 13734 + }, + { + "epoch": 0.27472, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008046213785807292, + "learning_rate": 0.0001, + "loss": 3.7923, + "loss/crossentropy": 1.7400763034820557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1930072009563446, + "step": 13736 + }, + { + "epoch": 0.27476, + "grad_norm": 2.046875, + "grad_norm_var": 0.008421834309895833, + "learning_rate": 0.0001, + "loss": 4.0992, + "loss/crossentropy": 2.2292455434799194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20573973655700684, + "step": 13738 + }, + { + "epoch": 0.2748, + "grad_norm": 2.09375, + "grad_norm_var": 0.009056599934895833, + "learning_rate": 0.0001, + "loss": 4.4305, + "loss/crossentropy": 2.1399097442626953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2265445813536644, + "step": 13740 + }, + { + "epoch": 0.27484, + "grad_norm": 1.953125, + "grad_norm_var": 0.009366861979166667, + "learning_rate": 0.0001, + "loss": 4.0596, + "loss/crossentropy": 2.049125075340271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20623435080051422, + "step": 13742 + }, + { + "epoch": 0.27488, + "grad_norm": 2.078125, + "grad_norm_var": 0.010689036051432291, + "learning_rate": 0.0001, + "loss": 4.2678, + "loss/crossentropy": 2.388680577278137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23650053143501282, + "step": 13744 + }, + { + "epoch": 0.27492, + "grad_norm": 2.390625, + "grad_norm_var": 0.019577789306640624, + "learning_rate": 0.0001, + "loss": 4.5091, + "loss/crossentropy": 2.05421245098114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19932237267494202, + "step": 13746 + }, + { + "epoch": 0.27496, + "grad_norm": 2.0, + "grad_norm_var": 0.01969172159830729, + "learning_rate": 0.0001, + "loss": 3.8274, + "loss/crossentropy": 1.892092227935791, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1907898187637329, + "step": 13748 + }, + { + "epoch": 0.275, + "grad_norm": 2.109375, + "grad_norm_var": 0.018507639567057293, + "learning_rate": 0.0001, + "loss": 4.2196, + "loss/crossentropy": 2.160037875175476, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24229361861944199, + "step": 13750 + }, + { + "epoch": 0.27504, + "grad_norm": 1.8984375, + "grad_norm_var": 0.017451985677083334, + "learning_rate": 0.0001, + "loss": 4.1278, + "loss/crossentropy": 2.5185710191726685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2394627332687378, + "step": 13752 + }, + { + "epoch": 0.27508, + "grad_norm": 1.953125, + "grad_norm_var": 0.01754735310872396, + "learning_rate": 0.0001, + "loss": 4.1551, + "loss/crossentropy": 2.0613635778427124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19758722186088562, + "step": 13754 + }, + { + "epoch": 0.27512, + "grad_norm": 2.046875, + "grad_norm_var": 0.019496409098307292, + "learning_rate": 0.0001, + "loss": 4.1905, + "loss/crossentropy": 1.9780999422073364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21773972362279892, + "step": 13756 + }, + { + "epoch": 0.27516, + "grad_norm": 2.1875, + "grad_norm_var": 0.01987482706705729, + "learning_rate": 0.0001, + "loss": 4.3297, + "loss/crossentropy": 2.1116772890090942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20809265226125717, + "step": 13758 + }, + { + "epoch": 0.2752, + "grad_norm": 2.109375, + "grad_norm_var": 0.021201324462890626, + "learning_rate": 0.0001, + "loss": 4.2079, + "loss/crossentropy": 2.1072784662246704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21189533174037933, + "step": 13760 + }, + { + "epoch": 0.27524, + "grad_norm": 1.953125, + "grad_norm_var": 0.014902496337890625, + "learning_rate": 0.0001, + "loss": 3.9102, + "loss/crossentropy": 2.261967897415161, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2058071494102478, + "step": 13762 + }, + { + "epoch": 0.27528, + "grad_norm": 2.28125, + "grad_norm_var": 0.01571044921875, + "learning_rate": 0.0001, + "loss": 4.0684, + "loss/crossentropy": 2.240622043609619, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2305934578180313, + "step": 13764 + }, + { + "epoch": 0.27532, + "grad_norm": 2.078125, + "grad_norm_var": 0.013874308268229166, + "learning_rate": 0.0001, + "loss": 3.8268, + "loss/crossentropy": 1.6203233003616333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17164954543113708, + "step": 13766 + }, + { + "epoch": 0.27536, + "grad_norm": 1.8984375, + "grad_norm_var": 0.013521067301432292, + "learning_rate": 0.0001, + "loss": 3.928, + "loss/crossentropy": 2.171602725982666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20083081722259521, + "step": 13768 + }, + { + "epoch": 0.2754, + "grad_norm": 2.046875, + "grad_norm_var": 0.012645467122395834, + "learning_rate": 0.0001, + "loss": 4.2738, + "loss/crossentropy": 2.2198195457458496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21113458275794983, + "step": 13770 + }, + { + "epoch": 0.27544, + "grad_norm": 1.9375, + "grad_norm_var": 0.012296549479166667, + "learning_rate": 0.0001, + "loss": 4.2833, + "loss/crossentropy": 2.1037912368774414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21217171102762222, + "step": 13772 + }, + { + "epoch": 0.27548, + "grad_norm": 1.9765625, + "grad_norm_var": 0.012015533447265626, + "learning_rate": 0.0001, + "loss": 3.9824, + "loss/crossentropy": 2.252090096473694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20365406572818756, + "step": 13774 + }, + { + "epoch": 0.27552, + "grad_norm": 1.8828125, + "grad_norm_var": 0.011701456705729167, + "learning_rate": 0.0001, + "loss": 3.9513, + "loss/crossentropy": 1.8044906258583069, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1854797750711441, + "step": 13776 + }, + { + "epoch": 0.27556, + "grad_norm": 2.09375, + "grad_norm_var": 0.012284342447916667, + "learning_rate": 0.0001, + "loss": 4.5229, + "loss/crossentropy": 2.2397952675819397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.214948832988739, + "step": 13778 + }, + { + "epoch": 0.2756, + "grad_norm": 2.34375, + "grad_norm_var": 0.0164215087890625, + "learning_rate": 0.0001, + "loss": 4.3569, + "loss/crossentropy": 1.776510238647461, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19995201379060745, + "step": 13780 + }, + { + "epoch": 0.27564, + "grad_norm": 4.0625, + "grad_norm_var": 0.26768290201822914, + "learning_rate": 0.0001, + "loss": 4.3061, + "loss/crossentropy": 2.470343589782715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2194378450512886, + "step": 13782 + }, + { + "epoch": 0.27568, + "grad_norm": 1.9921875, + "grad_norm_var": 0.26253433227539064, + "learning_rate": 0.0001, + "loss": 4.0708, + "loss/crossentropy": 2.1340490579605103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20280296355485916, + "step": 13784 + }, + { + "epoch": 0.27572, + "grad_norm": 2.03125, + "grad_norm_var": 0.26368179321289065, + "learning_rate": 0.0001, + "loss": 4.1072, + "loss/crossentropy": 1.6801128387451172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16750500351190567, + "step": 13786 + }, + { + "epoch": 0.27576, + "grad_norm": 1.9609375, + "grad_norm_var": 0.26387939453125, + "learning_rate": 0.0001, + "loss": 3.9902, + "loss/crossentropy": 1.9632562398910522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19115211814641953, + "step": 13788 + }, + { + "epoch": 0.2758, + "grad_norm": 2.125, + "grad_norm_var": 0.25916519165039065, + "learning_rate": 0.0001, + "loss": 4.2139, + "loss/crossentropy": 2.0817149877548218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21430402994155884, + "step": 13790 + }, + { + "epoch": 0.27584, + "grad_norm": 1.9296875, + "grad_norm_var": 0.253088124593099, + "learning_rate": 0.0001, + "loss": 4.3244, + "loss/crossentropy": 2.2736687660217285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22024155408143997, + "step": 13792 + }, + { + "epoch": 0.27588, + "grad_norm": 1.9921875, + "grad_norm_var": 0.25625712076822915, + "learning_rate": 0.0001, + "loss": 4.2548, + "loss/crossentropy": 2.2632944583892822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.210355743765831, + "step": 13794 + }, + { + "epoch": 0.27592, + "grad_norm": 1.953125, + "grad_norm_var": 0.26166966756184895, + "learning_rate": 0.0001, + "loss": 4.2799, + "loss/crossentropy": 2.5457879304885864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22530706226825714, + "step": 13796 + }, + { + "epoch": 0.27596, + "grad_norm": 1.859375, + "grad_norm_var": 0.008410390218098958, + "learning_rate": 0.0001, + "loss": 3.8969, + "loss/crossentropy": 2.0878910422325134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19901857525110245, + "step": 13798 + }, + { + "epoch": 0.276, + "grad_norm": 2.015625, + "grad_norm_var": 0.008348592122395833, + "learning_rate": 0.0001, + "loss": 4.3213, + "loss/crossentropy": 1.803489863872528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19378191232681274, + "step": 13800 + }, + { + "epoch": 0.27604, + "grad_norm": 2.0, + "grad_norm_var": 0.009596506754557291, + "learning_rate": 0.0001, + "loss": 4.1916, + "loss/crossentropy": 2.156951904296875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2091423124074936, + "step": 13802 + }, + { + "epoch": 0.27608, + "grad_norm": 1.890625, + "grad_norm_var": 0.010493977864583334, + "learning_rate": 0.0001, + "loss": 3.9346, + "loss/crossentropy": 1.7714723944664001, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20779500901699066, + "step": 13804 + }, + { + "epoch": 0.27612, + "grad_norm": 1.8984375, + "grad_norm_var": 0.009590403238932291, + "learning_rate": 0.0001, + "loss": 4.0499, + "loss/crossentropy": 1.8808923363685608, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19429123401641846, + "step": 13806 + }, + { + "epoch": 0.27616, + "grad_norm": 1.9375, + "grad_norm_var": 0.005516560872395834, + "learning_rate": 0.0001, + "loss": 3.706, + "loss/crossentropy": 1.6658846735954285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17177510261535645, + "step": 13808 + }, + { + "epoch": 0.2762, + "grad_norm": 1.8359375, + "grad_norm_var": 0.018697102864583332, + "learning_rate": 0.0001, + "loss": 4.0909, + "loss/crossentropy": 2.052769422531128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20889723300933838, + "step": 13810 + }, + { + "epoch": 0.27624, + "grad_norm": 2.015625, + "grad_norm_var": 0.18883031209309895, + "learning_rate": 0.0001, + "loss": 4.0225, + "loss/crossentropy": 1.5800148844718933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24654322117567062, + "step": 13812 + }, + { + "epoch": 0.27628, + "grad_norm": 2.140625, + "grad_norm_var": 0.5619504292805989, + "learning_rate": 0.0001, + "loss": 4.1059, + "loss/crossentropy": 1.836085557937622, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19563084840774536, + "step": 13814 + }, + { + "epoch": 0.27632, + "grad_norm": 2.046875, + "grad_norm_var": 0.5601722717285156, + "learning_rate": 0.0001, + "loss": 4.0077, + "loss/crossentropy": 1.627321183681488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17698387801647186, + "step": 13816 + }, + { + "epoch": 0.27636, + "grad_norm": 2.046875, + "grad_norm_var": 0.55164794921875, + "learning_rate": 0.0001, + "loss": 4.2038, + "loss/crossentropy": 2.0546024441719055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19832595437765121, + "step": 13818 + }, + { + "epoch": 0.2764, + "grad_norm": 1.96875, + "grad_norm_var": 0.5499501546223958, + "learning_rate": 0.0001, + "loss": 4.064, + "loss/crossentropy": 2.112219452857971, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19605965912342072, + "step": 13820 + }, + { + "epoch": 0.27644, + "grad_norm": 2.078125, + "grad_norm_var": 0.5405006408691406, + "learning_rate": 0.0001, + "loss": 4.3825, + "loss/crossentropy": 1.9582284688949585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2015855312347412, + "step": 13822 + }, + { + "epoch": 0.27648, + "grad_norm": 1.953125, + "grad_norm_var": 0.5307573954264323, + "learning_rate": 0.0001, + "loss": 4.2699, + "loss/crossentropy": 2.54653799533844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23546704649925232, + "step": 13824 + }, + { + "epoch": 0.27652, + "grad_norm": 1.953125, + "grad_norm_var": 0.5336090087890625, + "learning_rate": 0.0001, + "loss": 4.0729, + "loss/crossentropy": 1.630593478679657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16478916257619858, + "step": 13826 + }, + { + "epoch": 0.27656, + "grad_norm": 2.046875, + "grad_norm_var": 0.40966796875, + "learning_rate": 0.0001, + "loss": 4.0123, + "loss/crossentropy": 1.7995671033859253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19886638224124908, + "step": 13828 + }, + { + "epoch": 0.2766, + "grad_norm": 2.21875, + "grad_norm_var": 0.0177886962890625, + "learning_rate": 0.0001, + "loss": 4.4547, + "loss/crossentropy": 2.5423338413238525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28057335317134857, + "step": 13830 + }, + { + "epoch": 0.27664, + "grad_norm": 2.078125, + "grad_norm_var": 0.018631744384765624, + "learning_rate": 0.0001, + "loss": 4.0213, + "loss/crossentropy": 1.8524783849716187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20141787081956863, + "step": 13832 + }, + { + "epoch": 0.27668, + "grad_norm": 1.9453125, + "grad_norm_var": 0.019440714518229166, + "learning_rate": 0.0001, + "loss": 4.2155, + "loss/crossentropy": 2.242451548576355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21003998070955276, + "step": 13834 + }, + { + "epoch": 0.27672, + "grad_norm": 2.046875, + "grad_norm_var": 0.0192047119140625, + "learning_rate": 0.0001, + "loss": 4.2665, + "loss/crossentropy": 2.2434345483779907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21679577976465225, + "step": 13836 + }, + { + "epoch": 0.27676, + "grad_norm": 2.0, + "grad_norm_var": 0.019551595052083332, + "learning_rate": 0.0001, + "loss": 4.014, + "loss/crossentropy": 1.6854392290115356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19326400756835938, + "step": 13838 + }, + { + "epoch": 0.2768, + "grad_norm": 2.171875, + "grad_norm_var": 0.026668294270833334, + "learning_rate": 0.0001, + "loss": 4.2837, + "loss/crossentropy": 2.0702155232429504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2043909877538681, + "step": 13840 + }, + { + "epoch": 0.27684, + "grad_norm": 2.015625, + "grad_norm_var": 0.024738566080729166, + "learning_rate": 0.0001, + "loss": 4.0732, + "loss/crossentropy": 1.964758813381195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19540227204561234, + "step": 13842 + }, + { + "epoch": 0.27688, + "grad_norm": 2.046875, + "grad_norm_var": 0.0151611328125, + "learning_rate": 0.0001, + "loss": 4.1681, + "loss/crossentropy": 2.0383411645889282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21456278860569, + "step": 13844 + }, + { + "epoch": 0.27692, + "grad_norm": 1.8984375, + "grad_norm_var": 0.015461222330729166, + "learning_rate": 0.0001, + "loss": 4.1407, + "loss/crossentropy": 2.020030975341797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20521440356969833, + "step": 13846 + }, + { + "epoch": 0.27696, + "grad_norm": 1.890625, + "grad_norm_var": 0.016110992431640624, + "learning_rate": 0.0001, + "loss": 4.1142, + "loss/crossentropy": 1.7514970302581787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19116375595331192, + "step": 13848 + }, + { + "epoch": 0.277, + "grad_norm": 1.9296875, + "grad_norm_var": 0.016437784830729166, + "learning_rate": 0.0001, + "loss": 3.9209, + "loss/crossentropy": 1.8194025754928589, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18198709934949875, + "step": 13850 + }, + { + "epoch": 0.27704, + "grad_norm": 1.9375, + "grad_norm_var": 0.017223866780598958, + "learning_rate": 0.0001, + "loss": 3.7911, + "loss/crossentropy": 1.878225862979889, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18159592151641846, + "step": 13852 + }, + { + "epoch": 0.27708, + "grad_norm": 2.0, + "grad_norm_var": 0.0178619384765625, + "learning_rate": 0.0001, + "loss": 4.008, + "loss/crossentropy": 1.8619664311408997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18098927289247513, + "step": 13854 + }, + { + "epoch": 0.27712, + "grad_norm": 2.046875, + "grad_norm_var": 0.003474934895833333, + "learning_rate": 0.0001, + "loss": 4.2336, + "loss/crossentropy": 2.0475016832351685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19999271631240845, + "step": 13856 + }, + { + "epoch": 0.27716, + "grad_norm": 1.9453125, + "grad_norm_var": 0.004443105061848958, + "learning_rate": 0.0001, + "loss": 4.2412, + "loss/crossentropy": 2.2279679775238037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21767108887434006, + "step": 13858 + }, + { + "epoch": 0.2772, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0039670308430989586, + "learning_rate": 0.0001, + "loss": 3.8087, + "loss/crossentropy": 1.801176130771637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18873406946659088, + "step": 13860 + }, + { + "epoch": 0.27724, + "grad_norm": 1.8046875, + "grad_norm_var": 0.005303700764973958, + "learning_rate": 0.0001, + "loss": 3.7958, + "loss/crossentropy": 1.948195457458496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19519966840744019, + "step": 13862 + }, + { + "epoch": 0.27728, + "grad_norm": 1.90625, + "grad_norm_var": 0.0050432840983072914, + "learning_rate": 0.0001, + "loss": 4.0754, + "loss/crossentropy": 2.1044042110443115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20454693585634232, + "step": 13864 + }, + { + "epoch": 0.27732, + "grad_norm": 2.015625, + "grad_norm_var": 0.0051910400390625, + "learning_rate": 0.0001, + "loss": 4.1338, + "loss/crossentropy": 2.383382737636566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2179596871137619, + "step": 13866 + }, + { + "epoch": 0.27736, + "grad_norm": 1.9375, + "grad_norm_var": 0.004621378580729167, + "learning_rate": 0.0001, + "loss": 3.9475, + "loss/crossentropy": 1.7798078656196594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17871373891830444, + "step": 13868 + }, + { + "epoch": 0.2774, + "grad_norm": 1.984375, + "grad_norm_var": 0.004756418863932291, + "learning_rate": 0.0001, + "loss": 3.9937, + "loss/crossentropy": 2.1839531660079956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20030800253152847, + "step": 13870 + }, + { + "epoch": 0.27744, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005028279622395834, + "learning_rate": 0.0001, + "loss": 4.1637, + "loss/crossentropy": 1.990351676940918, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1951240971684456, + "step": 13872 + }, + { + "epoch": 0.27748, + "grad_norm": 2.140625, + "grad_norm_var": 0.006573232014973959, + "learning_rate": 0.0001, + "loss": 4.2919, + "loss/crossentropy": 2.191170334815979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.230004720389843, + "step": 13874 + }, + { + "epoch": 0.27752, + "grad_norm": 1.953125, + "grad_norm_var": 0.015726470947265626, + "learning_rate": 0.0001, + "loss": 4.1432, + "loss/crossentropy": 1.8893607258796692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2138589546084404, + "step": 13876 + }, + { + "epoch": 0.27756, + "grad_norm": 1.953125, + "grad_norm_var": 0.012483723958333333, + "learning_rate": 0.0001, + "loss": 4.2749, + "loss/crossentropy": 2.059523820877075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20147833973169327, + "step": 13878 + }, + { + "epoch": 0.2776, + "grad_norm": 2.125, + "grad_norm_var": 0.013206990559895833, + "learning_rate": 0.0001, + "loss": 4.4959, + "loss/crossentropy": 2.0867209434509277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23967817425727844, + "step": 13880 + }, + { + "epoch": 0.27764, + "grad_norm": 2.15625, + "grad_norm_var": 0.028393300374348958, + "learning_rate": 0.0001, + "loss": 4.5704, + "loss/crossentropy": 2.3017314672470093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23674342036247253, + "step": 13882 + }, + { + "epoch": 0.27768, + "grad_norm": 2.046875, + "grad_norm_var": 0.0261138916015625, + "learning_rate": 0.0001, + "loss": 4.3316, + "loss/crossentropy": 2.241922974586487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21731412410736084, + "step": 13884 + }, + { + "epoch": 0.27772, + "grad_norm": 2.03125, + "grad_norm_var": 0.025724283854166665, + "learning_rate": 0.0001, + "loss": 4.3317, + "loss/crossentropy": 2.3800796270370483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21916473656892776, + "step": 13886 + }, + { + "epoch": 0.27776, + "grad_norm": 2.09375, + "grad_norm_var": 0.02467625935872396, + "learning_rate": 0.0001, + "loss": 4.2972, + "loss/crossentropy": 1.966018259525299, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21862812340259552, + "step": 13888 + }, + { + "epoch": 0.2778, + "grad_norm": 1.828125, + "grad_norm_var": 0.030104319254557293, + "learning_rate": 0.0001, + "loss": 4.1955, + "loss/crossentropy": 1.920684039592743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2026856392621994, + "step": 13890 + }, + { + "epoch": 0.27784, + "grad_norm": 2.0625, + "grad_norm_var": 0.02601318359375, + "learning_rate": 0.0001, + "loss": 3.9662, + "loss/crossentropy": 2.1939095854759216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20429420471191406, + "step": 13892 + }, + { + "epoch": 0.27788, + "grad_norm": 1.9765625, + "grad_norm_var": 0.026292928059895835, + "learning_rate": 0.0001, + "loss": 4.2335, + "loss/crossentropy": 2.1600992679595947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20445890724658966, + "step": 13894 + }, + { + "epoch": 0.27792, + "grad_norm": 1.9453125, + "grad_norm_var": 0.026805623372395834, + "learning_rate": 0.0001, + "loss": 3.9973, + "loss/crossentropy": 1.9898765683174133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19056615233421326, + "step": 13896 + }, + { + "epoch": 0.27796, + "grad_norm": 1.859375, + "grad_norm_var": 0.008861287434895834, + "learning_rate": 0.0001, + "loss": 4.3037, + "loss/crossentropy": 2.0497539043426514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1878463476896286, + "step": 13898 + }, + { + "epoch": 0.278, + "grad_norm": 1.9375, + "grad_norm_var": 0.007373046875, + "learning_rate": 0.0001, + "loss": 4.2299, + "loss/crossentropy": 1.894934356212616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.183290496468544, + "step": 13900 + }, + { + "epoch": 0.27804, + "grad_norm": 1.9765625, + "grad_norm_var": 0.010094960530598959, + "learning_rate": 0.0001, + "loss": 3.858, + "loss/crossentropy": 1.808231770992279, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19447006285190582, + "step": 13902 + }, + { + "epoch": 0.27808, + "grad_norm": 2.03125, + "grad_norm_var": 0.009688059488932291, + "learning_rate": 0.0001, + "loss": 4.0553, + "loss/crossentropy": 1.9384279251098633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20293861627578735, + "step": 13904 + }, + { + "epoch": 0.27812, + "grad_norm": 2.28125, + "grad_norm_var": 0.012835439046223958, + "learning_rate": 0.0001, + "loss": 4.3092, + "loss/crossentropy": 2.4291017055511475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24332769960165024, + "step": 13906 + }, + { + "epoch": 0.27816, + "grad_norm": 2.03125, + "grad_norm_var": 0.012669881184895834, + "learning_rate": 0.0001, + "loss": 4.331, + "loss/crossentropy": 2.3018234968185425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21657106280326843, + "step": 13908 + }, + { + "epoch": 0.2782, + "grad_norm": 2.390625, + "grad_norm_var": 0.0204742431640625, + "learning_rate": 0.0001, + "loss": 4.1756, + "loss/crossentropy": 2.211503267288208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19193635880947113, + "step": 13910 + }, + { + "epoch": 0.27824, + "grad_norm": 2.171875, + "grad_norm_var": 0.02191162109375, + "learning_rate": 0.0001, + "loss": 4.372, + "loss/crossentropy": 2.17062246799469, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21989908069372177, + "step": 13912 + }, + { + "epoch": 0.27828, + "grad_norm": 2.015625, + "grad_norm_var": 0.018839518229166668, + "learning_rate": 0.0001, + "loss": 4.0895, + "loss/crossentropy": 2.1446024775505066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2036902904510498, + "step": 13914 + }, + { + "epoch": 0.27832, + "grad_norm": 2.015625, + "grad_norm_var": 0.0177642822265625, + "learning_rate": 0.0001, + "loss": 4.0729, + "loss/crossentropy": 1.8176022171974182, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19358078390359879, + "step": 13916 + }, + { + "epoch": 0.27836, + "grad_norm": 2.09375, + "grad_norm_var": 0.01946996053059896, + "learning_rate": 0.0001, + "loss": 4.2376, + "loss/crossentropy": 2.341936469078064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22822833061218262, + "step": 13918 + }, + { + "epoch": 0.2784, + "grad_norm": 2.046875, + "grad_norm_var": 0.01697565714518229, + "learning_rate": 0.0001, + "loss": 4.0943, + "loss/crossentropy": 1.9315840601921082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17218464612960815, + "step": 13920 + }, + { + "epoch": 0.27844, + "grad_norm": 1.890625, + "grad_norm_var": 0.01615168253580729, + "learning_rate": 0.0001, + "loss": 4.1507, + "loss/crossentropy": 1.9442673921585083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19567463546991348, + "step": 13922 + }, + { + "epoch": 0.27848, + "grad_norm": 1.9921875, + "grad_norm_var": 0.016673787434895834, + "learning_rate": 0.0001, + "loss": 4.1611, + "loss/crossentropy": 2.3023592233657837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21824805438518524, + "step": 13924 + }, + { + "epoch": 0.27852, + "grad_norm": 1.8359375, + "grad_norm_var": 0.01170654296875, + "learning_rate": 0.0001, + "loss": 3.7557, + "loss/crossentropy": 2.116863250732422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2087356001138687, + "step": 13926 + }, + { + "epoch": 0.27856, + "grad_norm": 1.953125, + "grad_norm_var": 0.010936482747395834, + "learning_rate": 0.0001, + "loss": 4.3746, + "loss/crossentropy": 2.0485963821411133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18910974264144897, + "step": 13928 + }, + { + "epoch": 0.2786, + "grad_norm": 2.0, + "grad_norm_var": 0.024857584635416666, + "learning_rate": 0.0001, + "loss": 4.1309, + "loss/crossentropy": 2.031753659248352, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23851437866687775, + "step": 13930 + }, + { + "epoch": 0.27864, + "grad_norm": 1.953125, + "grad_norm_var": 0.025585683186848958, + "learning_rate": 0.0001, + "loss": 4.0688, + "loss/crossentropy": 1.8019705414772034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18659329414367676, + "step": 13932 + }, + { + "epoch": 0.27868, + "grad_norm": 2.109375, + "grad_norm_var": 0.025243123372395832, + "learning_rate": 0.0001, + "loss": 3.9609, + "loss/crossentropy": 1.9191861152648926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19393808394670486, + "step": 13934 + }, + { + "epoch": 0.27872, + "grad_norm": 1.984375, + "grad_norm_var": 0.025763956705729167, + "learning_rate": 0.0001, + "loss": 4.1765, + "loss/crossentropy": 2.242986798286438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.213445246219635, + "step": 13936 + }, + { + "epoch": 0.27876, + "grad_norm": 2.09375, + "grad_norm_var": 0.0300689697265625, + "learning_rate": 0.0001, + "loss": 4.1767, + "loss/crossentropy": 2.1652570962905884, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2070690169930458, + "step": 13938 + }, + { + "epoch": 0.2788, + "grad_norm": 1.9140625, + "grad_norm_var": 0.031583404541015624, + "learning_rate": 0.0001, + "loss": 3.9643, + "loss/crossentropy": 2.0970187187194824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20118873566389084, + "step": 13940 + }, + { + "epoch": 0.27884, + "grad_norm": 2.078125, + "grad_norm_var": 0.02823053995768229, + "learning_rate": 0.0001, + "loss": 4.1116, + "loss/crossentropy": 1.9957542419433594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2377041131258011, + "step": 13942 + }, + { + "epoch": 0.27888, + "grad_norm": 2.0, + "grad_norm_var": 0.02522761027018229, + "learning_rate": 0.0001, + "loss": 4.0718, + "loss/crossentropy": 1.8680259585380554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1919926255941391, + "step": 13944 + }, + { + "epoch": 0.27892, + "grad_norm": 2.0625, + "grad_norm_var": 0.015516916910807291, + "learning_rate": 0.0001, + "loss": 4.301, + "loss/crossentropy": 2.369240164756775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24805612862110138, + "step": 13946 + }, + { + "epoch": 0.27896, + "grad_norm": 2.0625, + "grad_norm_var": 0.014530436197916666, + "learning_rate": 0.0001, + "loss": 4.3082, + "loss/crossentropy": 2.0923795104026794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20823675394058228, + "step": 13948 + }, + { + "epoch": 0.279, + "grad_norm": 2.1875, + "grad_norm_var": 0.013197580973307291, + "learning_rate": 0.0001, + "loss": 4.3049, + "loss/crossentropy": 2.1777398586273193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21319355070590973, + "step": 13950 + }, + { + "epoch": 0.27904, + "grad_norm": 1.890625, + "grad_norm_var": 0.014869944254557291, + "learning_rate": 0.0001, + "loss": 4.0918, + "loss/crossentropy": 2.2155595421791077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2029823139309883, + "step": 13952 + }, + { + "epoch": 0.27908, + "grad_norm": 1.96875, + "grad_norm_var": 0.011435699462890626, + "learning_rate": 0.0001, + "loss": 3.7195, + "loss/crossentropy": 1.5916873812675476, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18207873404026031, + "step": 13954 + }, + { + "epoch": 0.27912, + "grad_norm": 2.09375, + "grad_norm_var": 0.009791819254557292, + "learning_rate": 0.0001, + "loss": 4.3336, + "loss/crossentropy": 2.165639281272888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22917281091213226, + "step": 13956 + }, + { + "epoch": 0.27916, + "grad_norm": 1.8359375, + "grad_norm_var": 0.012851715087890625, + "learning_rate": 0.0001, + "loss": 3.9586, + "loss/crossentropy": 2.0245776772499084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19862860441207886, + "step": 13958 + }, + { + "epoch": 0.2792, + "grad_norm": 2.0625, + "grad_norm_var": 0.013646443684895834, + "learning_rate": 0.0001, + "loss": 4.1593, + "loss/crossentropy": 2.0179941654205322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20749768614768982, + "step": 13960 + }, + { + "epoch": 0.27924, + "grad_norm": 2.140625, + "grad_norm_var": 0.011966705322265625, + "learning_rate": 0.0001, + "loss": 4.3978, + "loss/crossentropy": 1.998136818408966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1992153376340866, + "step": 13962 + }, + { + "epoch": 0.27928, + "grad_norm": 1.890625, + "grad_norm_var": 0.015346018473307292, + "learning_rate": 0.0001, + "loss": 4.1801, + "loss/crossentropy": 2.041996657848358, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22224828600883484, + "step": 13964 + }, + { + "epoch": 0.27932, + "grad_norm": 2.140625, + "grad_norm_var": 0.019406890869140624, + "learning_rate": 0.0001, + "loss": 4.4133, + "loss/crossentropy": 2.0259117484092712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2001984491944313, + "step": 13966 + }, + { + "epoch": 0.27936, + "grad_norm": 2.015625, + "grad_norm_var": 0.018436686197916666, + "learning_rate": 0.0001, + "loss": 4.0439, + "loss/crossentropy": 1.8219285607337952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19637393951416016, + "step": 13968 + }, + { + "epoch": 0.2794, + "grad_norm": 2.046875, + "grad_norm_var": 0.017073567708333334, + "learning_rate": 0.0001, + "loss": 4.0294, + "loss/crossentropy": 2.1451289653778076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21846695989370346, + "step": 13970 + }, + { + "epoch": 0.27944, + "grad_norm": 2.125, + "grad_norm_var": 0.016877237955729166, + "learning_rate": 0.0001, + "loss": 4.349, + "loss/crossentropy": 2.0527132749557495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2147705778479576, + "step": 13972 + }, + { + "epoch": 0.27948, + "grad_norm": 1.9921875, + "grad_norm_var": 0.015143839518229167, + "learning_rate": 0.0001, + "loss": 4.0188, + "loss/crossentropy": 1.9238123893737793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21083690226078033, + "step": 13974 + }, + { + "epoch": 0.27952, + "grad_norm": 1.953125, + "grad_norm_var": 0.015057118733723958, + "learning_rate": 0.0001, + "loss": 4.0072, + "loss/crossentropy": 1.894473671913147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20077680051326752, + "step": 13976 + }, + { + "epoch": 0.27956, + "grad_norm": 2.03125, + "grad_norm_var": 0.013871256510416667, + "learning_rate": 0.0001, + "loss": 4.2344, + "loss/crossentropy": 2.139513611793518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2138260081410408, + "step": 13978 + }, + { + "epoch": 0.2796, + "grad_norm": 2.171875, + "grad_norm_var": 0.013468424479166666, + "learning_rate": 0.0001, + "loss": 4.4519, + "loss/crossentropy": 2.4868407249450684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23635651916265488, + "step": 13980 + }, + { + "epoch": 0.27964, + "grad_norm": 1.9921875, + "grad_norm_var": 0.007940419514973958, + "learning_rate": 0.0001, + "loss": 3.7428, + "loss/crossentropy": 1.426945686340332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16342981159687042, + "step": 13982 + }, + { + "epoch": 0.27968, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007885487874348958, + "learning_rate": 0.0001, + "loss": 4.0457, + "loss/crossentropy": 2.0577683448791504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20394782721996307, + "step": 13984 + }, + { + "epoch": 0.27972, + "grad_norm": 2.0625, + "grad_norm_var": 0.008063761393229167, + "learning_rate": 0.0001, + "loss": 4.0045, + "loss/crossentropy": 1.884658396244049, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18946699798107147, + "step": 13986 + }, + { + "epoch": 0.27976, + "grad_norm": 1.984375, + "grad_norm_var": 0.0070383707682291664, + "learning_rate": 0.0001, + "loss": 3.9423, + "loss/crossentropy": 1.7344964742660522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1923774629831314, + "step": 13988 + }, + { + "epoch": 0.2798, + "grad_norm": 1.84375, + "grad_norm_var": 0.00888671875, + "learning_rate": 0.0001, + "loss": 3.9541, + "loss/crossentropy": 1.9660141468048096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18560642004013062, + "step": 13990 + }, + { + "epoch": 0.27984, + "grad_norm": 2.046875, + "grad_norm_var": 0.008998362223307292, + "learning_rate": 0.0001, + "loss": 3.9308, + "loss/crossentropy": 1.7832527160644531, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19922740012407303, + "step": 13992 + }, + { + "epoch": 0.27988, + "grad_norm": 2.078125, + "grad_norm_var": 0.009987131754557291, + "learning_rate": 0.0001, + "loss": 4.3919, + "loss/crossentropy": 1.9763087630271912, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2111397087574005, + "step": 13994 + }, + { + "epoch": 0.27992, + "grad_norm": 2.0625, + "grad_norm_var": 0.005252838134765625, + "learning_rate": 0.0001, + "loss": 4.1331, + "loss/crossentropy": 2.262092709541321, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21212221682071686, + "step": 13996 + }, + { + "epoch": 0.27996, + "grad_norm": 2.015625, + "grad_norm_var": 0.0054595947265625, + "learning_rate": 0.0001, + "loss": 4.1946, + "loss/crossentropy": 2.237086296081543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20895393192768097, + "step": 13998 + }, + { + "epoch": 0.28, + "grad_norm": 1.890625, + "grad_norm_var": 0.0061337788899739586, + "learning_rate": 0.0001, + "loss": 4.063, + "loss/crossentropy": 1.9579973816871643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18024132400751114, + "step": 14000 + }, + { + "epoch": 0.28004, + "grad_norm": 2.015625, + "grad_norm_var": 0.0055844624837239586, + "learning_rate": 0.0001, + "loss": 4.1463, + "loss/crossentropy": 2.2264128923416138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2181634083390236, + "step": 14002 + }, + { + "epoch": 0.28008, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0058013916015625, + "learning_rate": 0.0001, + "loss": 4.0081, + "loss/crossentropy": 1.836763322353363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1983085498213768, + "step": 14004 + }, + { + "epoch": 0.28012, + "grad_norm": 1.8515625, + "grad_norm_var": 0.005448404947916667, + "learning_rate": 0.0001, + "loss": 4.1192, + "loss/crossentropy": 2.0163257718086243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.214015431702137, + "step": 14006 + }, + { + "epoch": 0.28016, + "grad_norm": 2.140625, + "grad_norm_var": 0.0070383707682291664, + "learning_rate": 0.0001, + "loss": 4.139, + "loss/crossentropy": 2.31567645072937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22832776606082916, + "step": 14008 + }, + { + "epoch": 0.2802, + "grad_norm": 1.9140625, + "grad_norm_var": 0.006685129801432292, + "learning_rate": 0.0001, + "loss": 4.0753, + "loss/crossentropy": 2.3515210151672363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21946918964385986, + "step": 14010 + }, + { + "epoch": 0.28024, + "grad_norm": 2.0, + "grad_norm_var": 0.006845855712890625, + "learning_rate": 0.0001, + "loss": 3.9761, + "loss/crossentropy": 2.0987448692321777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2180740088224411, + "step": 14012 + }, + { + "epoch": 0.28028, + "grad_norm": 2.015625, + "grad_norm_var": 0.00662841796875, + "learning_rate": 0.0001, + "loss": 4.0909, + "loss/crossentropy": 2.2203436493873596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2157250940799713, + "step": 14014 + }, + { + "epoch": 0.28032, + "grad_norm": 1.953125, + "grad_norm_var": 0.006241861979166667, + "learning_rate": 0.0001, + "loss": 4.3152, + "loss/crossentropy": 2.083233594894409, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2187238186597824, + "step": 14016 + }, + { + "epoch": 0.28036, + "grad_norm": 2.015625, + "grad_norm_var": 0.0061075846354166664, + "learning_rate": 0.0001, + "loss": 4.1311, + "loss/crossentropy": 2.1261327266693115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2063112109899521, + "step": 14018 + }, + { + "epoch": 0.2804, + "grad_norm": 2.109375, + "grad_norm_var": 0.006648508707682291, + "learning_rate": 0.0001, + "loss": 4.0901, + "loss/crossentropy": 2.012390434741974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20225800573825836, + "step": 14020 + }, + { + "epoch": 0.28044, + "grad_norm": 2.046875, + "grad_norm_var": 0.0047271728515625, + "learning_rate": 0.0001, + "loss": 3.9985, + "loss/crossentropy": 1.8605966567993164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2003621682524681, + "step": 14022 + }, + { + "epoch": 0.28048, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0028157552083333333, + "learning_rate": 0.0001, + "loss": 3.9774, + "loss/crossentropy": 1.9208670258522034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20056191086769104, + "step": 14024 + }, + { + "epoch": 0.28052, + "grad_norm": 2.046875, + "grad_norm_var": 0.003951009114583333, + "learning_rate": 0.0001, + "loss": 3.7462, + "loss/crossentropy": 1.924518644809723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19410791993141174, + "step": 14026 + }, + { + "epoch": 0.28056, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0035113016764322918, + "learning_rate": 0.0001, + "loss": 4.0319, + "loss/crossentropy": 2.103038251399994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2054024636745453, + "step": 14028 + }, + { + "epoch": 0.2806, + "grad_norm": 1.8984375, + "grad_norm_var": 0.004303995768229167, + "learning_rate": 0.0001, + "loss": 4.0526, + "loss/crossentropy": 1.8974853157997131, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18592579662799835, + "step": 14030 + }, + { + "epoch": 0.28064, + "grad_norm": 2.03125, + "grad_norm_var": 0.004447428385416666, + "learning_rate": 0.0001, + "loss": 4.1735, + "loss/crossentropy": 2.0207908749580383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.216909758746624, + "step": 14032 + }, + { + "epoch": 0.28068, + "grad_norm": 2.171875, + "grad_norm_var": 0.006119537353515625, + "learning_rate": 0.0001, + "loss": 4.5759, + "loss/crossentropy": 2.0010873079299927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20815415680408478, + "step": 14034 + }, + { + "epoch": 0.28072, + "grad_norm": 1.9609375, + "grad_norm_var": 0.006961822509765625, + "learning_rate": 0.0001, + "loss": 3.9162, + "loss/crossentropy": 2.0840484499931335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21481210738420486, + "step": 14036 + }, + { + "epoch": 0.28076, + "grad_norm": 2.03125, + "grad_norm_var": 0.006880442301432292, + "learning_rate": 0.0001, + "loss": 4.1298, + "loss/crossentropy": 2.0026179552078247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1904347836971283, + "step": 14038 + }, + { + "epoch": 0.2808, + "grad_norm": 1.9921875, + "grad_norm_var": 0.006880442301432292, + "learning_rate": 0.0001, + "loss": 4.0618, + "loss/crossentropy": 2.1660486459732056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21236423403024673, + "step": 14040 + }, + { + "epoch": 0.28084, + "grad_norm": 2.15625, + "grad_norm_var": 0.007669830322265625, + "learning_rate": 0.0001, + "loss": 4.0474, + "loss/crossentropy": 2.312765598297119, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20535308122634888, + "step": 14042 + }, + { + "epoch": 0.28088, + "grad_norm": 1.984375, + "grad_norm_var": 0.0077789306640625, + "learning_rate": 0.0001, + "loss": 4.1959, + "loss/crossentropy": 1.8449203372001648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1986582651734352, + "step": 14044 + }, + { + "epoch": 0.28092, + "grad_norm": 1.8359375, + "grad_norm_var": 0.008894602457682291, + "learning_rate": 0.0001, + "loss": 4.016, + "loss/crossentropy": 2.0159433484077454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19724585115909576, + "step": 14046 + }, + { + "epoch": 0.28096, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0086090087890625, + "learning_rate": 0.0001, + "loss": 4.0691, + "loss/crossentropy": 2.138873815536499, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21438196301460266, + "step": 14048 + }, + { + "epoch": 0.281, + "grad_norm": 2.25, + "grad_norm_var": 0.011372629801432292, + "learning_rate": 0.0001, + "loss": 4.0985, + "loss/crossentropy": 1.6896708607673645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19260118901729584, + "step": 14050 + }, + { + "epoch": 0.28104, + "grad_norm": 1.828125, + "grad_norm_var": 0.014656321207682291, + "learning_rate": 0.0001, + "loss": 3.8383, + "loss/crossentropy": 1.8495243191719055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18072611093521118, + "step": 14052 + }, + { + "epoch": 0.28108, + "grad_norm": 1.9765625, + "grad_norm_var": 0.017354329427083332, + "learning_rate": 0.0001, + "loss": 4.3581, + "loss/crossentropy": 2.193492293357849, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2247578352689743, + "step": 14054 + }, + { + "epoch": 0.28112, + "grad_norm": 1.96875, + "grad_norm_var": 0.01761652628580729, + "learning_rate": 0.0001, + "loss": 4.1252, + "loss/crossentropy": 2.154744803905487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20149191468954086, + "step": 14056 + }, + { + "epoch": 0.28116, + "grad_norm": 1.9921875, + "grad_norm_var": 0.016022745768229166, + "learning_rate": 0.0001, + "loss": 4.0832, + "loss/crossentropy": 2.137023687362671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20409516990184784, + "step": 14058 + }, + { + "epoch": 0.2812, + "grad_norm": 2.96875, + "grad_norm_var": 0.0767242431640625, + "learning_rate": 0.0001, + "loss": 4.6582, + "loss/crossentropy": 2.4530670642852783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24031317979097366, + "step": 14060 + }, + { + "epoch": 0.28124, + "grad_norm": 2.265625, + "grad_norm_var": 0.07713394165039063, + "learning_rate": 0.0001, + "loss": 4.2973, + "loss/crossentropy": 2.3285664319992065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.223993182182312, + "step": 14062 + }, + { + "epoch": 0.28128, + "grad_norm": 1.90625, + "grad_norm_var": 0.0784088134765625, + "learning_rate": 0.0001, + "loss": 4.0899, + "loss/crossentropy": 2.126828193664551, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21428050100803375, + "step": 14064 + }, + { + "epoch": 0.28132, + "grad_norm": 1.796875, + "grad_norm_var": 0.08012288411458333, + "learning_rate": 0.0001, + "loss": 3.8083, + "loss/crossentropy": 2.0194268226623535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20124298334121704, + "step": 14066 + }, + { + "epoch": 0.28136, + "grad_norm": 2.125, + "grad_norm_var": 0.07445882161458334, + "learning_rate": 0.0001, + "loss": 4.0002, + "loss/crossentropy": 1.796358585357666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1902359575033188, + "step": 14068 + }, + { + "epoch": 0.2814, + "grad_norm": 2.15625, + "grad_norm_var": 0.07318700154622396, + "learning_rate": 0.0001, + "loss": 4.2, + "loss/crossentropy": 1.8975006341934204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19528520107269287, + "step": 14070 + }, + { + "epoch": 0.28144, + "grad_norm": 1.9296875, + "grad_norm_var": 0.07691141764322916, + "learning_rate": 0.0001, + "loss": 3.9885, + "loss/crossentropy": 1.628948986530304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1823427528142929, + "step": 14072 + }, + { + "epoch": 0.28148, + "grad_norm": 1.9140625, + "grad_norm_var": 0.07718480428059896, + "learning_rate": 0.0001, + "loss": 4.0404, + "loss/crossentropy": 1.9894697070121765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1945328414440155, + "step": 14074 + }, + { + "epoch": 0.28152, + "grad_norm": 2.125, + "grad_norm_var": 0.022489166259765624, + "learning_rate": 0.0001, + "loss": 4.0927, + "loss/crossentropy": 1.9978017210960388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21055076271295547, + "step": 14076 + }, + { + "epoch": 0.28156, + "grad_norm": 1.921875, + "grad_norm_var": 0.01697565714518229, + "learning_rate": 0.0001, + "loss": 3.9224, + "loss/crossentropy": 2.087389588356018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19179877638816833, + "step": 14078 + }, + { + "epoch": 0.2816, + "grad_norm": 1.96875, + "grad_norm_var": 0.018308258056640624, + "learning_rate": 0.0001, + "loss": 4.3356, + "loss/crossentropy": 2.3281190395355225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2292996421456337, + "step": 14080 + }, + { + "epoch": 0.28164, + "grad_norm": 1.953125, + "grad_norm_var": 0.015215810139973958, + "learning_rate": 0.0001, + "loss": 3.8584, + "loss/crossentropy": 1.8850311040878296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19652675837278366, + "step": 14082 + }, + { + "epoch": 0.28168, + "grad_norm": 1.8671875, + "grad_norm_var": 0.015604654947916666, + "learning_rate": 0.0001, + "loss": 3.9013, + "loss/crossentropy": 1.757595181465149, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17358998954296112, + "step": 14084 + }, + { + "epoch": 0.28172, + "grad_norm": 2.015625, + "grad_norm_var": 0.01387939453125, + "learning_rate": 0.0001, + "loss": 4.1367, + "loss/crossentropy": 2.173685908317566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22043757140636444, + "step": 14086 + }, + { + "epoch": 0.28176, + "grad_norm": 1.8203125, + "grad_norm_var": 0.010758463541666667, + "learning_rate": 0.0001, + "loss": 4.0381, + "loss/crossentropy": 2.0290868282318115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19695669412612915, + "step": 14088 + }, + { + "epoch": 0.2818, + "grad_norm": 2.125, + "grad_norm_var": 0.010609690348307292, + "learning_rate": 0.0001, + "loss": 4.3497, + "loss/crossentropy": 2.320943236351013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21747629344463348, + "step": 14090 + }, + { + "epoch": 0.28184, + "grad_norm": 1.9296875, + "grad_norm_var": 0.011205037434895834, + "learning_rate": 0.0001, + "loss": 4.0918, + "loss/crossentropy": 2.0311750173568726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19410807639360428, + "step": 14092 + }, + { + "epoch": 0.28188, + "grad_norm": 2.03125, + "grad_norm_var": 0.011207834879557291, + "learning_rate": 0.0001, + "loss": 4.1303, + "loss/crossentropy": 2.164630174636841, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1962611824274063, + "step": 14094 + }, + { + "epoch": 0.28192, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0093658447265625, + "learning_rate": 0.0001, + "loss": 3.9681, + "loss/crossentropy": 2.170566439628601, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24082274734973907, + "step": 14096 + }, + { + "epoch": 0.28196, + "grad_norm": 2.0, + "grad_norm_var": 0.009285227457682291, + "learning_rate": 0.0001, + "loss": 4.209, + "loss/crossentropy": 2.3137707710266113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22475039958953857, + "step": 14098 + }, + { + "epoch": 0.282, + "grad_norm": 1.7421875, + "grad_norm_var": 0.010794830322265626, + "learning_rate": 0.0001, + "loss": 3.7432, + "loss/crossentropy": 2.0448675751686096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19814980030059814, + "step": 14100 + }, + { + "epoch": 0.28204, + "grad_norm": 1.96875, + "grad_norm_var": 0.011423492431640625, + "learning_rate": 0.0001, + "loss": 4.1729, + "loss/crossentropy": 2.095334231853485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19743670523166656, + "step": 14102 + }, + { + "epoch": 0.28208, + "grad_norm": 1.9765625, + "grad_norm_var": 0.008737945556640625, + "learning_rate": 0.0001, + "loss": 4.0362, + "loss/crossentropy": 2.280429720878601, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21895557641983032, + "step": 14104 + }, + { + "epoch": 0.28212, + "grad_norm": 2.09375, + "grad_norm_var": 0.008292388916015626, + "learning_rate": 0.0001, + "loss": 4.1166, + "loss/crossentropy": 1.8548901677131653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19965855032205582, + "step": 14106 + }, + { + "epoch": 0.28216, + "grad_norm": 2.234375, + "grad_norm_var": 0.01163330078125, + "learning_rate": 0.0001, + "loss": 4.2589, + "loss/crossentropy": 2.3574371337890625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23160286247730255, + "step": 14108 + }, + { + "epoch": 0.2822, + "grad_norm": 2.078125, + "grad_norm_var": 0.012800089518229167, + "learning_rate": 0.0001, + "loss": 3.932, + "loss/crossentropy": 2.112724542617798, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19597014784812927, + "step": 14110 + }, + { + "epoch": 0.28224, + "grad_norm": 1.8828125, + "grad_norm_var": 0.013068644205729167, + "learning_rate": 0.0001, + "loss": 3.8792, + "loss/crossentropy": 1.9877798557281494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21120092272758484, + "step": 14112 + }, + { + "epoch": 0.28228, + "grad_norm": 2.015625, + "grad_norm_var": 0.013199869791666667, + "learning_rate": 0.0001, + "loss": 4.1761, + "loss/crossentropy": 2.253028154373169, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2046835869550705, + "step": 14114 + }, + { + "epoch": 0.28232, + "grad_norm": 1.9921875, + "grad_norm_var": 0.008561197916666667, + "learning_rate": 0.0001, + "loss": 4.1711, + "loss/crossentropy": 2.322582960128784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22178008407354355, + "step": 14116 + }, + { + "epoch": 0.28236, + "grad_norm": 2.03125, + "grad_norm_var": 0.008204905192057292, + "learning_rate": 0.0001, + "loss": 4.1584, + "loss/crossentropy": 2.470620036125183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23360955715179443, + "step": 14118 + }, + { + "epoch": 0.2824, + "grad_norm": 1.90625, + "grad_norm_var": 0.0086334228515625, + "learning_rate": 0.0001, + "loss": 4.0206, + "loss/crossentropy": 1.9218884110450745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1978505551815033, + "step": 14120 + }, + { + "epoch": 0.28244, + "grad_norm": 2.125, + "grad_norm_var": 0.0092437744140625, + "learning_rate": 0.0001, + "loss": 4.2045, + "loss/crossentropy": 1.9114753007888794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19295217841863632, + "step": 14122 + }, + { + "epoch": 0.28248, + "grad_norm": 2.015625, + "grad_norm_var": 0.498583984375, + "learning_rate": 0.0001, + "loss": 4.2259, + "loss/crossentropy": 2.1196082830429077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2068253606557846, + "step": 14124 + }, + { + "epoch": 0.28252, + "grad_norm": 2.046875, + "grad_norm_var": 0.492419179280599, + "learning_rate": 0.0001, + "loss": 4.2616, + "loss/crossentropy": 1.9869969487190247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1950489580631256, + "step": 14126 + }, + { + "epoch": 0.28256, + "grad_norm": 1.9609375, + "grad_norm_var": 0.488177235921224, + "learning_rate": 0.0001, + "loss": 4.1635, + "loss/crossentropy": 2.09599232673645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22184181958436966, + "step": 14128 + }, + { + "epoch": 0.2826, + "grad_norm": 2.046875, + "grad_norm_var": 0.489172108968099, + "learning_rate": 0.0001, + "loss": 4.2238, + "loss/crossentropy": 2.1891895532608032, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2229236587882042, + "step": 14130 + }, + { + "epoch": 0.28264, + "grad_norm": 2.171875, + "grad_norm_var": 0.48787816365559894, + "learning_rate": 0.0001, + "loss": 4.12, + "loss/crossentropy": 2.5070972442626953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21862925589084625, + "step": 14132 + }, + { + "epoch": 0.28268, + "grad_norm": 2.03125, + "grad_norm_var": 0.48812662760416664, + "learning_rate": 0.0001, + "loss": 4.0993, + "loss/crossentropy": 2.100346863269806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19969436526298523, + "step": 14134 + }, + { + "epoch": 0.28272, + "grad_norm": 2.1875, + "grad_norm_var": 0.48713150024414065, + "learning_rate": 0.0001, + "loss": 4.2607, + "loss/crossentropy": 2.0019801259040833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21319210529327393, + "step": 14136 + }, + { + "epoch": 0.28276, + "grad_norm": 2.296875, + "grad_norm_var": 0.4855323791503906, + "learning_rate": 0.0001, + "loss": 4.7069, + "loss/crossentropy": 2.2734099626541138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24008513987064362, + "step": 14138 + }, + { + "epoch": 0.2828, + "grad_norm": 2.484375, + "grad_norm_var": 0.02581965128580729, + "learning_rate": 0.0001, + "loss": 4.1085, + "loss/crossentropy": 2.2589277029037476, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2224387675523758, + "step": 14140 + }, + { + "epoch": 0.28284, + "grad_norm": 1.9609375, + "grad_norm_var": 0.02667236328125, + "learning_rate": 0.0001, + "loss": 4.2749, + "loss/crossentropy": 1.713346004486084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19133105129003525, + "step": 14142 + }, + { + "epoch": 0.28288, + "grad_norm": 2.0, + "grad_norm_var": 0.02616144816080729, + "learning_rate": 0.0001, + "loss": 4.1792, + "loss/crossentropy": 2.212652564048767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21326624602079391, + "step": 14144 + }, + { + "epoch": 0.28292, + "grad_norm": 2.046875, + "grad_norm_var": 0.02902399698893229, + "learning_rate": 0.0001, + "loss": 3.901, + "loss/crossentropy": 1.8900847434997559, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1882227212190628, + "step": 14146 + }, + { + "epoch": 0.28296, + "grad_norm": 2.25, + "grad_norm_var": 0.028913370768229165, + "learning_rate": 0.0001, + "loss": 4.3775, + "loss/crossentropy": 2.171455979347229, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20516864210367203, + "step": 14148 + }, + { + "epoch": 0.283, + "grad_norm": 2.046875, + "grad_norm_var": 0.027608235677083332, + "learning_rate": 0.0001, + "loss": 4.3466, + "loss/crossentropy": 2.0112149715423584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20043446868658066, + "step": 14150 + }, + { + "epoch": 0.28304, + "grad_norm": 1.96875, + "grad_norm_var": 0.026596832275390624, + "learning_rate": 0.0001, + "loss": 4.2879, + "loss/crossentropy": 2.0091559886932373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21023409068584442, + "step": 14152 + }, + { + "epoch": 0.28308, + "grad_norm": 1.9296875, + "grad_norm_var": 0.023045857747395832, + "learning_rate": 0.0001, + "loss": 3.9234, + "loss/crossentropy": 2.122502863407135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2023375853896141, + "step": 14154 + }, + { + "epoch": 0.28312, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011462148030598958, + "learning_rate": 0.0001, + "loss": 4.3187, + "loss/crossentropy": 2.2842462062835693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24532422423362732, + "step": 14156 + }, + { + "epoch": 0.28316, + "grad_norm": 2.078125, + "grad_norm_var": 0.010990397135416666, + "learning_rate": 0.0001, + "loss": 4.5248, + "loss/crossentropy": 2.247039318084717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24580712616443634, + "step": 14158 + }, + { + "epoch": 0.2832, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011432902018229166, + "learning_rate": 0.0001, + "loss": 3.9486, + "loss/crossentropy": 2.182216167449951, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21095345169305801, + "step": 14160 + }, + { + "epoch": 0.28324, + "grad_norm": 1.9921875, + "grad_norm_var": 0.008648427327473958, + "learning_rate": 0.0001, + "loss": 4.4582, + "loss/crossentropy": 2.219786763191223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21747471392154694, + "step": 14162 + }, + { + "epoch": 0.28328, + "grad_norm": 1.9453125, + "grad_norm_var": 0.005830637613932292, + "learning_rate": 0.0001, + "loss": 3.9483, + "loss/crossentropy": 1.8589028716087341, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19629907608032227, + "step": 14164 + }, + { + "epoch": 0.28332, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0138092041015625, + "learning_rate": 0.0001, + "loss": 4.0217, + "loss/crossentropy": 1.9820821285247803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20433218777179718, + "step": 14166 + }, + { + "epoch": 0.28336, + "grad_norm": 1.9296875, + "grad_norm_var": 0.014216868082682292, + "learning_rate": 0.0001, + "loss": 4.0081, + "loss/crossentropy": 1.8423896431922913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18210452795028687, + "step": 14168 + }, + { + "epoch": 0.2834, + "grad_norm": 2.171875, + "grad_norm_var": 0.0158203125, + "learning_rate": 0.0001, + "loss": 4.1244, + "loss/crossentropy": 2.0365665555000305, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21229179203510284, + "step": 14170 + }, + { + "epoch": 0.28344, + "grad_norm": 1.7578125, + "grad_norm_var": 0.019212849934895835, + "learning_rate": 0.0001, + "loss": 3.7894, + "loss/crossentropy": 1.6819360256195068, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20171043276786804, + "step": 14172 + }, + { + "epoch": 0.28348, + "grad_norm": 1.8828125, + "grad_norm_var": 0.019846343994140626, + "learning_rate": 0.0001, + "loss": 4.1247, + "loss/crossentropy": 1.9932513236999512, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21374855190515518, + "step": 14174 + }, + { + "epoch": 0.28352, + "grad_norm": 1.96875, + "grad_norm_var": 0.01975072224934896, + "learning_rate": 0.0001, + "loss": 3.8874, + "loss/crossentropy": 1.7137236595153809, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17538649588823318, + "step": 14176 + }, + { + "epoch": 0.28356, + "grad_norm": 1.953125, + "grad_norm_var": 0.01977717081705729, + "learning_rate": 0.0001, + "loss": 3.8762, + "loss/crossentropy": 1.7120450735092163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19030355662107468, + "step": 14178 + }, + { + "epoch": 0.2836, + "grad_norm": 1.9765625, + "grad_norm_var": 0.019962565104166666, + "learning_rate": 0.0001, + "loss": 4.4603, + "loss/crossentropy": 2.346954107284546, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24070723354816437, + "step": 14180 + }, + { + "epoch": 0.28364, + "grad_norm": 1.953125, + "grad_norm_var": 0.013755035400390626, + "learning_rate": 0.0001, + "loss": 4.2238, + "loss/crossentropy": 2.241390824317932, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2247970700263977, + "step": 14182 + }, + { + "epoch": 0.28368, + "grad_norm": 1.890625, + "grad_norm_var": 0.02080078125, + "learning_rate": 0.0001, + "loss": 4.206, + "loss/crossentropy": 2.234646439552307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21947349607944489, + "step": 14184 + }, + { + "epoch": 0.28372, + "grad_norm": 1.9375, + "grad_norm_var": 0.019010416666666665, + "learning_rate": 0.0001, + "loss": 4.0446, + "loss/crossentropy": 1.9331231117248535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20428168773651123, + "step": 14186 + }, + { + "epoch": 0.28376, + "grad_norm": 1.9765625, + "grad_norm_var": 0.015315500895182292, + "learning_rate": 0.0001, + "loss": 4.0184, + "loss/crossentropy": 1.8166351318359375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21285146474838257, + "step": 14188 + }, + { + "epoch": 0.2838, + "grad_norm": 1.9765625, + "grad_norm_var": 0.015077463785807292, + "learning_rate": 0.0001, + "loss": 4.313, + "loss/crossentropy": 2.181701421737671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20924244076013565, + "step": 14190 + }, + { + "epoch": 0.28384, + "grad_norm": 2.078125, + "grad_norm_var": 0.015558878580729166, + "learning_rate": 0.0001, + "loss": 4.0866, + "loss/crossentropy": 2.1605955958366394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24021095037460327, + "step": 14192 + }, + { + "epoch": 0.28388, + "grad_norm": 2.1875, + "grad_norm_var": 0.015455881754557291, + "learning_rate": 0.0001, + "loss": 4.1617, + "loss/crossentropy": 2.3546024560928345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24228987097740173, + "step": 14194 + }, + { + "epoch": 0.28392, + "grad_norm": 2.015625, + "grad_norm_var": 0.015314737955729166, + "learning_rate": 0.0001, + "loss": 4.0791, + "loss/crossentropy": 2.2335511445999146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21917300671339035, + "step": 14196 + }, + { + "epoch": 0.28396, + "grad_norm": 2.0, + "grad_norm_var": 0.012760416666666666, + "learning_rate": 0.0001, + "loss": 3.9915, + "loss/crossentropy": 1.8429189324378967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18830660730600357, + "step": 14198 + }, + { + "epoch": 0.284, + "grad_norm": 2.015625, + "grad_norm_var": 0.0058746337890625, + "learning_rate": 0.0001, + "loss": 4.2671, + "loss/crossentropy": 2.4578241109848022, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23527230322360992, + "step": 14200 + }, + { + "epoch": 0.28404, + "grad_norm": 2.03125, + "grad_norm_var": 0.00592041015625, + "learning_rate": 0.0001, + "loss": 4.3027, + "loss/crossentropy": 2.0944234132766724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2211083546280861, + "step": 14202 + }, + { + "epoch": 0.28408, + "grad_norm": 1.90625, + "grad_norm_var": 0.0066070556640625, + "learning_rate": 0.0001, + "loss": 3.9341, + "loss/crossentropy": 1.8876588344573975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1908133327960968, + "step": 14204 + }, + { + "epoch": 0.28412, + "grad_norm": 2.0625, + "grad_norm_var": 0.006190745035807291, + "learning_rate": 0.0001, + "loss": 4.368, + "loss/crossentropy": 2.0882590413093567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21159439533948898, + "step": 14206 + }, + { + "epoch": 0.28416, + "grad_norm": 2.015625, + "grad_norm_var": 0.005296834309895833, + "learning_rate": 0.0001, + "loss": 4.3005, + "loss/crossentropy": 2.219560742378235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2282530516386032, + "step": 14208 + }, + { + "epoch": 0.2842, + "grad_norm": 2.015625, + "grad_norm_var": 0.004759724934895833, + "learning_rate": 0.0001, + "loss": 3.9308, + "loss/crossentropy": 1.8383984565734863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1868659406900406, + "step": 14210 + }, + { + "epoch": 0.28424, + "grad_norm": 2.125, + "grad_norm_var": 0.006892649332682291, + "learning_rate": 0.0001, + "loss": 3.9613, + "loss/crossentropy": 1.7509828209877014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1862550526857376, + "step": 14212 + }, + { + "epoch": 0.28428, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007513173421223958, + "learning_rate": 0.0001, + "loss": 4.1301, + "loss/crossentropy": 2.1544927954673767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19378253817558289, + "step": 14214 + }, + { + "epoch": 0.28432, + "grad_norm": 2.03125, + "grad_norm_var": 0.007533518473307291, + "learning_rate": 0.0001, + "loss": 4.1261, + "loss/crossentropy": 2.498751997947693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22881630063056946, + "step": 14216 + }, + { + "epoch": 0.28436, + "grad_norm": 2.09375, + "grad_norm_var": 0.006780751546223958, + "learning_rate": 0.0001, + "loss": 4.3414, + "loss/crossentropy": 2.2638392448425293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21407127380371094, + "step": 14218 + }, + { + "epoch": 0.2844, + "grad_norm": 2.015625, + "grad_norm_var": 0.006380208333333333, + "learning_rate": 0.0001, + "loss": 4.0469, + "loss/crossentropy": 2.097207546234131, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21202068030834198, + "step": 14220 + }, + { + "epoch": 0.28444, + "grad_norm": 2.0625, + "grad_norm_var": 0.007661946614583333, + "learning_rate": 0.0001, + "loss": 4.211, + "loss/crossentropy": 2.0577695965766907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21462847292423248, + "step": 14222 + }, + { + "epoch": 0.28448, + "grad_norm": 1.9921875, + "grad_norm_var": 0.008373769124348958, + "learning_rate": 0.0001, + "loss": 4.2646, + "loss/crossentropy": 2.3061472177505493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23122046887874603, + "step": 14224 + }, + { + "epoch": 0.28452, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0068511962890625, + "learning_rate": 0.0001, + "loss": 3.947, + "loss/crossentropy": 1.850829005241394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19229399412870407, + "step": 14226 + }, + { + "epoch": 0.28456, + "grad_norm": 1.9375, + "grad_norm_var": 0.0051348368326822914, + "learning_rate": 0.0001, + "loss": 4.0891, + "loss/crossentropy": 1.9520751237869263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19159140437841415, + "step": 14228 + }, + { + "epoch": 0.2846, + "grad_norm": 1.875, + "grad_norm_var": 0.005145009358723958, + "learning_rate": 0.0001, + "loss": 4.2161, + "loss/crossentropy": 2.507380247116089, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23365024477243423, + "step": 14230 + }, + { + "epoch": 0.28464, + "grad_norm": 2.046875, + "grad_norm_var": 0.006261952718098958, + "learning_rate": 0.0001, + "loss": 4.0787, + "loss/crossentropy": 1.9148982763290405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19785508513450623, + "step": 14232 + }, + { + "epoch": 0.28468, + "grad_norm": 1.8984375, + "grad_norm_var": 0.007769521077473958, + "learning_rate": 0.0001, + "loss": 4.1882, + "loss/crossentropy": 2.004162549972534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.192252054810524, + "step": 14234 + }, + { + "epoch": 0.28472, + "grad_norm": 1.9375, + "grad_norm_var": 0.0078277587890625, + "learning_rate": 0.0001, + "loss": 4.1318, + "loss/crossentropy": 2.1455901861190796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19151784479618073, + "step": 14236 + }, + { + "epoch": 0.28476, + "grad_norm": 1.9765625, + "grad_norm_var": 0.005580393473307291, + "learning_rate": 0.0001, + "loss": 3.9548, + "loss/crossentropy": 2.2556833028793335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20640526711940765, + "step": 14238 + }, + { + "epoch": 0.2848, + "grad_norm": 1.8828125, + "grad_norm_var": 0.013698069254557292, + "learning_rate": 0.0001, + "loss": 4.3805, + "loss/crossentropy": 2.1121991872787476, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21154547482728958, + "step": 14240 + }, + { + "epoch": 0.28484, + "grad_norm": 1.7890625, + "grad_norm_var": 0.01586888631184896, + "learning_rate": 0.0001, + "loss": 3.7502, + "loss/crossentropy": 1.9072884321212769, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19220874458551407, + "step": 14242 + }, + { + "epoch": 0.28488, + "grad_norm": 2.015625, + "grad_norm_var": 0.0154052734375, + "learning_rate": 0.0001, + "loss": 4.1925, + "loss/crossentropy": 1.907566249370575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20355048030614853, + "step": 14244 + }, + { + "epoch": 0.28492, + "grad_norm": 1.859375, + "grad_norm_var": 0.0149169921875, + "learning_rate": 0.0001, + "loss": 3.8873, + "loss/crossentropy": 2.1140421628952026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20535241812467575, + "step": 14246 + }, + { + "epoch": 0.28496, + "grad_norm": 2.125, + "grad_norm_var": 0.016022745768229166, + "learning_rate": 0.0001, + "loss": 4.1704, + "loss/crossentropy": 2.192967176437378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20353975892066956, + "step": 14248 + }, + { + "epoch": 0.285, + "grad_norm": 1.90625, + "grad_norm_var": 0.015213775634765624, + "learning_rate": 0.0001, + "loss": 3.9064, + "loss/crossentropy": 1.6658200025558472, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17274750024080276, + "step": 14250 + }, + { + "epoch": 0.28504, + "grad_norm": 2.171875, + "grad_norm_var": 0.017411041259765624, + "learning_rate": 0.0001, + "loss": 4.2015, + "loss/crossentropy": 2.014496326446533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.203952856361866, + "step": 14252 + }, + { + "epoch": 0.28508, + "grad_norm": 2.15625, + "grad_norm_var": 0.0249908447265625, + "learning_rate": 0.0001, + "loss": 4.4607, + "loss/crossentropy": 2.1534151434898376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23526588827371597, + "step": 14254 + }, + { + "epoch": 0.28512, + "grad_norm": 2.0625, + "grad_norm_var": 0.01701838175455729, + "learning_rate": 0.0001, + "loss": 4.1263, + "loss/crossentropy": 2.08686763048172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20443058758974075, + "step": 14256 + }, + { + "epoch": 0.28516, + "grad_norm": 2.0625, + "grad_norm_var": 0.0135406494140625, + "learning_rate": 0.0001, + "loss": 4.314, + "loss/crossentropy": 1.9428821802139282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20955658704042435, + "step": 14258 + }, + { + "epoch": 0.2852, + "grad_norm": 1.921875, + "grad_norm_var": 0.015710194905598957, + "learning_rate": 0.0001, + "loss": 3.9157, + "loss/crossentropy": 1.8959371447563171, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20648592710494995, + "step": 14260 + }, + { + "epoch": 0.28524, + "grad_norm": 1.8203125, + "grad_norm_var": 0.017081705729166667, + "learning_rate": 0.0001, + "loss": 3.8246, + "loss/crossentropy": 1.7842467427253723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17766441404819489, + "step": 14262 + }, + { + "epoch": 0.28528, + "grad_norm": 1.84375, + "grad_norm_var": 0.018805948893229167, + "learning_rate": 0.0001, + "loss": 4.0581, + "loss/crossentropy": 1.989040195941925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20939016342163086, + "step": 14264 + }, + { + "epoch": 0.28532, + "grad_norm": 1.953125, + "grad_norm_var": 0.018656158447265626, + "learning_rate": 0.0001, + "loss": 3.9216, + "loss/crossentropy": 2.0441301465034485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20979472994804382, + "step": 14266 + }, + { + "epoch": 0.28536, + "grad_norm": 1.875, + "grad_norm_var": 0.017970530192057292, + "learning_rate": 0.0001, + "loss": 4.2181, + "loss/crossentropy": 1.9348166584968567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1930733323097229, + "step": 14268 + }, + { + "epoch": 0.2854, + "grad_norm": 1.8515625, + "grad_norm_var": 0.009650675455729167, + "learning_rate": 0.0001, + "loss": 4.2039, + "loss/crossentropy": 2.056081175804138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20383870601654053, + "step": 14270 + }, + { + "epoch": 0.28544, + "grad_norm": 2.125, + "grad_norm_var": 0.011271158854166666, + "learning_rate": 0.0001, + "loss": 4.4046, + "loss/crossentropy": 2.2752946615219116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2458929866552353, + "step": 14272 + }, + { + "epoch": 0.28548, + "grad_norm": 2.0625, + "grad_norm_var": 0.010456339518229166, + "learning_rate": 0.0001, + "loss": 4.1693, + "loss/crossentropy": 2.093926787376404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19211938977241516, + "step": 14274 + }, + { + "epoch": 0.28552, + "grad_norm": 2.0, + "grad_norm_var": 0.009913889567057292, + "learning_rate": 0.0001, + "loss": 4.1841, + "loss/crossentropy": 2.448614239692688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23177827894687653, + "step": 14276 + }, + { + "epoch": 0.28556, + "grad_norm": 1.765625, + "grad_norm_var": 0.012401326497395834, + "learning_rate": 0.0001, + "loss": 4.0002, + "loss/crossentropy": 2.0047106742858887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2162417769432068, + "step": 14278 + }, + { + "epoch": 0.2856, + "grad_norm": 2.328125, + "grad_norm_var": 0.017986806233723958, + "learning_rate": 0.0001, + "loss": 4.1669, + "loss/crossentropy": 1.982455313205719, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22751964628696442, + "step": 14280 + }, + { + "epoch": 0.28564, + "grad_norm": 2.0625, + "grad_norm_var": 0.017411295572916666, + "learning_rate": 0.0001, + "loss": 4.0355, + "loss/crossentropy": 1.872545838356018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19046122580766678, + "step": 14282 + }, + { + "epoch": 0.28568, + "grad_norm": 2.109375, + "grad_norm_var": 0.016388956705729166, + "learning_rate": 0.0001, + "loss": 4.1266, + "loss/crossentropy": 1.8999648690223694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2155570462346077, + "step": 14284 + }, + { + "epoch": 0.28572, + "grad_norm": 2.0, + "grad_norm_var": 0.014129384358723959, + "learning_rate": 0.0001, + "loss": 3.9649, + "loss/crossentropy": 1.8433392643928528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18906012177467346, + "step": 14286 + }, + { + "epoch": 0.28576, + "grad_norm": 2.15625, + "grad_norm_var": 0.014697011311848958, + "learning_rate": 0.0001, + "loss": 4.085, + "loss/crossentropy": 1.817060947418213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20473239570856094, + "step": 14288 + }, + { + "epoch": 0.2858, + "grad_norm": 2.265625, + "grad_norm_var": 0.01796239217122396, + "learning_rate": 0.0001, + "loss": 4.1856, + "loss/crossentropy": 2.0344348549842834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19991667568683624, + "step": 14290 + }, + { + "epoch": 0.28584, + "grad_norm": 1.9609375, + "grad_norm_var": 0.01784032185872396, + "learning_rate": 0.0001, + "loss": 4.2334, + "loss/crossentropy": 2.3413926362991333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2365436628460884, + "step": 14292 + }, + { + "epoch": 0.28588, + "grad_norm": 2.078125, + "grad_norm_var": 0.015464019775390626, + "learning_rate": 0.0001, + "loss": 3.9437, + "loss/crossentropy": 2.0959852933883667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19703902304172516, + "step": 14294 + }, + { + "epoch": 0.28592, + "grad_norm": 7.625, + "grad_norm_var": 1.9304239908854166, + "learning_rate": 0.0001, + "loss": 4.2144, + "loss/crossentropy": 2.220101237297058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21996742486953735, + "step": 14296 + }, + { + "epoch": 0.28596, + "grad_norm": 2.109375, + "grad_norm_var": 1.915691884358724, + "learning_rate": 0.0001, + "loss": 4.1806, + "loss/crossentropy": 2.073485493659973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2020936980843544, + "step": 14298 + }, + { + "epoch": 0.286, + "grad_norm": 1.96875, + "grad_norm_var": 1.9216265360514322, + "learning_rate": 0.0001, + "loss": 3.9176, + "loss/crossentropy": 2.228920817375183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22304438799619675, + "step": 14300 + }, + { + "epoch": 0.28604, + "grad_norm": 1.890625, + "grad_norm_var": 1.933642323811849, + "learning_rate": 0.0001, + "loss": 4.0831, + "loss/crossentropy": 1.724998950958252, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1592913344502449, + "step": 14302 + }, + { + "epoch": 0.28608, + "grad_norm": 2.0, + "grad_norm_var": 1.9375221252441406, + "learning_rate": 0.0001, + "loss": 4.1738, + "loss/crossentropy": 1.8970724940299988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18843455612659454, + "step": 14304 + }, + { + "epoch": 0.28612, + "grad_norm": 2.09375, + "grad_norm_var": 1.9384295145670574, + "learning_rate": 0.0001, + "loss": 4.0808, + "loss/crossentropy": 1.9957273602485657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18272704631090164, + "step": 14306 + }, + { + "epoch": 0.28616, + "grad_norm": 2.015625, + "grad_norm_var": 1.9298177083333334, + "learning_rate": 0.0001, + "loss": 4.3861, + "loss/crossentropy": 2.1424412727355957, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21485400944948196, + "step": 14308 + }, + { + "epoch": 0.2862, + "grad_norm": 1.9765625, + "grad_norm_var": 1.9411211649576823, + "learning_rate": 0.0001, + "loss": 3.9622, + "loss/crossentropy": 1.7729334235191345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20436155050992966, + "step": 14310 + }, + { + "epoch": 0.28624, + "grad_norm": 1.96875, + "grad_norm_var": 0.013952382405598958, + "learning_rate": 0.0001, + "loss": 4.2495, + "loss/crossentropy": 2.5863327980041504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24598564952611923, + "step": 14312 + }, + { + "epoch": 0.28628, + "grad_norm": 1.9375, + "grad_norm_var": 0.01014404296875, + "learning_rate": 0.0001, + "loss": 3.962, + "loss/crossentropy": 1.9089832305908203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1905580461025238, + "step": 14314 + }, + { + "epoch": 0.28632, + "grad_norm": 2.09375, + "grad_norm_var": 0.010228474934895834, + "learning_rate": 0.0001, + "loss": 4.2167, + "loss/crossentropy": 2.0153337121009827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21556401252746582, + "step": 14316 + }, + { + "epoch": 0.28636, + "grad_norm": 2.015625, + "grad_norm_var": 0.008776601155598958, + "learning_rate": 0.0001, + "loss": 4.4037, + "loss/crossentropy": 1.9692147970199585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17990782111883163, + "step": 14318 + }, + { + "epoch": 0.2864, + "grad_norm": 1.8515625, + "grad_norm_var": 0.010969034830729167, + "learning_rate": 0.0001, + "loss": 4.1049, + "loss/crossentropy": 2.0404393076896667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2108697146177292, + "step": 14320 + }, + { + "epoch": 0.28644, + "grad_norm": 2.0, + "grad_norm_var": 0.009488932291666667, + "learning_rate": 0.0001, + "loss": 4.2809, + "loss/crossentropy": 2.133711099624634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1942032128572464, + "step": 14322 + }, + { + "epoch": 0.28648, + "grad_norm": 2.0, + "grad_norm_var": 0.004878489176432291, + "learning_rate": 0.0001, + "loss": 4.1605, + "loss/crossentropy": 1.9240726232528687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18174728006124496, + "step": 14324 + }, + { + "epoch": 0.28652, + "grad_norm": 2.0625, + "grad_norm_var": 0.00850830078125, + "learning_rate": 0.0001, + "loss": 4.2671, + "loss/crossentropy": 2.0952632427215576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20621410757303238, + "step": 14326 + }, + { + "epoch": 0.28656, + "grad_norm": 2.046875, + "grad_norm_var": 0.008326975504557292, + "learning_rate": 0.0001, + "loss": 4.0683, + "loss/crossentropy": 2.2604600191116333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2310045212507248, + "step": 14328 + }, + { + "epoch": 0.2866, + "grad_norm": 2.09375, + "grad_norm_var": 0.00777587890625, + "learning_rate": 0.0001, + "loss": 4.2551, + "loss/crossentropy": 2.0915167331695557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23344147205352783, + "step": 14330 + }, + { + "epoch": 0.28664, + "grad_norm": 1.9765625, + "grad_norm_var": 0.00804443359375, + "learning_rate": 0.0001, + "loss": 3.9575, + "loss/crossentropy": 2.2928651571273804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23840509355068207, + "step": 14332 + }, + { + "epoch": 0.28668, + "grad_norm": 2.0, + "grad_norm_var": 0.008685048421223958, + "learning_rate": 0.0001, + "loss": 4.4469, + "loss/crossentropy": 2.261072874069214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22056513279676437, + "step": 14334 + }, + { + "epoch": 0.28672, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007883453369140625, + "learning_rate": 0.0001, + "loss": 3.9218, + "loss/crossentropy": 1.7591394186019897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1900082305073738, + "step": 14336 + }, + { + "epoch": 0.28676, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008426666259765625, + "learning_rate": 0.0001, + "loss": 4.0457, + "loss/crossentropy": 2.129893183708191, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19578705728054047, + "step": 14338 + }, + { + "epoch": 0.2868, + "grad_norm": 2.078125, + "grad_norm_var": 0.008451334635416667, + "learning_rate": 0.0001, + "loss": 4.4884, + "loss/crossentropy": 2.0376795530319214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2088106870651245, + "step": 14340 + }, + { + "epoch": 0.28684, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005018870035807292, + "learning_rate": 0.0001, + "loss": 4.282, + "loss/crossentropy": 2.1484848260879517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2113661915063858, + "step": 14342 + }, + { + "epoch": 0.28688, + "grad_norm": 1.984375, + "grad_norm_var": 0.00496826171875, + "learning_rate": 0.0001, + "loss": 4.2713, + "loss/crossentropy": 2.204781651496887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20908530056476593, + "step": 14344 + }, + { + "epoch": 0.28692, + "grad_norm": 2.09375, + "grad_norm_var": 0.0052154541015625, + "learning_rate": 0.0001, + "loss": 4.1075, + "loss/crossentropy": 1.8096813559532166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18912722170352936, + "step": 14346 + }, + { + "epoch": 0.28696, + "grad_norm": 1.921875, + "grad_norm_var": 0.005580393473307291, + "learning_rate": 0.0001, + "loss": 4.0057, + "loss/crossentropy": 1.8014967441558838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18832575529813766, + "step": 14348 + }, + { + "epoch": 0.287, + "grad_norm": 2.203125, + "grad_norm_var": 0.00789794921875, + "learning_rate": 0.0001, + "loss": 4.3211, + "loss/crossentropy": 2.1443156003952026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.232828289270401, + "step": 14350 + }, + { + "epoch": 0.28704, + "grad_norm": 2.125, + "grad_norm_var": 0.008259073893229166, + "learning_rate": 0.0001, + "loss": 4.0984, + "loss/crossentropy": 2.1773927211761475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19964880496263504, + "step": 14352 + }, + { + "epoch": 0.28708, + "grad_norm": 2.0, + "grad_norm_var": 0.007795206705729167, + "learning_rate": 0.0001, + "loss": 4.1432, + "loss/crossentropy": 2.157313585281372, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2149301916360855, + "step": 14354 + }, + { + "epoch": 0.28712, + "grad_norm": 2.046875, + "grad_norm_var": 0.007184855143229167, + "learning_rate": 0.0001, + "loss": 4.2938, + "loss/crossentropy": 2.3431146144866943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2297225296497345, + "step": 14356 + }, + { + "epoch": 0.28716, + "grad_norm": 2.03125, + "grad_norm_var": 0.007940419514973958, + "learning_rate": 0.0001, + "loss": 4.2026, + "loss/crossentropy": 1.894954264163971, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19812744110822678, + "step": 14358 + }, + { + "epoch": 0.2872, + "grad_norm": 1.9375, + "grad_norm_var": 0.008784993489583334, + "learning_rate": 0.0001, + "loss": 4.0654, + "loss/crossentropy": 2.1701435446739197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21183644235134125, + "step": 14360 + }, + { + "epoch": 0.28724, + "grad_norm": 2.015625, + "grad_norm_var": 0.007975006103515625, + "learning_rate": 0.0001, + "loss": 4.2457, + "loss/crossentropy": 2.202796459197998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21229087561368942, + "step": 14362 + }, + { + "epoch": 0.28728, + "grad_norm": 2.203125, + "grad_norm_var": 0.009789784749348959, + "learning_rate": 0.0001, + "loss": 4.3692, + "loss/crossentropy": 2.225682258605957, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22512775659561157, + "step": 14364 + }, + { + "epoch": 0.28732, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007088216145833334, + "learning_rate": 0.0001, + "loss": 4.0938, + "loss/crossentropy": 1.9562655687332153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19557978957891464, + "step": 14366 + }, + { + "epoch": 0.28736, + "grad_norm": 2.0, + "grad_norm_var": 0.0062945048014322914, + "learning_rate": 0.0001, + "loss": 4.2922, + "loss/crossentropy": 1.794309377670288, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1851721778512001, + "step": 14368 + }, + { + "epoch": 0.2874, + "grad_norm": 2.0, + "grad_norm_var": 0.006660970052083334, + "learning_rate": 0.0001, + "loss": 4.1198, + "loss/crossentropy": 2.276759445667267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2071758210659027, + "step": 14370 + }, + { + "epoch": 0.28744, + "grad_norm": 1.90625, + "grad_norm_var": 0.006696573893229167, + "learning_rate": 0.0001, + "loss": 4.1162, + "loss/crossentropy": 2.332028031349182, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22525647282600403, + "step": 14372 + }, + { + "epoch": 0.28748, + "grad_norm": 2.078125, + "grad_norm_var": 0.005997721354166667, + "learning_rate": 0.0001, + "loss": 4.2594, + "loss/crossentropy": 1.975858986377716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18405582010746002, + "step": 14374 + }, + { + "epoch": 0.28752, + "grad_norm": 1.984375, + "grad_norm_var": 0.006453196207682292, + "learning_rate": 0.0001, + "loss": 4.0615, + "loss/crossentropy": 2.002572774887085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19543466717004776, + "step": 14376 + }, + { + "epoch": 0.28756, + "grad_norm": 1.96875, + "grad_norm_var": 0.006703440348307292, + "learning_rate": 0.0001, + "loss": 4.1837, + "loss/crossentropy": 2.400219678878784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23009777069091797, + "step": 14378 + }, + { + "epoch": 0.2876, + "grad_norm": 2.0, + "grad_norm_var": 0.003352864583333333, + "learning_rate": 0.0001, + "loss": 4.0071, + "loss/crossentropy": 1.6437376737594604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17785440385341644, + "step": 14380 + }, + { + "epoch": 0.28764, + "grad_norm": 2.15625, + "grad_norm_var": 0.007225545247395834, + "learning_rate": 0.0001, + "loss": 4.1494, + "loss/crossentropy": 1.8833640813827515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22255511581897736, + "step": 14382 + }, + { + "epoch": 0.28768, + "grad_norm": 2.0, + "grad_norm_var": 0.007100423177083333, + "learning_rate": 0.0001, + "loss": 4.0881, + "loss/crossentropy": 1.9113904237747192, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2050003483891487, + "step": 14384 + }, + { + "epoch": 0.28772, + "grad_norm": 1.90625, + "grad_norm_var": 0.007765452067057292, + "learning_rate": 0.0001, + "loss": 4.1114, + "loss/crossentropy": 2.236249327659607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21349424868822098, + "step": 14386 + }, + { + "epoch": 0.28776, + "grad_norm": 2.140625, + "grad_norm_var": 0.008090972900390625, + "learning_rate": 0.0001, + "loss": 3.9282, + "loss/crossentropy": 1.8709489703178406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19309207051992416, + "step": 14388 + }, + { + "epoch": 0.2878, + "grad_norm": 1.9765625, + "grad_norm_var": 0.007933553059895833, + "learning_rate": 0.0001, + "loss": 4.0871, + "loss/crossentropy": 1.926324725151062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19902244210243225, + "step": 14390 + }, + { + "epoch": 0.28784, + "grad_norm": 1.8671875, + "grad_norm_var": 0.00810546875, + "learning_rate": 0.0001, + "loss": 4.085, + "loss/crossentropy": 1.898009479045868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18698234856128693, + "step": 14392 + }, + { + "epoch": 0.28788, + "grad_norm": 2.1875, + "grad_norm_var": 0.0122711181640625, + "learning_rate": 0.0001, + "loss": 4.2367, + "loss/crossentropy": 1.9728660583496094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2033083364367485, + "step": 14394 + }, + { + "epoch": 0.28792, + "grad_norm": 1.9296875, + "grad_norm_var": 0.01282958984375, + "learning_rate": 0.0001, + "loss": 3.7965, + "loss/crossentropy": 1.957255482673645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20039385557174683, + "step": 14396 + }, + { + "epoch": 0.28796, + "grad_norm": 2.0625, + "grad_norm_var": 0.009897613525390625, + "learning_rate": 0.0001, + "loss": 3.8525, + "loss/crossentropy": 1.8001562356948853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2059212401509285, + "step": 14398 + }, + { + "epoch": 0.288, + "grad_norm": 2.125, + "grad_norm_var": 0.011177571614583333, + "learning_rate": 0.0001, + "loss": 4.1749, + "loss/crossentropy": 1.9166946411132812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19456689059734344, + "step": 14400 + }, + { + "epoch": 0.28804, + "grad_norm": 2.0625, + "grad_norm_var": 0.01221923828125, + "learning_rate": 0.0001, + "loss": 4.3302, + "loss/crossentropy": 2.033670485019684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24448300898075104, + "step": 14402 + }, + { + "epoch": 0.28808, + "grad_norm": 1.9140625, + "grad_norm_var": 0.012580362955729167, + "learning_rate": 0.0001, + "loss": 3.8679, + "loss/crossentropy": 2.2668023705482483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20903322845697403, + "step": 14404 + }, + { + "epoch": 0.28812, + "grad_norm": 1.9453125, + "grad_norm_var": 0.012532297770182292, + "learning_rate": 0.0001, + "loss": 4.0638, + "loss/crossentropy": 2.1076024770736694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20488093793392181, + "step": 14406 + }, + { + "epoch": 0.28816, + "grad_norm": 2.078125, + "grad_norm_var": 0.012117258707682292, + "learning_rate": 0.0001, + "loss": 4.0707, + "loss/crossentropy": 1.996046781539917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20725766569375992, + "step": 14408 + }, + { + "epoch": 0.2882, + "grad_norm": 2.046875, + "grad_norm_var": 0.008365885416666666, + "learning_rate": 0.0001, + "loss": 4.2302, + "loss/crossentropy": 2.2031015157699585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19491394609212875, + "step": 14410 + }, + { + "epoch": 0.28824, + "grad_norm": 2.109375, + "grad_norm_var": 0.008714803059895833, + "learning_rate": 0.0001, + "loss": 4.3962, + "loss/crossentropy": 2.0339081287384033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21979261189699173, + "step": 14412 + }, + { + "epoch": 0.28828, + "grad_norm": 1.8515625, + "grad_norm_var": 0.011641438802083333, + "learning_rate": 0.0001, + "loss": 4.0042, + "loss/crossentropy": 1.9283286929130554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19970305263996124, + "step": 14414 + }, + { + "epoch": 0.28832, + "grad_norm": 2.046875, + "grad_norm_var": 0.011156972249348958, + "learning_rate": 0.0001, + "loss": 4.3879, + "loss/crossentropy": 2.1092429161071777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2114599123597145, + "step": 14416 + }, + { + "epoch": 0.28836, + "grad_norm": 1.9765625, + "grad_norm_var": 0.01004638671875, + "learning_rate": 0.0001, + "loss": 4.1234, + "loss/crossentropy": 1.8102002143859863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18407940864562988, + "step": 14418 + }, + { + "epoch": 0.2884, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008906809488932292, + "learning_rate": 0.0001, + "loss": 4.0943, + "loss/crossentropy": 2.3192719221115112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22208110988140106, + "step": 14420 + }, + { + "epoch": 0.28844, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0082183837890625, + "learning_rate": 0.0001, + "loss": 4.2268, + "loss/crossentropy": 2.2302430868148804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21011962741613388, + "step": 14422 + }, + { + "epoch": 0.28848, + "grad_norm": 1.9765625, + "grad_norm_var": 0.008542633056640625, + "learning_rate": 0.0001, + "loss": 4.1235, + "loss/crossentropy": 2.3015987873077393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2012081891298294, + "step": 14424 + }, + { + "epoch": 0.28852, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0077056884765625, + "learning_rate": 0.0001, + "loss": 4.3094, + "loss/crossentropy": 2.060473084449768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20174618810415268, + "step": 14426 + }, + { + "epoch": 0.28856, + "grad_norm": 2.3125, + "grad_norm_var": 0.014082845052083333, + "learning_rate": 0.0001, + "loss": 3.9594, + "loss/crossentropy": 1.8146299719810486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18100766092538834, + "step": 14428 + }, + { + "epoch": 0.2886, + "grad_norm": 2.0, + "grad_norm_var": 0.011277008056640624, + "learning_rate": 0.0001, + "loss": 3.8444, + "loss/crossentropy": 2.1504935026168823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20493116229772568, + "step": 14430 + }, + { + "epoch": 0.28864, + "grad_norm": 2.0, + "grad_norm_var": 0.010796864827473959, + "learning_rate": 0.0001, + "loss": 4.1153, + "loss/crossentropy": 2.4947997331619263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2339334562420845, + "step": 14432 + }, + { + "epoch": 0.28868, + "grad_norm": 2.3125, + "grad_norm_var": 0.01612116495768229, + "learning_rate": 0.0001, + "loss": 4.3011, + "loss/crossentropy": 2.421883702278137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23815617710351944, + "step": 14434 + }, + { + "epoch": 0.28872, + "grad_norm": 2.0, + "grad_norm_var": 0.016600545247395834, + "learning_rate": 0.0001, + "loss": 4.2152, + "loss/crossentropy": 2.093776822090149, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20425530523061752, + "step": 14436 + }, + { + "epoch": 0.28876, + "grad_norm": 2.03125, + "grad_norm_var": 0.01817601521809896, + "learning_rate": 0.0001, + "loss": 3.9126, + "loss/crossentropy": 2.02763295173645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1972591131925583, + "step": 14438 + }, + { + "epoch": 0.2888, + "grad_norm": 2.015625, + "grad_norm_var": 0.0176422119140625, + "learning_rate": 0.0001, + "loss": 4.1348, + "loss/crossentropy": 1.9769265055656433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.186862014234066, + "step": 14440 + }, + { + "epoch": 0.28884, + "grad_norm": 1.9375, + "grad_norm_var": 0.019024403889973958, + "learning_rate": 0.0001, + "loss": 4.4997, + "loss/crossentropy": 2.34747314453125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22067507356405258, + "step": 14442 + }, + { + "epoch": 0.28888, + "grad_norm": 1.921875, + "grad_norm_var": 0.0128570556640625, + "learning_rate": 0.0001, + "loss": 4.1497, + "loss/crossentropy": 2.1036205887794495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2231181040406227, + "step": 14444 + }, + { + "epoch": 0.28892, + "grad_norm": 1.9140625, + "grad_norm_var": 0.01638768513997396, + "learning_rate": 0.0001, + "loss": 4.251, + "loss/crossentropy": 2.2350860834121704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23475627601146698, + "step": 14446 + }, + { + "epoch": 0.28896, + "grad_norm": 2.09375, + "grad_norm_var": 0.01778132120768229, + "learning_rate": 0.0001, + "loss": 4.2095, + "loss/crossentropy": 2.2336645126342773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20643477141857147, + "step": 14448 + }, + { + "epoch": 0.289, + "grad_norm": 1.9765625, + "grad_norm_var": 0.012923177083333333, + "learning_rate": 0.0001, + "loss": 4.0793, + "loss/crossentropy": 2.345365524291992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20650553703308105, + "step": 14450 + }, + { + "epoch": 0.28904, + "grad_norm": 2.125, + "grad_norm_var": 0.0120025634765625, + "learning_rate": 0.0001, + "loss": 4.131, + "loss/crossentropy": 2.307543635368347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21354079991579056, + "step": 14452 + }, + { + "epoch": 0.28908, + "grad_norm": 1.9921875, + "grad_norm_var": 0.012001291910807291, + "learning_rate": 0.0001, + "loss": 4.1006, + "loss/crossentropy": 1.8094390034675598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1850133240222931, + "step": 14454 + }, + { + "epoch": 0.28912, + "grad_norm": 2.046875, + "grad_norm_var": 0.011424763997395834, + "learning_rate": 0.0001, + "loss": 4.1326, + "loss/crossentropy": 1.859872817993164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.226328507065773, + "step": 14456 + }, + { + "epoch": 0.28916, + "grad_norm": 1.9921875, + "grad_norm_var": 0.009340159098307292, + "learning_rate": 0.0001, + "loss": 3.9997, + "loss/crossentropy": 1.9507490396499634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18718113750219345, + "step": 14458 + }, + { + "epoch": 0.2892, + "grad_norm": 1.859375, + "grad_norm_var": 0.01126708984375, + "learning_rate": 0.0001, + "loss": 3.7868, + "loss/crossentropy": 1.6642532348632812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1674812287092209, + "step": 14460 + }, + { + "epoch": 0.28924, + "grad_norm": 1.859375, + "grad_norm_var": 0.007039133707682292, + "learning_rate": 0.0001, + "loss": 3.9523, + "loss/crossentropy": 1.9071928262710571, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18480891734361649, + "step": 14462 + }, + { + "epoch": 0.28928, + "grad_norm": 1.96875, + "grad_norm_var": 0.005718739827473959, + "learning_rate": 0.0001, + "loss": 4.2876, + "loss/crossentropy": 2.0758568048477173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2002219259738922, + "step": 14464 + }, + { + "epoch": 0.28932, + "grad_norm": 2.078125, + "grad_norm_var": 0.006769816080729167, + "learning_rate": 0.0001, + "loss": 3.9438, + "loss/crossentropy": 1.9777529835700989, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19538866728544235, + "step": 14466 + }, + { + "epoch": 0.28936, + "grad_norm": 2.0625, + "grad_norm_var": 0.0062652587890625, + "learning_rate": 0.0001, + "loss": 4.1262, + "loss/crossentropy": 2.008077323436737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20578377693891525, + "step": 14468 + }, + { + "epoch": 0.2894, + "grad_norm": 1.8359375, + "grad_norm_var": 0.006673177083333333, + "learning_rate": 0.0001, + "loss": 3.9341, + "loss/crossentropy": 2.011266529560089, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20298370718955994, + "step": 14470 + }, + { + "epoch": 0.28944, + "grad_norm": 1.9375, + "grad_norm_var": 0.0057769775390625, + "learning_rate": 0.0001, + "loss": 4.201, + "loss/crossentropy": 2.231672167778015, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21847998350858688, + "step": 14472 + }, + { + "epoch": 0.28948, + "grad_norm": 1.984375, + "grad_norm_var": 0.007279205322265625, + "learning_rate": 0.0001, + "loss": 4.382, + "loss/crossentropy": 2.2268325090408325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22168640047311783, + "step": 14474 + }, + { + "epoch": 0.28952, + "grad_norm": 1.9921875, + "grad_norm_var": 0.012105305989583334, + "learning_rate": 0.0001, + "loss": 4.2193, + "loss/crossentropy": 2.2705942392349243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2110700011253357, + "step": 14476 + }, + { + "epoch": 0.28956, + "grad_norm": 1.8203125, + "grad_norm_var": 0.012566884358723959, + "learning_rate": 0.0001, + "loss": 3.8754, + "loss/crossentropy": 1.893052339553833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18865809589624405, + "step": 14478 + }, + { + "epoch": 0.2896, + "grad_norm": 1.84375, + "grad_norm_var": 0.013732655843098959, + "learning_rate": 0.0001, + "loss": 4.0989, + "loss/crossentropy": 2.020704984664917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18822921812534332, + "step": 14480 + }, + { + "epoch": 0.28964, + "grad_norm": 2.03125, + "grad_norm_var": 0.014632161458333333, + "learning_rate": 0.0001, + "loss": 4.1651, + "loss/crossentropy": 1.762313961982727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.204545259475708, + "step": 14482 + }, + { + "epoch": 0.28968, + "grad_norm": 1.9921875, + "grad_norm_var": 0.013158162434895834, + "learning_rate": 0.0001, + "loss": 4.0261, + "loss/crossentropy": 1.8793032765388489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17913836985826492, + "step": 14484 + }, + { + "epoch": 0.28972, + "grad_norm": 1.890625, + "grad_norm_var": 0.013372548421223958, + "learning_rate": 0.0001, + "loss": 4.2731, + "loss/crossentropy": 2.428762197494507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23766332119703293, + "step": 14486 + }, + { + "epoch": 0.28976, + "grad_norm": 2.046875, + "grad_norm_var": 0.014969635009765624, + "learning_rate": 0.0001, + "loss": 4.1134, + "loss/crossentropy": 2.2160138487815857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21619082987308502, + "step": 14488 + }, + { + "epoch": 0.2898, + "grad_norm": 2.046875, + "grad_norm_var": 0.0150787353515625, + "learning_rate": 0.0001, + "loss": 4.1049, + "loss/crossentropy": 1.8169404864311218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1981067657470703, + "step": 14490 + }, + { + "epoch": 0.28984, + "grad_norm": 1.7890625, + "grad_norm_var": 0.0131591796875, + "learning_rate": 0.0001, + "loss": 4.1456, + "loss/crossentropy": 2.147166609764099, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1993183195590973, + "step": 14492 + }, + { + "epoch": 0.28988, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011362457275390625, + "learning_rate": 0.0001, + "loss": 3.8507, + "loss/crossentropy": 1.7740440964698792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18381935358047485, + "step": 14494 + }, + { + "epoch": 0.28992, + "grad_norm": 1.921875, + "grad_norm_var": 0.010428619384765626, + "learning_rate": 0.0001, + "loss": 4.0327, + "loss/crossentropy": 2.206219792366028, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21052061766386032, + "step": 14496 + }, + { + "epoch": 0.28996, + "grad_norm": 2.015625, + "grad_norm_var": 0.008600870768229166, + "learning_rate": 0.0001, + "loss": 4.4421, + "loss/crossentropy": 2.4030661582946777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22094716131687164, + "step": 14498 + }, + { + "epoch": 0.29, + "grad_norm": 2.25, + "grad_norm_var": 0.016633097330729166, + "learning_rate": 0.0001, + "loss": 4.4593, + "loss/crossentropy": 2.142518997192383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2111787647008896, + "step": 14500 + }, + { + "epoch": 0.29004, + "grad_norm": 1.953125, + "grad_norm_var": 0.015633138020833333, + "learning_rate": 0.0001, + "loss": 4.3679, + "loss/crossentropy": 1.9228865504264832, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19389715045690536, + "step": 14502 + }, + { + "epoch": 0.29008, + "grad_norm": 2.09375, + "grad_norm_var": 0.014141591389973958, + "learning_rate": 0.0001, + "loss": 4.011, + "loss/crossentropy": 1.789183259010315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18835698068141937, + "step": 14504 + }, + { + "epoch": 0.29012, + "grad_norm": 1.9375, + "grad_norm_var": 0.013798014322916666, + "learning_rate": 0.0001, + "loss": 4.2862, + "loss/crossentropy": 2.3876765966415405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23156627267599106, + "step": 14506 + }, + { + "epoch": 0.29016, + "grad_norm": 2.078125, + "grad_norm_var": 0.010151926676432292, + "learning_rate": 0.0001, + "loss": 4.2024, + "loss/crossentropy": 2.055350124835968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1995788812637329, + "step": 14508 + }, + { + "epoch": 0.2902, + "grad_norm": 1.96875, + "grad_norm_var": 0.010007476806640625, + "learning_rate": 0.0001, + "loss": 4.343, + "loss/crossentropy": 2.2045267820358276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2197813093662262, + "step": 14510 + }, + { + "epoch": 0.29024, + "grad_norm": 1.765625, + "grad_norm_var": 0.014503733317057291, + "learning_rate": 0.0001, + "loss": 3.7136, + "loss/crossentropy": 1.9596665501594543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17329587787389755, + "step": 14512 + }, + { + "epoch": 0.29028, + "grad_norm": 1.9921875, + "grad_norm_var": 0.017465972900390626, + "learning_rate": 0.0001, + "loss": 3.9546, + "loss/crossentropy": 1.8273005485534668, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17768846452236176, + "step": 14514 + }, + { + "epoch": 0.29032, + "grad_norm": 1.8359375, + "grad_norm_var": 0.011739095052083334, + "learning_rate": 0.0001, + "loss": 3.9924, + "loss/crossentropy": 2.0428807735443115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2104019746184349, + "step": 14516 + }, + { + "epoch": 0.29036, + "grad_norm": 2.21875, + "grad_norm_var": 0.014216868082682292, + "learning_rate": 0.0001, + "loss": 4.3591, + "loss/crossentropy": 2.1242733001708984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2153768166899681, + "step": 14518 + }, + { + "epoch": 0.2904, + "grad_norm": 1.875, + "grad_norm_var": 0.013944244384765625, + "learning_rate": 0.0001, + "loss": 4.0831, + "loss/crossentropy": 1.9223615527153015, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18382133543491364, + "step": 14520 + }, + { + "epoch": 0.29044, + "grad_norm": 1.9453125, + "grad_norm_var": 0.013993326822916667, + "learning_rate": 0.0001, + "loss": 3.9527, + "loss/crossentropy": 2.12492972612381, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19818739593029022, + "step": 14522 + }, + { + "epoch": 0.29048, + "grad_norm": 1.9609375, + "grad_norm_var": 0.013665517171223959, + "learning_rate": 0.0001, + "loss": 4.1539, + "loss/crossentropy": 2.0251020789146423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19633153080940247, + "step": 14524 + }, + { + "epoch": 0.29052, + "grad_norm": 2.1875, + "grad_norm_var": 0.014949544270833334, + "learning_rate": 0.0001, + "loss": 4.192, + "loss/crossentropy": 2.177084445953369, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22618898749351501, + "step": 14526 + }, + { + "epoch": 0.29056, + "grad_norm": 2.0625, + "grad_norm_var": 0.013997141520182292, + "learning_rate": 0.0001, + "loss": 3.9215, + "loss/crossentropy": 1.718012809753418, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18632279336452484, + "step": 14528 + }, + { + "epoch": 0.2906, + "grad_norm": 1.9609375, + "grad_norm_var": 0.017850494384765624, + "learning_rate": 0.0001, + "loss": 4.2869, + "loss/crossentropy": 1.7026863098144531, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.211147278547287, + "step": 14530 + }, + { + "epoch": 0.29064, + "grad_norm": 1.8671875, + "grad_norm_var": 0.017682902018229165, + "learning_rate": 0.0001, + "loss": 3.9369, + "loss/crossentropy": 1.6652680039405823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16967280209064484, + "step": 14532 + }, + { + "epoch": 0.29068, + "grad_norm": 1.9296875, + "grad_norm_var": 0.013996378580729166, + "learning_rate": 0.0001, + "loss": 4.0815, + "loss/crossentropy": 2.191527247428894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20215752720832825, + "step": 14534 + }, + { + "epoch": 0.29072, + "grad_norm": 2.125, + "grad_norm_var": 0.015927886962890624, + "learning_rate": 0.0001, + "loss": 3.8817, + "loss/crossentropy": 1.8385645747184753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19906776398420334, + "step": 14536 + }, + { + "epoch": 0.29076, + "grad_norm": 1.8671875, + "grad_norm_var": 0.01665013631184896, + "learning_rate": 0.0001, + "loss": 3.9186, + "loss/crossentropy": 2.0456249117851257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2057228460907936, + "step": 14538 + }, + { + "epoch": 0.2908, + "grad_norm": 1.84375, + "grad_norm_var": 0.018651326497395832, + "learning_rate": 0.0001, + "loss": 4.0636, + "loss/crossentropy": 2.0413625836372375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20096366852521896, + "step": 14540 + }, + { + "epoch": 0.29084, + "grad_norm": 1.9609375, + "grad_norm_var": 0.016755167643229166, + "learning_rate": 0.0001, + "loss": 3.7819, + "loss/crossentropy": 1.723297357559204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.180135078728199, + "step": 14542 + }, + { + "epoch": 0.29088, + "grad_norm": 2.078125, + "grad_norm_var": 0.01578343709309896, + "learning_rate": 0.0001, + "loss": 4.2115, + "loss/crossentropy": 2.3143200874328613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2289530113339424, + "step": 14544 + }, + { + "epoch": 0.29092, + "grad_norm": 2.046875, + "grad_norm_var": 0.010017903645833333, + "learning_rate": 0.0001, + "loss": 4.408, + "loss/crossentropy": 2.1612138748168945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21224550902843475, + "step": 14546 + }, + { + "epoch": 0.29096, + "grad_norm": 1.921875, + "grad_norm_var": 0.009227498372395834, + "learning_rate": 0.0001, + "loss": 4.1101, + "loss/crossentropy": 2.0591673851013184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18913161754608154, + "step": 14548 + }, + { + "epoch": 0.291, + "grad_norm": 1.875, + "grad_norm_var": 0.009505208333333333, + "learning_rate": 0.0001, + "loss": 4.132, + "loss/crossentropy": 2.0901013016700745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19315654039382935, + "step": 14550 + }, + { + "epoch": 0.29104, + "grad_norm": 1.9296875, + "grad_norm_var": 0.006833648681640625, + "learning_rate": 0.0001, + "loss": 3.9351, + "loss/crossentropy": 2.0716471672058105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1951451674103737, + "step": 14552 + }, + { + "epoch": 0.29108, + "grad_norm": 1.984375, + "grad_norm_var": 0.008186848958333333, + "learning_rate": 0.0001, + "loss": 4.3656, + "loss/crossentropy": 2.084408760070801, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21354945749044418, + "step": 14554 + }, + { + "epoch": 0.29112, + "grad_norm": 1.9453125, + "grad_norm_var": 0.006486002604166667, + "learning_rate": 0.0001, + "loss": 4.0342, + "loss/crossentropy": 2.154771566390991, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20277246832847595, + "step": 14556 + }, + { + "epoch": 0.29116, + "grad_norm": 2.09375, + "grad_norm_var": 0.0069353739420572914, + "learning_rate": 0.0001, + "loss": 4.259, + "loss/crossentropy": 2.24834144115448, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23204077035188675, + "step": 14558 + }, + { + "epoch": 0.2912, + "grad_norm": 1.859375, + "grad_norm_var": 0.006951649983723958, + "learning_rate": 0.0001, + "loss": 3.8054, + "loss/crossentropy": 1.8239850401878357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18208947032690048, + "step": 14560 + }, + { + "epoch": 0.29124, + "grad_norm": 1.90625, + "grad_norm_var": 0.006701405843098958, + "learning_rate": 0.0001, + "loss": 3.9465, + "loss/crossentropy": 1.969277262687683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2093023955821991, + "step": 14562 + }, + { + "epoch": 0.29128, + "grad_norm": 2.046875, + "grad_norm_var": 0.007670084635416667, + "learning_rate": 0.0001, + "loss": 3.7176, + "loss/crossentropy": 1.561119556427002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1653747707605362, + "step": 14564 + }, + { + "epoch": 0.29132, + "grad_norm": 2.0625, + "grad_norm_var": 0.008154042561848958, + "learning_rate": 0.0001, + "loss": 4.3307, + "loss/crossentropy": 2.2341216802597046, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23967482149600983, + "step": 14566 + }, + { + "epoch": 0.29136, + "grad_norm": 1.921875, + "grad_norm_var": 0.008227284749348958, + "learning_rate": 0.0001, + "loss": 3.9899, + "loss/crossentropy": 1.999854326248169, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19598717242479324, + "step": 14568 + }, + { + "epoch": 0.2914, + "grad_norm": 1.953125, + "grad_norm_var": 0.006898752848307292, + "learning_rate": 0.0001, + "loss": 4.1262, + "loss/crossentropy": 1.8079020380973816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1919228658080101, + "step": 14570 + }, + { + "epoch": 0.29144, + "grad_norm": 2.109375, + "grad_norm_var": 0.009970855712890626, + "learning_rate": 0.0001, + "loss": 3.9946, + "loss/crossentropy": 2.044901430606842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21737617254257202, + "step": 14572 + }, + { + "epoch": 0.29148, + "grad_norm": 2.03125, + "grad_norm_var": 0.009806060791015625, + "learning_rate": 0.0001, + "loss": 4.0498, + "loss/crossentropy": 1.972197949886322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20828764885663986, + "step": 14574 + }, + { + "epoch": 0.29152, + "grad_norm": 2.015625, + "grad_norm_var": 0.010131581624348959, + "learning_rate": 0.0001, + "loss": 4.0011, + "loss/crossentropy": 1.9129782915115356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17108886688947678, + "step": 14576 + }, + { + "epoch": 0.29156, + "grad_norm": 2.109375, + "grad_norm_var": 0.01068115234375, + "learning_rate": 0.0001, + "loss": 4.1458, + "loss/crossentropy": 2.0545085072517395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20631857216358185, + "step": 14578 + }, + { + "epoch": 0.2916, + "grad_norm": 2.015625, + "grad_norm_var": 0.012654368082682292, + "learning_rate": 0.0001, + "loss": 4.355, + "loss/crossentropy": 2.0274637937545776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2099764049053192, + "step": 14580 + }, + { + "epoch": 0.29164, + "grad_norm": 1.875, + "grad_norm_var": 0.013765207926432292, + "learning_rate": 0.0001, + "loss": 3.9014, + "loss/crossentropy": 1.8154139518737793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19045238196849823, + "step": 14582 + }, + { + "epoch": 0.29168, + "grad_norm": 2.0625, + "grad_norm_var": 0.014937082926432291, + "learning_rate": 0.0001, + "loss": 4.0327, + "loss/crossentropy": 2.061666965484619, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21272550523281097, + "step": 14584 + }, + { + "epoch": 0.29172, + "grad_norm": 2.03125, + "grad_norm_var": 0.01580785115559896, + "learning_rate": 0.0001, + "loss": 4.6019, + "loss/crossentropy": 2.3831146955490112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22327116876840591, + "step": 14586 + }, + { + "epoch": 0.29176, + "grad_norm": 2.046875, + "grad_norm_var": 0.013090006510416667, + "learning_rate": 0.0001, + "loss": 4.2142, + "loss/crossentropy": 1.8024229407310486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1871618777513504, + "step": 14588 + }, + { + "epoch": 0.2918, + "grad_norm": 2.0625, + "grad_norm_var": 0.014967600504557291, + "learning_rate": 0.0001, + "loss": 4.1611, + "loss/crossentropy": 2.289118528366089, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22116435319185257, + "step": 14590 + }, + { + "epoch": 0.29184, + "grad_norm": 1.9765625, + "grad_norm_var": 0.013277180989583333, + "learning_rate": 0.0001, + "loss": 4.1358, + "loss/crossentropy": 2.11151522397995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22029711306095123, + "step": 14592 + }, + { + "epoch": 0.29188, + "grad_norm": 2.125, + "grad_norm_var": 0.013288370768229167, + "learning_rate": 0.0001, + "loss": 4.3103, + "loss/crossentropy": 1.7856897711753845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1815958321094513, + "step": 14594 + }, + { + "epoch": 0.29192, + "grad_norm": 2.078125, + "grad_norm_var": 0.0101959228515625, + "learning_rate": 0.0001, + "loss": 4.1085, + "loss/crossentropy": 1.9745690822601318, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2074003368616104, + "step": 14596 + }, + { + "epoch": 0.29196, + "grad_norm": 1.953125, + "grad_norm_var": 0.0091461181640625, + "learning_rate": 0.0001, + "loss": 4.3034, + "loss/crossentropy": 1.9199401140213013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19998926669359207, + "step": 14598 + }, + { + "epoch": 0.292, + "grad_norm": 1.953125, + "grad_norm_var": 0.007743072509765625, + "learning_rate": 0.0001, + "loss": 4.0842, + "loss/crossentropy": 1.9938844442367554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20138052105903625, + "step": 14600 + }, + { + "epoch": 0.29204, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0059397379557291664, + "learning_rate": 0.0001, + "loss": 4.0025, + "loss/crossentropy": 1.6290993094444275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18716775625944138, + "step": 14602 + }, + { + "epoch": 0.29208, + "grad_norm": 1.875, + "grad_norm_var": 0.006115468343098959, + "learning_rate": 0.0001, + "loss": 3.9953, + "loss/crossentropy": 2.116807520389557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2113799750804901, + "step": 14604 + }, + { + "epoch": 0.29212, + "grad_norm": 2.140625, + "grad_norm_var": 0.006392415364583333, + "learning_rate": 0.0001, + "loss": 4.0527, + "loss/crossentropy": 1.6732030510902405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19776474684476852, + "step": 14606 + }, + { + "epoch": 0.29216, + "grad_norm": 2.03125, + "grad_norm_var": 0.006762440999348958, + "learning_rate": 0.0001, + "loss": 4.2572, + "loss/crossentropy": 1.9403682351112366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18560070544481277, + "step": 14608 + }, + { + "epoch": 0.2922, + "grad_norm": 2.3125, + "grad_norm_var": 0.011641438802083333, + "learning_rate": 0.0001, + "loss": 4.2238, + "loss/crossentropy": 1.9337440729141235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2172246277332306, + "step": 14610 + }, + { + "epoch": 0.29224, + "grad_norm": 2.0625, + "grad_norm_var": 0.011836751302083334, + "learning_rate": 0.0001, + "loss": 4.1839, + "loss/crossentropy": 2.1603177785873413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2076597660779953, + "step": 14612 + }, + { + "epoch": 0.29228, + "grad_norm": 1.953125, + "grad_norm_var": 0.0118804931640625, + "learning_rate": 0.0001, + "loss": 4.1103, + "loss/crossentropy": 1.979960322380066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2007075995206833, + "step": 14614 + }, + { + "epoch": 0.29232, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0134429931640625, + "learning_rate": 0.0001, + "loss": 3.9901, + "loss/crossentropy": 2.1262378096580505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2033269852399826, + "step": 14616 + }, + { + "epoch": 0.29236, + "grad_norm": 1.953125, + "grad_norm_var": 0.013327789306640626, + "learning_rate": 0.0001, + "loss": 3.9001, + "loss/crossentropy": 1.836608648300171, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17032578587532043, + "step": 14618 + }, + { + "epoch": 0.2924, + "grad_norm": 1.984375, + "grad_norm_var": 0.012473297119140626, + "learning_rate": 0.0001, + "loss": 4.0165, + "loss/crossentropy": 1.784467339515686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.187791109085083, + "step": 14620 + }, + { + "epoch": 0.29244, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010196940104166666, + "learning_rate": 0.0001, + "loss": 4.2963, + "loss/crossentropy": 2.5023289918899536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2469349130988121, + "step": 14622 + }, + { + "epoch": 0.29248, + "grad_norm": 2.1875, + "grad_norm_var": 0.0142974853515625, + "learning_rate": 0.0001, + "loss": 4.3748, + "loss/crossentropy": 2.244086742401123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20690800249576569, + "step": 14624 + }, + { + "epoch": 0.29252, + "grad_norm": 2.109375, + "grad_norm_var": 0.008829752604166666, + "learning_rate": 0.0001, + "loss": 4.2594, + "loss/crossentropy": 2.167649209499359, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23074676096439362, + "step": 14626 + }, + { + "epoch": 0.29256, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0092437744140625, + "learning_rate": 0.0001, + "loss": 4.1837, + "loss/crossentropy": 1.934467613697052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2041843980550766, + "step": 14628 + }, + { + "epoch": 0.2926, + "grad_norm": 2.03125, + "grad_norm_var": 0.0152740478515625, + "learning_rate": 0.0001, + "loss": 4.3277, + "loss/crossentropy": 2.457708954811096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24325969815254211, + "step": 14630 + }, + { + "epoch": 0.29264, + "grad_norm": 2.015625, + "grad_norm_var": 0.014446767171223958, + "learning_rate": 0.0001, + "loss": 4.0523, + "loss/crossentropy": 2.165565609931946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20386752486228943, + "step": 14632 + }, + { + "epoch": 0.29268, + "grad_norm": 2.0625, + "grad_norm_var": 0.013677724202473958, + "learning_rate": 0.0001, + "loss": 4.0767, + "loss/crossentropy": 1.9475921988487244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2017974555492401, + "step": 14634 + }, + { + "epoch": 0.29272, + "grad_norm": 1.8203125, + "grad_norm_var": 0.015819295247395834, + "learning_rate": 0.0001, + "loss": 4.1435, + "loss/crossentropy": 1.95538991689682, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19920305162668228, + "step": 14636 + }, + { + "epoch": 0.29276, + "grad_norm": 2.234375, + "grad_norm_var": 0.01803766886393229, + "learning_rate": 0.0001, + "loss": 4.4073, + "loss/crossentropy": 2.1813108921051025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2130545824766159, + "step": 14638 + }, + { + "epoch": 0.2928, + "grad_norm": 1.8515625, + "grad_norm_var": 0.01672337849934896, + "learning_rate": 0.0001, + "loss": 4.1841, + "loss/crossentropy": 2.198926568031311, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21660766005516052, + "step": 14640 + }, + { + "epoch": 0.29284, + "grad_norm": 2.015625, + "grad_norm_var": 0.01692682902018229, + "learning_rate": 0.0001, + "loss": 4.2872, + "loss/crossentropy": 2.244659662246704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22478342056274414, + "step": 14642 + }, + { + "epoch": 0.29288, + "grad_norm": 1.984375, + "grad_norm_var": 0.01608861287434896, + "learning_rate": 0.0001, + "loss": 4.234, + "loss/crossentropy": 2.1159621477127075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22023546695709229, + "step": 14644 + }, + { + "epoch": 0.29292, + "grad_norm": 1.859375, + "grad_norm_var": 0.011067708333333334, + "learning_rate": 0.0001, + "loss": 3.9166, + "loss/crossentropy": 2.2336114645004272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20683540403842926, + "step": 14646 + }, + { + "epoch": 0.29296, + "grad_norm": 2.078125, + "grad_norm_var": 0.011128743489583334, + "learning_rate": 0.0001, + "loss": 3.8647, + "loss/crossentropy": 1.9015604257583618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2791958376765251, + "step": 14648 + }, + { + "epoch": 0.293, + "grad_norm": 1.9140625, + "grad_norm_var": 0.011180623372395834, + "learning_rate": 0.0001, + "loss": 3.7325, + "loss/crossentropy": 1.6337950229644775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17279411852359772, + "step": 14650 + }, + { + "epoch": 0.29304, + "grad_norm": 2.0625, + "grad_norm_var": 0.009618123372395834, + "learning_rate": 0.0001, + "loss": 4.111, + "loss/crossentropy": 1.920238435268402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18912465870380402, + "step": 14652 + }, + { + "epoch": 0.29308, + "grad_norm": 2.046875, + "grad_norm_var": 0.006075032552083333, + "learning_rate": 0.0001, + "loss": 4.4346, + "loss/crossentropy": 2.5529074668884277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23994534462690353, + "step": 14654 + }, + { + "epoch": 0.29312, + "grad_norm": 1.953125, + "grad_norm_var": 0.0100982666015625, + "learning_rate": 0.0001, + "loss": 4.3154, + "loss/crossentropy": 2.1320537328720093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21480896323919296, + "step": 14656 + }, + { + "epoch": 0.29316, + "grad_norm": 1.8046875, + "grad_norm_var": 0.013205718994140626, + "learning_rate": 0.0001, + "loss": 3.867, + "loss/crossentropy": 1.8921156525611877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1749916449189186, + "step": 14658 + }, + { + "epoch": 0.2932, + "grad_norm": 1.9140625, + "grad_norm_var": 0.015433502197265626, + "learning_rate": 0.0001, + "loss": 3.7405, + "loss/crossentropy": 2.2475300431251526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20675917714834213, + "step": 14660 + }, + { + "epoch": 0.29324, + "grad_norm": 2.046875, + "grad_norm_var": 0.015384928385416666, + "learning_rate": 0.0001, + "loss": 4.2091, + "loss/crossentropy": 2.291227698326111, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2244400456547737, + "step": 14662 + }, + { + "epoch": 0.29328, + "grad_norm": 2.09375, + "grad_norm_var": 0.017976888020833335, + "learning_rate": 0.0001, + "loss": 4.1468, + "loss/crossentropy": 1.8830837607383728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21137318760156631, + "step": 14664 + }, + { + "epoch": 0.29332, + "grad_norm": 2.078125, + "grad_norm_var": 0.019742838541666665, + "learning_rate": 0.0001, + "loss": 4.3424, + "loss/crossentropy": 2.366846203804016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22360631078481674, + "step": 14666 + }, + { + "epoch": 0.29336, + "grad_norm": 1.875, + "grad_norm_var": 0.021720123291015626, + "learning_rate": 0.0001, + "loss": 4.1778, + "loss/crossentropy": 2.165378987789154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2088456228375435, + "step": 14668 + }, + { + "epoch": 0.2934, + "grad_norm": 1.9609375, + "grad_norm_var": 0.021809895833333332, + "learning_rate": 0.0001, + "loss": 4.178, + "loss/crossentropy": 1.782981276512146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17654509842395782, + "step": 14670 + }, + { + "epoch": 0.29344, + "grad_norm": 2.28125, + "grad_norm_var": 0.021581013997395832, + "learning_rate": 0.0001, + "loss": 4.0796, + "loss/crossentropy": 1.9891030192375183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18961766362190247, + "step": 14672 + }, + { + "epoch": 0.29348, + "grad_norm": 1.9921875, + "grad_norm_var": 0.015738932291666667, + "learning_rate": 0.0001, + "loss": 4.0359, + "loss/crossentropy": 1.9350959062576294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2124071717262268, + "step": 14674 + }, + { + "epoch": 0.29352, + "grad_norm": 2.21875, + "grad_norm_var": 0.01131591796875, + "learning_rate": 0.0001, + "loss": 4.2651, + "loss/crossentropy": 2.1763634085655212, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22027570754289627, + "step": 14676 + }, + { + "epoch": 0.29356, + "grad_norm": 1.9453125, + "grad_norm_var": 0.012499745686848958, + "learning_rate": 0.0001, + "loss": 4.1502, + "loss/crossentropy": 1.741381287574768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17308862507343292, + "step": 14678 + }, + { + "epoch": 0.2936, + "grad_norm": 2.03125, + "grad_norm_var": 0.011633046468098958, + "learning_rate": 0.0001, + "loss": 4.2012, + "loss/crossentropy": 2.352774977684021, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20929308235645294, + "step": 14680 + }, + { + "epoch": 0.29364, + "grad_norm": 2.0625, + "grad_norm_var": 0.012679036458333333, + "learning_rate": 0.0001, + "loss": 3.9799, + "loss/crossentropy": 1.6342085003852844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17740759253501892, + "step": 14682 + }, + { + "epoch": 0.29368, + "grad_norm": 2.109375, + "grad_norm_var": 0.0110260009765625, + "learning_rate": 0.0001, + "loss": 4.1968, + "loss/crossentropy": 2.315647602081299, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2202264443039894, + "step": 14684 + }, + { + "epoch": 0.29372, + "grad_norm": 2.09375, + "grad_norm_var": 0.011860911051432292, + "learning_rate": 0.0001, + "loss": 4.0276, + "loss/crossentropy": 2.1667529344558716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20557944476604462, + "step": 14686 + }, + { + "epoch": 0.29376, + "grad_norm": 1.8359375, + "grad_norm_var": 0.012970987955729167, + "learning_rate": 0.0001, + "loss": 3.7212, + "loss/crossentropy": 1.9426026940345764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17865119129419327, + "step": 14688 + }, + { + "epoch": 0.2938, + "grad_norm": 1.9453125, + "grad_norm_var": 0.013720703125, + "learning_rate": 0.0001, + "loss": 3.9345, + "loss/crossentropy": 2.012951970100403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1918969601392746, + "step": 14690 + }, + { + "epoch": 0.29384, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0097808837890625, + "learning_rate": 0.0001, + "loss": 4.2131, + "loss/crossentropy": 2.2038668394088745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21520675718784332, + "step": 14692 + }, + { + "epoch": 0.29388, + "grad_norm": 1.8828125, + "grad_norm_var": 0.009659576416015624, + "learning_rate": 0.0001, + "loss": 3.9577, + "loss/crossentropy": 2.284528613090515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2084675058722496, + "step": 14694 + }, + { + "epoch": 0.29392, + "grad_norm": 2.09375, + "grad_norm_var": 0.009991200764973958, + "learning_rate": 0.0001, + "loss": 4.1159, + "loss/crossentropy": 2.3619518280029297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23107902705669403, + "step": 14696 + }, + { + "epoch": 0.29396, + "grad_norm": 2.0625, + "grad_norm_var": 0.009870402018229167, + "learning_rate": 0.0001, + "loss": 4.1488, + "loss/crossentropy": 2.279863119125366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21235084533691406, + "step": 14698 + }, + { + "epoch": 0.294, + "grad_norm": 1.9921875, + "grad_norm_var": 0.009639231363932292, + "learning_rate": 0.0001, + "loss": 4.2216, + "loss/crossentropy": 2.128455936908722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21381118893623352, + "step": 14700 + }, + { + "epoch": 0.29404, + "grad_norm": 1.8515625, + "grad_norm_var": 0.009014638264973958, + "learning_rate": 0.0001, + "loss": 4.0147, + "loss/crossentropy": 2.265346884727478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2086036428809166, + "step": 14702 + }, + { + "epoch": 0.29408, + "grad_norm": 2.0, + "grad_norm_var": 0.008421834309895833, + "learning_rate": 0.0001, + "loss": 4.2778, + "loss/crossentropy": 2.4323991537094116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21795648336410522, + "step": 14704 + }, + { + "epoch": 0.29412, + "grad_norm": 2.203125, + "grad_norm_var": 0.30881729125976565, + "learning_rate": 0.0001, + "loss": 4.3864, + "loss/crossentropy": 2.2948319911956787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23302766680717468, + "step": 14706 + }, + { + "epoch": 0.29416, + "grad_norm": 1.9765625, + "grad_norm_var": 0.3063140869140625, + "learning_rate": 0.0001, + "loss": 4.121, + "loss/crossentropy": 2.1273980140686035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2240317091345787, + "step": 14708 + }, + { + "epoch": 0.2942, + "grad_norm": 1.828125, + "grad_norm_var": 0.304357655843099, + "learning_rate": 0.0001, + "loss": 3.7468, + "loss/crossentropy": 2.0593321323394775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19727306067943573, + "step": 14710 + }, + { + "epoch": 0.29424, + "grad_norm": 1.9296875, + "grad_norm_var": 0.3060373942057292, + "learning_rate": 0.0001, + "loss": 3.9979, + "loss/crossentropy": 2.339016914367676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2313404530286789, + "step": 14712 + }, + { + "epoch": 0.29428, + "grad_norm": 1.9921875, + "grad_norm_var": 0.30822652180989585, + "learning_rate": 0.0001, + "loss": 3.9406, + "loss/crossentropy": 2.1587395668029785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20085373520851135, + "step": 14714 + }, + { + "epoch": 0.29432, + "grad_norm": 2.078125, + "grad_norm_var": 0.3103912353515625, + "learning_rate": 0.0001, + "loss": 4.0616, + "loss/crossentropy": 1.8317759037017822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20150133967399597, + "step": 14716 + }, + { + "epoch": 0.29436, + "grad_norm": 1.7890625, + "grad_norm_var": 0.31324437459309895, + "learning_rate": 0.0001, + "loss": 3.9291, + "loss/crossentropy": 1.849327266216278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17959115654230118, + "step": 14718 + }, + { + "epoch": 0.2944, + "grad_norm": 1.8828125, + "grad_norm_var": 0.32099507649739584, + "learning_rate": 0.0001, + "loss": 3.7933, + "loss/crossentropy": 1.652997612953186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17243028432130814, + "step": 14720 + }, + { + "epoch": 0.29444, + "grad_norm": 2.03125, + "grad_norm_var": 0.012995402018229166, + "learning_rate": 0.0001, + "loss": 4.2919, + "loss/crossentropy": 1.968604028224945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20564967393875122, + "step": 14722 + }, + { + "epoch": 0.29448, + "grad_norm": 1.9921875, + "grad_norm_var": 0.007466634114583333, + "learning_rate": 0.0001, + "loss": 4.1211, + "loss/crossentropy": 2.349083185195923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24004054814577103, + "step": 14724 + }, + { + "epoch": 0.29452, + "grad_norm": 1.96875, + "grad_norm_var": 0.006200917561848958, + "learning_rate": 0.0001, + "loss": 4.2078, + "loss/crossentropy": 2.1629676818847656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2260306179523468, + "step": 14726 + }, + { + "epoch": 0.29456, + "grad_norm": 2.265625, + "grad_norm_var": 0.012572224934895833, + "learning_rate": 0.0001, + "loss": 4.5995, + "loss/crossentropy": 2.221126079559326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21054977923631668, + "step": 14728 + }, + { + "epoch": 0.2946, + "grad_norm": 2.234375, + "grad_norm_var": 0.3638987223307292, + "learning_rate": 0.0001, + "loss": 4.2588, + "loss/crossentropy": 2.0333986282348633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27580036222934723, + "step": 14730 + }, + { + "epoch": 0.29464, + "grad_norm": 2.09375, + "grad_norm_var": 0.36292699178059895, + "learning_rate": 0.0001, + "loss": 4.2601, + "loss/crossentropy": 2.0025470852851868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20166245102882385, + "step": 14732 + }, + { + "epoch": 0.29468, + "grad_norm": 2.046875, + "grad_norm_var": 0.35410868326822914, + "learning_rate": 0.0001, + "loss": 4.1044, + "loss/crossentropy": 2.2882933616638184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1972086951136589, + "step": 14734 + }, + { + "epoch": 0.29472, + "grad_norm": 2.125, + "grad_norm_var": 0.3395342508951823, + "learning_rate": 0.0001, + "loss": 4.1616, + "loss/crossentropy": 2.2404085397720337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22015579044818878, + "step": 14736 + }, + { + "epoch": 0.29476, + "grad_norm": 2.234375, + "grad_norm_var": 0.33505223592122396, + "learning_rate": 0.0001, + "loss": 4.0132, + "loss/crossentropy": 1.9309356808662415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2082076519727707, + "step": 14738 + }, + { + "epoch": 0.2948, + "grad_norm": 2.046875, + "grad_norm_var": 0.33227437337239585, + "learning_rate": 0.0001, + "loss": 4.1652, + "loss/crossentropy": 2.2118934988975525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20175088196992874, + "step": 14740 + }, + { + "epoch": 0.29484, + "grad_norm": 2.03125, + "grad_norm_var": 0.36171875, + "learning_rate": 0.0001, + "loss": 4.1204, + "loss/crossentropy": 2.19997900724411, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22470968216657639, + "step": 14742 + }, + { + "epoch": 0.29488, + "grad_norm": 2.09375, + "grad_norm_var": 0.365966796875, + "learning_rate": 0.0001, + "loss": 4.2159, + "loss/crossentropy": 1.8140272498130798, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20032335817813873, + "step": 14744 + }, + { + "epoch": 0.29492, + "grad_norm": 1.9609375, + "grad_norm_var": 0.07648086547851562, + "learning_rate": 0.0001, + "loss": 3.957, + "loss/crossentropy": 2.0233620405197144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20464400947093964, + "step": 14746 + }, + { + "epoch": 0.29496, + "grad_norm": 2.390625, + "grad_norm_var": 0.07643407185872396, + "learning_rate": 0.0001, + "loss": 4.3946, + "loss/crossentropy": 2.3900705575942993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23763202875852585, + "step": 14748 + }, + { + "epoch": 0.295, + "grad_norm": 1.9765625, + "grad_norm_var": 0.07768325805664063, + "learning_rate": 0.0001, + "loss": 4.205, + "loss/crossentropy": 1.9895538687705994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20736829191446304, + "step": 14750 + }, + { + "epoch": 0.29504, + "grad_norm": 2.03125, + "grad_norm_var": 0.0857421875, + "learning_rate": 0.0001, + "loss": 4.1454, + "loss/crossentropy": 1.7315555810928345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1839754432439804, + "step": 14752 + }, + { + "epoch": 0.29508, + "grad_norm": 2.0625, + "grad_norm_var": 0.08105367024739583, + "learning_rate": 0.0001, + "loss": 4.5482, + "loss/crossentropy": 2.44241464138031, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2505839467048645, + "step": 14754 + }, + { + "epoch": 0.29512, + "grad_norm": 2.1875, + "grad_norm_var": 0.0849029541015625, + "learning_rate": 0.0001, + "loss": 3.782, + "loss/crossentropy": 1.7948896884918213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1987960934638977, + "step": 14756 + }, + { + "epoch": 0.29516, + "grad_norm": 2.125, + "grad_norm_var": 0.020334625244140626, + "learning_rate": 0.0001, + "loss": 4.0898, + "loss/crossentropy": 1.710760235786438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18624412268400192, + "step": 14758 + }, + { + "epoch": 0.2952, + "grad_norm": 2.390625, + "grad_norm_var": 0.027205149332682293, + "learning_rate": 0.0001, + "loss": 4.4651, + "loss/crossentropy": 2.0127750635147095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20178033411502838, + "step": 14760 + }, + { + "epoch": 0.29524, + "grad_norm": 2.0, + "grad_norm_var": 0.027042388916015625, + "learning_rate": 0.0001, + "loss": 4.1152, + "loss/crossentropy": 1.9764790534973145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19000716507434845, + "step": 14762 + }, + { + "epoch": 0.29528, + "grad_norm": 2.140625, + "grad_norm_var": 0.020930735270182292, + "learning_rate": 0.0001, + "loss": 3.9634, + "loss/crossentropy": 2.0221771597862244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1980113685131073, + "step": 14764 + }, + { + "epoch": 0.29532, + "grad_norm": 2.09375, + "grad_norm_var": 0.019972483317057293, + "learning_rate": 0.0001, + "loss": 4.1721, + "loss/crossentropy": 2.278168559074402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.231141597032547, + "step": 14766 + }, + { + "epoch": 0.29536, + "grad_norm": 1.953125, + "grad_norm_var": 0.0168853759765625, + "learning_rate": 0.0001, + "loss": 4.1308, + "loss/crossentropy": 2.1837246417999268, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2026391476392746, + "step": 14768 + }, + { + "epoch": 0.2954, + "grad_norm": 1.890625, + "grad_norm_var": 0.019795735677083332, + "learning_rate": 0.0001, + "loss": 4.0694, + "loss/crossentropy": 2.1170668601989746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1871805638074875, + "step": 14770 + }, + { + "epoch": 0.29544, + "grad_norm": 2.125, + "grad_norm_var": 0.01784032185872396, + "learning_rate": 0.0001, + "loss": 4.1003, + "loss/crossentropy": 1.9905366897583008, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1951729580760002, + "step": 14772 + }, + { + "epoch": 0.29548, + "grad_norm": 2.03125, + "grad_norm_var": 0.017146809895833334, + "learning_rate": 0.0001, + "loss": 3.9811, + "loss/crossentropy": 1.781678318977356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19464702904224396, + "step": 14774 + }, + { + "epoch": 0.29552, + "grad_norm": 1.875, + "grad_norm_var": 0.012303670247395834, + "learning_rate": 0.0001, + "loss": 4.1944, + "loss/crossentropy": 2.2671462297439575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22269655019044876, + "step": 14776 + }, + { + "epoch": 0.29556, + "grad_norm": 2.046875, + "grad_norm_var": 0.012511952718098959, + "learning_rate": 0.0001, + "loss": 4.2128, + "loss/crossentropy": 2.0294137001037598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21939906477928162, + "step": 14778 + }, + { + "epoch": 0.2956, + "grad_norm": 1.9375, + "grad_norm_var": 0.011264801025390625, + "learning_rate": 0.0001, + "loss": 4.1295, + "loss/crossentropy": 1.9174134731292725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19592813402414322, + "step": 14780 + }, + { + "epoch": 0.29564, + "grad_norm": 2.0, + "grad_norm_var": 0.009528605143229167, + "learning_rate": 0.0001, + "loss": 3.9877, + "loss/crossentropy": 2.0033875703811646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2121451273560524, + "step": 14782 + }, + { + "epoch": 0.29568, + "grad_norm": 1.9765625, + "grad_norm_var": 0.09626439412434896, + "learning_rate": 0.0001, + "loss": 4.0263, + "loss/crossentropy": 2.209542691707611, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18373841792345047, + "step": 14784 + }, + { + "epoch": 0.29572, + "grad_norm": 2.015625, + "grad_norm_var": 0.09244155883789062, + "learning_rate": 0.0001, + "loss": 4.149, + "loss/crossentropy": 2.0651984214782715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20486845821142197, + "step": 14786 + }, + { + "epoch": 0.29576, + "grad_norm": 2.03125, + "grad_norm_var": 0.09123433430989583, + "learning_rate": 0.0001, + "loss": 4.1697, + "loss/crossentropy": 2.311215043067932, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22553718090057373, + "step": 14788 + }, + { + "epoch": 0.2958, + "grad_norm": 2.234375, + "grad_norm_var": 0.09251708984375, + "learning_rate": 0.0001, + "loss": 4.3291, + "loss/crossentropy": 2.0282764434814453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20748327672481537, + "step": 14790 + }, + { + "epoch": 0.29584, + "grad_norm": 1.9765625, + "grad_norm_var": 0.09575093587239583, + "learning_rate": 0.0001, + "loss": 3.764, + "loss/crossentropy": 2.04198157787323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19687633216381073, + "step": 14792 + }, + { + "epoch": 0.29588, + "grad_norm": 2.125, + "grad_norm_var": 0.09601236979166666, + "learning_rate": 0.0001, + "loss": 4.389, + "loss/crossentropy": 2.049328565597534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20265275985002518, + "step": 14794 + }, + { + "epoch": 0.29592, + "grad_norm": 1.984375, + "grad_norm_var": 0.0992876688639323, + "learning_rate": 0.0001, + "loss": 3.9118, + "loss/crossentropy": 1.905173659324646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.193840891122818, + "step": 14796 + }, + { + "epoch": 0.29596, + "grad_norm": 2.078125, + "grad_norm_var": 0.09698486328125, + "learning_rate": 0.0001, + "loss": 3.941, + "loss/crossentropy": 1.8715303540229797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18632445484399796, + "step": 14798 + }, + { + "epoch": 0.296, + "grad_norm": 1.953125, + "grad_norm_var": 0.013765207926432292, + "learning_rate": 0.0001, + "loss": 4.039, + "loss/crossentropy": 2.0428889989852905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19538581371307373, + "step": 14800 + }, + { + "epoch": 0.29604, + "grad_norm": 2.203125, + "grad_norm_var": 0.015553538004557292, + "learning_rate": 0.0001, + "loss": 4.0834, + "loss/crossentropy": 2.2959046363830566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2372349202632904, + "step": 14802 + }, + { + "epoch": 0.29608, + "grad_norm": 1.96875, + "grad_norm_var": 0.016190592447916666, + "learning_rate": 0.0001, + "loss": 4.0399, + "loss/crossentropy": 2.083451807498932, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20915375649929047, + "step": 14804 + }, + { + "epoch": 0.29612, + "grad_norm": 2.03125, + "grad_norm_var": 0.0132232666015625, + "learning_rate": 0.0001, + "loss": 4.293, + "loss/crossentropy": 2.31546950340271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20648062229156494, + "step": 14806 + }, + { + "epoch": 0.29616, + "grad_norm": 1.96875, + "grad_norm_var": 0.013019816080729166, + "learning_rate": 0.0001, + "loss": 3.7273, + "loss/crossentropy": 1.9833638072013855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19132772833108902, + "step": 14808 + }, + { + "epoch": 0.2962, + "grad_norm": 1.8984375, + "grad_norm_var": 0.013655344645182291, + "learning_rate": 0.0001, + "loss": 3.8247, + "loss/crossentropy": 1.784917414188385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19032159447669983, + "step": 14810 + }, + { + "epoch": 0.29624, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011263020833333333, + "learning_rate": 0.0001, + "loss": 4.0011, + "loss/crossentropy": 1.843966782093048, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22127597779035568, + "step": 14812 + }, + { + "epoch": 0.29628, + "grad_norm": 2.921875, + "grad_norm_var": 0.06729100545247396, + "learning_rate": 0.0001, + "loss": 4.0535, + "loss/crossentropy": 1.8107360005378723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18275444209575653, + "step": 14814 + }, + { + "epoch": 0.29632, + "grad_norm": 2.078125, + "grad_norm_var": 0.06665827433268229, + "learning_rate": 0.0001, + "loss": 4.2711, + "loss/crossentropy": 2.147618055343628, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23800157755613327, + "step": 14816 + }, + { + "epoch": 0.29636, + "grad_norm": 2.0625, + "grad_norm_var": 0.06475804646809896, + "learning_rate": 0.0001, + "loss": 4.267, + "loss/crossentropy": 2.4430278539657593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2401380091905594, + "step": 14818 + }, + { + "epoch": 0.2964, + "grad_norm": 2.078125, + "grad_norm_var": 0.06377665201822917, + "learning_rate": 0.0001, + "loss": 4.3204, + "loss/crossentropy": 2.181501626968384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22491848468780518, + "step": 14820 + }, + { + "epoch": 0.29644, + "grad_norm": 2.046875, + "grad_norm_var": 0.06398111979166667, + "learning_rate": 0.0001, + "loss": 4.0375, + "loss/crossentropy": 2.101313889026642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21598663926124573, + "step": 14822 + }, + { + "epoch": 0.29648, + "grad_norm": 1.9765625, + "grad_norm_var": 0.059020741780598955, + "learning_rate": 0.0001, + "loss": 3.9629, + "loss/crossentropy": 1.6834549307823181, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19430063664913177, + "step": 14824 + }, + { + "epoch": 0.29652, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0566162109375, + "learning_rate": 0.0001, + "loss": 4.0087, + "loss/crossentropy": 1.8400229215621948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18764454871416092, + "step": 14826 + }, + { + "epoch": 0.29656, + "grad_norm": 2.015625, + "grad_norm_var": 0.05609944661458333, + "learning_rate": 0.0001, + "loss": 4.1427, + "loss/crossentropy": 1.9417667388916016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21304062008857727, + "step": 14828 + }, + { + "epoch": 0.2966, + "grad_norm": 2.046875, + "grad_norm_var": 0.003639475504557292, + "learning_rate": 0.0001, + "loss": 4.125, + "loss/crossentropy": 2.227192521095276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23011230677366257, + "step": 14830 + }, + { + "epoch": 0.29664, + "grad_norm": 2.015625, + "grad_norm_var": 0.0029436747233072915, + "learning_rate": 0.0001, + "loss": 4.3496, + "loss/crossentropy": 2.2136365175247192, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21694285422563553, + "step": 14832 + }, + { + "epoch": 0.29668, + "grad_norm": 1.890625, + "grad_norm_var": 0.004137929280598958, + "learning_rate": 0.0001, + "loss": 3.9464, + "loss/crossentropy": 1.9179469347000122, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1996636986732483, + "step": 14834 + }, + { + "epoch": 0.29672, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0043718973795572914, + "learning_rate": 0.0001, + "loss": 3.9001, + "loss/crossentropy": 1.9639176726341248, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17738713324069977, + "step": 14836 + }, + { + "epoch": 0.29676, + "grad_norm": 2.0625, + "grad_norm_var": 0.004564412434895833, + "learning_rate": 0.0001, + "loss": 4.2665, + "loss/crossentropy": 1.7748695611953735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18663641810417175, + "step": 14838 + }, + { + "epoch": 0.2968, + "grad_norm": 1.8046875, + "grad_norm_var": 0.007054646809895833, + "learning_rate": 0.0001, + "loss": 3.9879, + "loss/crossentropy": 2.094521999359131, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18323734402656555, + "step": 14840 + }, + { + "epoch": 0.29684, + "grad_norm": 2.3125, + "grad_norm_var": 0.01407470703125, + "learning_rate": 0.0001, + "loss": 4.5969, + "loss/crossentropy": 2.488257050514221, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23173442482948303, + "step": 14842 + }, + { + "epoch": 0.29688, + "grad_norm": 2.046875, + "grad_norm_var": 0.013444010416666667, + "learning_rate": 0.0001, + "loss": 4.2015, + "loss/crossentropy": 2.2317086458206177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2046324908733368, + "step": 14844 + }, + { + "epoch": 0.29692, + "grad_norm": 2.09375, + "grad_norm_var": 0.014134724934895834, + "learning_rate": 0.0001, + "loss": 3.9917, + "loss/crossentropy": 2.275767207145691, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20739831775426865, + "step": 14846 + }, + { + "epoch": 0.29696, + "grad_norm": 1.9765625, + "grad_norm_var": 0.014501698811848958, + "learning_rate": 0.0001, + "loss": 3.9728, + "loss/crossentropy": 2.0288257002830505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18860980868339539, + "step": 14848 + }, + { + "epoch": 0.297, + "grad_norm": 2.140625, + "grad_norm_var": 0.015449778238932291, + "learning_rate": 0.0001, + "loss": 4.3509, + "loss/crossentropy": 2.0609896183013916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20585624873638153, + "step": 14850 + }, + { + "epoch": 0.29704, + "grad_norm": 1.96875, + "grad_norm_var": 0.018214670817057292, + "learning_rate": 0.0001, + "loss": 3.5485, + "loss/crossentropy": 1.9250158667564392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1843906193971634, + "step": 14852 + }, + { + "epoch": 0.29708, + "grad_norm": 1.96875, + "grad_norm_var": 0.018070475260416666, + "learning_rate": 0.0001, + "loss": 3.9761, + "loss/crossentropy": 2.118358612060547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2035902589559555, + "step": 14854 + }, + { + "epoch": 0.29712, + "grad_norm": 1.90625, + "grad_norm_var": 0.015610504150390624, + "learning_rate": 0.0001, + "loss": 4.135, + "loss/crossentropy": 1.8381852507591248, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1859317123889923, + "step": 14856 + }, + { + "epoch": 0.29716, + "grad_norm": 1.9140625, + "grad_norm_var": 0.009513346354166667, + "learning_rate": 0.0001, + "loss": 4.1235, + "loss/crossentropy": 2.078063726425171, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21035100519657135, + "step": 14858 + }, + { + "epoch": 0.2972, + "grad_norm": 2.078125, + "grad_norm_var": 0.010701497395833334, + "learning_rate": 0.0001, + "loss": 4.0549, + "loss/crossentropy": 2.1546566486358643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19998939335346222, + "step": 14860 + }, + { + "epoch": 0.29724, + "grad_norm": 2.046875, + "grad_norm_var": 0.010164133707682292, + "learning_rate": 0.0001, + "loss": 4.2264, + "loss/crossentropy": 2.019156754016876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1957200989127159, + "step": 14862 + }, + { + "epoch": 0.29728, + "grad_norm": 2.140625, + "grad_norm_var": 0.013138834635416667, + "learning_rate": 0.0001, + "loss": 4.0808, + "loss/crossentropy": 1.8542814254760742, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19331540167331696, + "step": 14864 + }, + { + "epoch": 0.29732, + "grad_norm": 1.9453125, + "grad_norm_var": 0.010895792643229167, + "learning_rate": 0.0001, + "loss": 4.2488, + "loss/crossentropy": 2.241651773452759, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22117780148983002, + "step": 14866 + }, + { + "epoch": 0.29736, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011226145426432292, + "learning_rate": 0.0001, + "loss": 4.2139, + "loss/crossentropy": 1.8856277465820312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19262754172086716, + "step": 14868 + }, + { + "epoch": 0.2974, + "grad_norm": 2.203125, + "grad_norm_var": 0.013388824462890626, + "learning_rate": 0.0001, + "loss": 4.3315, + "loss/crossentropy": 2.0159581899642944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21892660111188889, + "step": 14870 + }, + { + "epoch": 0.29744, + "grad_norm": 1.828125, + "grad_norm_var": 0.014562733968098958, + "learning_rate": 0.0001, + "loss": 3.8965, + "loss/crossentropy": 1.8794240355491638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18827088177204132, + "step": 14872 + }, + { + "epoch": 0.29748, + "grad_norm": 1.9921875, + "grad_norm_var": 0.013726552327473959, + "learning_rate": 0.0001, + "loss": 4.2176, + "loss/crossentropy": 2.3741602897644043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2292410209774971, + "step": 14874 + }, + { + "epoch": 0.29752, + "grad_norm": 1.828125, + "grad_norm_var": 0.014025624593098958, + "learning_rate": 0.0001, + "loss": 3.8625, + "loss/crossentropy": 1.9994327425956726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21266867220401764, + "step": 14876 + }, + { + "epoch": 0.29756, + "grad_norm": 2.125, + "grad_norm_var": 0.014143880208333333, + "learning_rate": 0.0001, + "loss": 4.1337, + "loss/crossentropy": 2.0914021730422974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2071390450000763, + "step": 14878 + }, + { + "epoch": 0.2976, + "grad_norm": 1.8828125, + "grad_norm_var": 0.012798817952473958, + "learning_rate": 0.0001, + "loss": 4.0463, + "loss/crossentropy": 1.8055935502052307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19375848025083542, + "step": 14880 + }, + { + "epoch": 0.29764, + "grad_norm": 2.03125, + "grad_norm_var": 0.012896474202473958, + "learning_rate": 0.0001, + "loss": 4.3662, + "loss/crossentropy": 2.614544630050659, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22156962752342224, + "step": 14882 + }, + { + "epoch": 0.29768, + "grad_norm": 1.8515625, + "grad_norm_var": 0.013205718994140626, + "learning_rate": 0.0001, + "loss": 3.6502, + "loss/crossentropy": 1.7306513786315918, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17159338295459747, + "step": 14884 + }, + { + "epoch": 0.29772, + "grad_norm": 2.046875, + "grad_norm_var": 0.010060373942057292, + "learning_rate": 0.0001, + "loss": 4.0683, + "loss/crossentropy": 2.060949981212616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20615212619304657, + "step": 14886 + }, + { + "epoch": 0.29776, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008756510416666667, + "learning_rate": 0.0001, + "loss": 3.9903, + "loss/crossentropy": 1.8338764309883118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20274019241333008, + "step": 14888 + }, + { + "epoch": 0.2978, + "grad_norm": 1.78125, + "grad_norm_var": 0.013683827718098958, + "learning_rate": 0.0001, + "loss": 4.0335, + "loss/crossentropy": 2.2182517051696777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22446341067552567, + "step": 14890 + }, + { + "epoch": 0.29784, + "grad_norm": 2.09375, + "grad_norm_var": 0.012951405843098958, + "learning_rate": 0.0001, + "loss": 4.3924, + "loss/crossentropy": 2.222801446914673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22065874934196472, + "step": 14892 + }, + { + "epoch": 0.29788, + "grad_norm": 1.8515625, + "grad_norm_var": 0.012808990478515626, + "learning_rate": 0.0001, + "loss": 4.1315, + "loss/crossentropy": 2.2181414365768433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21953265368938446, + "step": 14894 + }, + { + "epoch": 0.29792, + "grad_norm": 1.984375, + "grad_norm_var": 0.013166300455729167, + "learning_rate": 0.0001, + "loss": 3.654, + "loss/crossentropy": 1.9304096102714539, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20096193999052048, + "step": 14896 + }, + { + "epoch": 0.29796, + "grad_norm": 2.0, + "grad_norm_var": 0.012043253580729166, + "learning_rate": 0.0001, + "loss": 4.2074, + "loss/crossentropy": 2.2288341522216797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24108773469924927, + "step": 14898 + }, + { + "epoch": 0.298, + "grad_norm": 2.15625, + "grad_norm_var": 0.012550608317057291, + "learning_rate": 0.0001, + "loss": 4.191, + "loss/crossentropy": 2.1711790561676025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2395533323287964, + "step": 14900 + }, + { + "epoch": 0.29804, + "grad_norm": 2.03125, + "grad_norm_var": 0.019482167561848958, + "learning_rate": 0.0001, + "loss": 4.2686, + "loss/crossentropy": 2.0533857345581055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21801679581403732, + "step": 14902 + }, + { + "epoch": 0.29808, + "grad_norm": 2.0625, + "grad_norm_var": 0.01883519490559896, + "learning_rate": 0.0001, + "loss": 3.8678, + "loss/crossentropy": 1.634634256362915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17861492931842804, + "step": 14904 + }, + { + "epoch": 0.29812, + "grad_norm": 2.15625, + "grad_norm_var": 0.0149322509765625, + "learning_rate": 0.0001, + "loss": 4.2152, + "loss/crossentropy": 2.3151358366012573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22694342583417892, + "step": 14906 + }, + { + "epoch": 0.29816, + "grad_norm": 2.03125, + "grad_norm_var": 0.01585871378580729, + "learning_rate": 0.0001, + "loss": 4.085, + "loss/crossentropy": 1.9165552258491516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18231045454740524, + "step": 14908 + }, + { + "epoch": 0.2982, + "grad_norm": 1.8828125, + "grad_norm_var": 0.01480712890625, + "learning_rate": 0.0001, + "loss": 3.9693, + "loss/crossentropy": 2.0094637274742126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19547566026449203, + "step": 14910 + }, + { + "epoch": 0.29824, + "grad_norm": 1.8515625, + "grad_norm_var": 0.014753214518229167, + "learning_rate": 0.0001, + "loss": 3.9091, + "loss/crossentropy": 1.8764418959617615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18009892851114273, + "step": 14912 + }, + { + "epoch": 0.29828, + "grad_norm": 2.140625, + "grad_norm_var": 0.015868123372395834, + "learning_rate": 0.0001, + "loss": 4.429, + "loss/crossentropy": 2.1255921721458435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20359216630458832, + "step": 14914 + }, + { + "epoch": 0.29832, + "grad_norm": 1.9921875, + "grad_norm_var": 0.014891560872395833, + "learning_rate": 0.0001, + "loss": 3.7889, + "loss/crossentropy": 2.1316330432891846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1970125362277031, + "step": 14916 + }, + { + "epoch": 0.29836, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0089996337890625, + "learning_rate": 0.0001, + "loss": 4.0305, + "loss/crossentropy": 1.7825700640678406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1902567446231842, + "step": 14918 + }, + { + "epoch": 0.2984, + "grad_norm": 2.0, + "grad_norm_var": 0.008565012613932292, + "learning_rate": 0.0001, + "loss": 3.9577, + "loss/crossentropy": 1.982073962688446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20216460525989532, + "step": 14920 + }, + { + "epoch": 0.29844, + "grad_norm": 1.953125, + "grad_norm_var": 0.0060943603515625, + "learning_rate": 0.0001, + "loss": 4.04, + "loss/crossentropy": 2.3210418224334717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2224876508116722, + "step": 14922 + }, + { + "epoch": 0.29848, + "grad_norm": 2.03125, + "grad_norm_var": 0.005535634358723959, + "learning_rate": 0.0001, + "loss": 4.0509, + "loss/crossentropy": 1.7990338206291199, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17401983588933945, + "step": 14924 + }, + { + "epoch": 0.29852, + "grad_norm": 2.1875, + "grad_norm_var": 0.010545857747395833, + "learning_rate": 0.0001, + "loss": 4.4979, + "loss/crossentropy": 1.8772737979888916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21376933157444, + "step": 14926 + }, + { + "epoch": 0.29856, + "grad_norm": 1.8984375, + "grad_norm_var": 0.009716542561848958, + "learning_rate": 0.0001, + "loss": 4.0544, + "loss/crossentropy": 1.6914128065109253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17860179394483566, + "step": 14928 + }, + { + "epoch": 0.2986, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009691365559895833, + "learning_rate": 0.0001, + "loss": 3.9737, + "loss/crossentropy": 1.9545430541038513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2015315517783165, + "step": 14930 + }, + { + "epoch": 0.29864, + "grad_norm": 1.921875, + "grad_norm_var": 0.009175618489583334, + "learning_rate": 0.0001, + "loss": 3.9181, + "loss/crossentropy": 2.0585074424743652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2029297649860382, + "step": 14932 + }, + { + "epoch": 0.29868, + "grad_norm": 1.8515625, + "grad_norm_var": 0.009679921468098958, + "learning_rate": 0.0001, + "loss": 3.8638, + "loss/crossentropy": 1.9196518063545227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1970217153429985, + "step": 14934 + }, + { + "epoch": 0.29872, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0097564697265625, + "learning_rate": 0.0001, + "loss": 4.1366, + "loss/crossentropy": 2.1077409982681274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22366851568222046, + "step": 14936 + }, + { + "epoch": 0.29876, + "grad_norm": 3.921875, + "grad_norm_var": 0.24059829711914063, + "learning_rate": 0.0001, + "loss": 3.8552, + "loss/crossentropy": 1.7660444974899292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23386957496404648, + "step": 14938 + }, + { + "epoch": 0.2988, + "grad_norm": 2.21875, + "grad_norm_var": 0.23871841430664062, + "learning_rate": 0.0001, + "loss": 4.2936, + "loss/crossentropy": 1.8436493873596191, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19589588046073914, + "step": 14940 + }, + { + "epoch": 0.29884, + "grad_norm": 2.40625, + "grad_norm_var": 0.24443333943684895, + "learning_rate": 0.0001, + "loss": 4.3465, + "loss/crossentropy": 2.2123888731002808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26607823371887207, + "step": 14942 + }, + { + "epoch": 0.29888, + "grad_norm": 2.5625, + "grad_norm_var": 0.2546119689941406, + "learning_rate": 0.0001, + "loss": 4.3234, + "loss/crossentropy": 2.70485258102417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2529008686542511, + "step": 14944 + }, + { + "epoch": 0.29892, + "grad_norm": 2.109375, + "grad_norm_var": 0.25155843098958336, + "learning_rate": 0.0001, + "loss": 4.1379, + "loss/crossentropy": 1.932865023612976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22141847014427185, + "step": 14946 + }, + { + "epoch": 0.29896, + "grad_norm": 1.9765625, + "grad_norm_var": 0.25643310546875, + "learning_rate": 0.0001, + "loss": 4.0119, + "loss/crossentropy": 2.213751196861267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21382954716682434, + "step": 14948 + }, + { + "epoch": 0.299, + "grad_norm": 1.9609375, + "grad_norm_var": 0.251073964436849, + "learning_rate": 0.0001, + "loss": 3.9673, + "loss/crossentropy": 1.679059088230133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16484958678483963, + "step": 14950 + }, + { + "epoch": 0.29904, + "grad_norm": 1.9453125, + "grad_norm_var": 0.24972508748372396, + "learning_rate": 0.0001, + "loss": 3.9873, + "loss/crossentropy": 1.748594582080841, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20150119066238403, + "step": 14952 + }, + { + "epoch": 0.29908, + "grad_norm": 2.5625, + "grad_norm_var": 0.05215657552083333, + "learning_rate": 0.0001, + "loss": 4.8053, + "loss/crossentropy": 2.4588215351104736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2513396218419075, + "step": 14954 + }, + { + "epoch": 0.29912, + "grad_norm": 1.9453125, + "grad_norm_var": 0.05468114217122396, + "learning_rate": 0.0001, + "loss": 4.0782, + "loss/crossentropy": 2.1104516983032227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20738063752651215, + "step": 14956 + }, + { + "epoch": 0.29916, + "grad_norm": 2.109375, + "grad_norm_var": 0.04905776977539063, + "learning_rate": 0.0001, + "loss": 4.2351, + "loss/crossentropy": 1.7171857953071594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27423302084207535, + "step": 14958 + }, + { + "epoch": 0.2992, + "grad_norm": 1.890625, + "grad_norm_var": 0.03264058430989583, + "learning_rate": 0.0001, + "loss": 3.7507, + "loss/crossentropy": 1.5607159733772278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1616373285651207, + "step": 14960 + }, + { + "epoch": 0.29924, + "grad_norm": 1.9765625, + "grad_norm_var": 0.033394114176432295, + "learning_rate": 0.0001, + "loss": 4.0128, + "loss/crossentropy": 1.9257365465164185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18031761050224304, + "step": 14962 + }, + { + "epoch": 0.29928, + "grad_norm": 2.0, + "grad_norm_var": 0.031062825520833334, + "learning_rate": 0.0001, + "loss": 3.9406, + "loss/crossentropy": 1.876187801361084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19531002640724182, + "step": 14964 + }, + { + "epoch": 0.29932, + "grad_norm": 2.140625, + "grad_norm_var": 0.0317047119140625, + "learning_rate": 0.0001, + "loss": 4.2028, + "loss/crossentropy": 1.9325042963027954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19496598094701767, + "step": 14966 + }, + { + "epoch": 0.29936, + "grad_norm": 1.8515625, + "grad_norm_var": 0.03322652180989583, + "learning_rate": 0.0001, + "loss": 4.1293, + "loss/crossentropy": 2.148952007293701, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18833627551794052, + "step": 14968 + }, + { + "epoch": 0.2994, + "grad_norm": 2.015625, + "grad_norm_var": 0.008099110921223958, + "learning_rate": 0.0001, + "loss": 4.0954, + "loss/crossentropy": 2.3639365434646606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21851496398448944, + "step": 14970 + }, + { + "epoch": 0.29944, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008371734619140625, + "learning_rate": 0.0001, + "loss": 3.8026, + "loss/crossentropy": 1.941165030002594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19548548012971878, + "step": 14972 + }, + { + "epoch": 0.29948, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0082916259765625, + "learning_rate": 0.0001, + "loss": 4.1203, + "loss/crossentropy": 2.030466139316559, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1922580599784851, + "step": 14974 + }, + { + "epoch": 0.29952, + "grad_norm": 2.0625, + "grad_norm_var": 0.008487955729166666, + "learning_rate": 0.0001, + "loss": 4.2854, + "loss/crossentropy": 2.2279865741729736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20831073075532913, + "step": 14976 + }, + { + "epoch": 0.29956, + "grad_norm": 1.875, + "grad_norm_var": 0.007047271728515625, + "learning_rate": 0.0001, + "loss": 4.1708, + "loss/crossentropy": 2.0749244689941406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18032152205705643, + "step": 14978 + }, + { + "epoch": 0.2996, + "grad_norm": 2.0625, + "grad_norm_var": 0.04270731608072917, + "learning_rate": 0.0001, + "loss": 4.199, + "loss/crossentropy": 2.3190104961395264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24735668301582336, + "step": 14980 + }, + { + "epoch": 0.29964, + "grad_norm": 1.8203125, + "grad_norm_var": 0.044406890869140625, + "learning_rate": 0.0001, + "loss": 3.9549, + "loss/crossentropy": 1.970013439655304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19323202222585678, + "step": 14982 + }, + { + "epoch": 0.29968, + "grad_norm": 1.9921875, + "grad_norm_var": 0.045169830322265625, + "learning_rate": 0.0001, + "loss": 4.4762, + "loss/crossentropy": 2.052343726158142, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20909954607486725, + "step": 14984 + }, + { + "epoch": 0.29972, + "grad_norm": 2.046875, + "grad_norm_var": 0.043268839518229164, + "learning_rate": 0.0001, + "loss": 4.1776, + "loss/crossentropy": 2.2586673498153687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22640614211559296, + "step": 14986 + }, + { + "epoch": 0.29976, + "grad_norm": 1.9609375, + "grad_norm_var": 0.042557779947916666, + "learning_rate": 0.0001, + "loss": 3.8465, + "loss/crossentropy": 1.8843002319335938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19117384403944016, + "step": 14988 + }, + { + "epoch": 0.2998, + "grad_norm": 1.9453125, + "grad_norm_var": 0.042740885416666666, + "learning_rate": 0.0001, + "loss": 3.7343, + "loss/crossentropy": 1.542019009590149, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18295453488826752, + "step": 14990 + }, + { + "epoch": 0.29984, + "grad_norm": 1.9609375, + "grad_norm_var": 0.043369293212890625, + "learning_rate": 0.0001, + "loss": 4.1335, + "loss/crossentropy": 2.160573959350586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22916415333747864, + "step": 14992 + }, + { + "epoch": 0.29988, + "grad_norm": 2.609375, + "grad_norm_var": 0.062168121337890625, + "learning_rate": 0.0001, + "loss": 4.1898, + "loss/crossentropy": 2.1296870708465576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20498204976320267, + "step": 14994 + }, + { + "epoch": 0.29992, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0336669921875, + "learning_rate": 0.0001, + "loss": 4.0554, + "loss/crossentropy": 2.0356279015541077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21798957884311676, + "step": 14996 + }, + { + "epoch": 0.29996, + "grad_norm": 2.015625, + "grad_norm_var": 0.035796864827473955, + "learning_rate": 0.0001, + "loss": 4.6195, + "loss/crossentropy": 2.466023027896881, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22107623517513275, + "step": 14998 + }, + { + "epoch": 0.3, + "grad_norm": 2.046875, + "grad_norm_var": 0.03408203125, + "learning_rate": 0.0001, + "loss": 4.1455, + "loss/crossentropy": 1.8457531332969666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21306351572275162, + "step": 15000 + }, + { + "epoch": 0.30004, + "grad_norm": 2.078125, + "grad_norm_var": 0.034211222330729166, + "learning_rate": 0.0001, + "loss": 4.5128, + "loss/crossentropy": 2.202435612678528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21542339771986008, + "step": 15002 + }, + { + "epoch": 0.30008, + "grad_norm": 1.9921875, + "grad_norm_var": 0.032613118489583336, + "learning_rate": 0.0001, + "loss": 4.1489, + "loss/crossentropy": 2.142255425453186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21016352623701096, + "step": 15004 + }, + { + "epoch": 0.30012, + "grad_norm": 2.03125, + "grad_norm_var": 0.030631256103515626, + "learning_rate": 0.0001, + "loss": 4.1443, + "loss/crossentropy": 1.949910044670105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19256845861673355, + "step": 15006 + }, + { + "epoch": 0.30016, + "grad_norm": 2.078125, + "grad_norm_var": 0.029750315348307292, + "learning_rate": 0.0001, + "loss": 4.34, + "loss/crossentropy": 2.257867217063904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22471557557582855, + "step": 15008 + }, + { + "epoch": 0.3002, + "grad_norm": 2.140625, + "grad_norm_var": 0.011358388264973958, + "learning_rate": 0.0001, + "loss": 4.3457, + "loss/crossentropy": 2.1541898250579834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23054643720388412, + "step": 15010 + }, + { + "epoch": 0.30024, + "grad_norm": 2.296875, + "grad_norm_var": 0.01165771484375, + "learning_rate": 0.0001, + "loss": 4.2537, + "loss/crossentropy": 2.0029123425483704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21815954893827438, + "step": 15012 + }, + { + "epoch": 0.30028, + "grad_norm": 2.125, + "grad_norm_var": 0.0076416015625, + "learning_rate": 0.0001, + "loss": 4.0093, + "loss/crossentropy": 2.0617172718048096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19382908195257187, + "step": 15014 + }, + { + "epoch": 0.30032, + "grad_norm": 1.953125, + "grad_norm_var": 0.00921630859375, + "learning_rate": 0.0001, + "loss": 4.3123, + "loss/crossentropy": 2.1493303775787354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21762489527463913, + "step": 15016 + }, + { + "epoch": 0.30036, + "grad_norm": 2.0625, + "grad_norm_var": 0.0093505859375, + "learning_rate": 0.0001, + "loss": 4.1282, + "loss/crossentropy": 2.081854462623596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22617685049772263, + "step": 15018 + }, + { + "epoch": 0.3004, + "grad_norm": 1.953125, + "grad_norm_var": 0.009842681884765624, + "learning_rate": 0.0001, + "loss": 4.0352, + "loss/crossentropy": 1.8565402626991272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18223749846220016, + "step": 15020 + }, + { + "epoch": 0.30044, + "grad_norm": 1.96875, + "grad_norm_var": 0.010815175374348958, + "learning_rate": 0.0001, + "loss": 4.1779, + "loss/crossentropy": 2.129835605621338, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21263620257377625, + "step": 15022 + }, + { + "epoch": 0.30048, + "grad_norm": 2.15625, + "grad_norm_var": 0.011189524332682292, + "learning_rate": 0.0001, + "loss": 4.5131, + "loss/crossentropy": 2.113425612449646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22930295765399933, + "step": 15024 + }, + { + "epoch": 0.30052, + "grad_norm": 2.0, + "grad_norm_var": 0.010109202067057291, + "learning_rate": 0.0001, + "loss": 4.1557, + "loss/crossentropy": 2.254178762435913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21644078195095062, + "step": 15026 + }, + { + "epoch": 0.30056, + "grad_norm": 1.9765625, + "grad_norm_var": 0.011966959635416666, + "learning_rate": 0.0001, + "loss": 4.0863, + "loss/crossentropy": 2.233784854412079, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22132302820682526, + "step": 15028 + }, + { + "epoch": 0.3006, + "grad_norm": 1.953125, + "grad_norm_var": 0.0118804931640625, + "learning_rate": 0.0001, + "loss": 3.8657, + "loss/crossentropy": 2.2234549522399902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2144894078373909, + "step": 15030 + }, + { + "epoch": 0.30064, + "grad_norm": 2.046875, + "grad_norm_var": 0.0108306884765625, + "learning_rate": 0.0001, + "loss": 4.1315, + "loss/crossentropy": 2.0401915907859802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20804419368505478, + "step": 15032 + }, + { + "epoch": 0.30068, + "grad_norm": 1.953125, + "grad_norm_var": 0.010978190104166667, + "learning_rate": 0.0001, + "loss": 4.0939, + "loss/crossentropy": 2.058899462223053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2222386747598648, + "step": 15034 + }, + { + "epoch": 0.30072, + "grad_norm": 2.421875, + "grad_norm_var": 0.020694986979166666, + "learning_rate": 0.0001, + "loss": 4.5452, + "loss/crossentropy": 2.396425485610962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24734684079885483, + "step": 15036 + }, + { + "epoch": 0.30076, + "grad_norm": 2.03125, + "grad_norm_var": 0.02011693318684896, + "learning_rate": 0.0001, + "loss": 4.1124, + "loss/crossentropy": 1.9825797080993652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20504453778266907, + "step": 15038 + }, + { + "epoch": 0.3008, + "grad_norm": 2.015625, + "grad_norm_var": 0.02068049112955729, + "learning_rate": 0.0001, + "loss": 4.0287, + "loss/crossentropy": 1.9550088047981262, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2051691859960556, + "step": 15040 + }, + { + "epoch": 0.30084, + "grad_norm": 2.015625, + "grad_norm_var": 0.02102839152018229, + "learning_rate": 0.0001, + "loss": 4.1926, + "loss/crossentropy": 2.166532874107361, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20486898720264435, + "step": 15042 + }, + { + "epoch": 0.30088, + "grad_norm": 1.921875, + "grad_norm_var": 0.0158355712890625, + "learning_rate": 0.0001, + "loss": 3.9905, + "loss/crossentropy": 1.8752986192703247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20286446809768677, + "step": 15044 + }, + { + "epoch": 0.30092, + "grad_norm": 2.125, + "grad_norm_var": 0.016649373372395835, + "learning_rate": 0.0001, + "loss": 4.3217, + "loss/crossentropy": 1.9599428176879883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2061413824558258, + "step": 15046 + }, + { + "epoch": 0.30096, + "grad_norm": 2.015625, + "grad_norm_var": 0.015428670247395833, + "learning_rate": 0.0001, + "loss": 4.2914, + "loss/crossentropy": 2.5601563453674316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2123739868402481, + "step": 15048 + }, + { + "epoch": 0.301, + "grad_norm": 2.0, + "grad_norm_var": 0.015062459309895833, + "learning_rate": 0.0001, + "loss": 4.2784, + "loss/crossentropy": 2.075629949569702, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20791151374578476, + "step": 15050 + }, + { + "epoch": 0.30104, + "grad_norm": 2.171875, + "grad_norm_var": 0.008532460530598958, + "learning_rate": 0.0001, + "loss": 4.1578, + "loss/crossentropy": 1.8346505165100098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16620153188705444, + "step": 15052 + }, + { + "epoch": 0.30108, + "grad_norm": 2.109375, + "grad_norm_var": 0.009382120768229167, + "learning_rate": 0.0001, + "loss": 4.003, + "loss/crossentropy": 1.9715532660484314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20792889595031738, + "step": 15054 + }, + { + "epoch": 0.30112, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0090728759765625, + "learning_rate": 0.0001, + "loss": 4.06, + "loss/crossentropy": 2.135376811027527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.218796044588089, + "step": 15056 + }, + { + "epoch": 0.30116, + "grad_norm": 2.171875, + "grad_norm_var": 0.00941162109375, + "learning_rate": 0.0001, + "loss": 4.4339, + "loss/crossentropy": 2.095982074737549, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2120692878961563, + "step": 15058 + }, + { + "epoch": 0.3012, + "grad_norm": 2.203125, + "grad_norm_var": 0.00911865234375, + "learning_rate": 0.0001, + "loss": 4.3395, + "loss/crossentropy": 2.2836159467697144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23748520761728287, + "step": 15060 + }, + { + "epoch": 0.30124, + "grad_norm": 2.03125, + "grad_norm_var": 0.010163370768229167, + "learning_rate": 0.0001, + "loss": 3.8877, + "loss/crossentropy": 2.073283314704895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19927848130464554, + "step": 15062 + }, + { + "epoch": 0.30128, + "grad_norm": 2.015625, + "grad_norm_var": 0.010196940104166666, + "learning_rate": 0.0001, + "loss": 4.1908, + "loss/crossentropy": 2.2401102781295776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22298122942447662, + "step": 15064 + }, + { + "epoch": 0.30132, + "grad_norm": 1.9140625, + "grad_norm_var": 0.012650299072265624, + "learning_rate": 0.0001, + "loss": 4.1878, + "loss/crossentropy": 2.063979387283325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19714537262916565, + "step": 15066 + }, + { + "epoch": 0.30136, + "grad_norm": 2.0, + "grad_norm_var": 0.012894694010416667, + "learning_rate": 0.0001, + "loss": 3.6407, + "loss/crossentropy": 2.0346400141716003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18405266851186752, + "step": 15068 + }, + { + "epoch": 0.3014, + "grad_norm": 1.8671875, + "grad_norm_var": 0.013437652587890625, + "learning_rate": 0.0001, + "loss": 3.8641, + "loss/crossentropy": 2.1211341619491577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19659722596406937, + "step": 15070 + }, + { + "epoch": 0.30144, + "grad_norm": 3.484375, + "grad_norm_var": 0.1490966796875, + "learning_rate": 0.0001, + "loss": 3.9967, + "loss/crossentropy": 1.937787652015686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18829380720853806, + "step": 15072 + }, + { + "epoch": 0.30148, + "grad_norm": 1.9140625, + "grad_norm_var": 0.1507219950358073, + "learning_rate": 0.0001, + "loss": 3.922, + "loss/crossentropy": 1.9047453999519348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18311911821365356, + "step": 15074 + }, + { + "epoch": 0.30152, + "grad_norm": 2.171875, + "grad_norm_var": 0.15110651652018228, + "learning_rate": 0.0001, + "loss": 4.3423, + "loss/crossentropy": 2.1409407258033752, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21341674029827118, + "step": 15076 + }, + { + "epoch": 0.30156, + "grad_norm": 2.265625, + "grad_norm_var": 0.15178197224934895, + "learning_rate": 0.0001, + "loss": 4.2815, + "loss/crossentropy": 2.188783288002014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2406207174062729, + "step": 15078 + }, + { + "epoch": 0.3016, + "grad_norm": 2.125, + "grad_norm_var": 0.15162938435872395, + "learning_rate": 0.0001, + "loss": 4.368, + "loss/crossentropy": 2.088913679122925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2055455967783928, + "step": 15080 + }, + { + "epoch": 0.30164, + "grad_norm": 1.9921875, + "grad_norm_var": 0.15350316365559896, + "learning_rate": 0.0001, + "loss": 3.9039, + "loss/crossentropy": 1.9477434754371643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19234874844551086, + "step": 15082 + }, + { + "epoch": 0.30168, + "grad_norm": 1.9375, + "grad_norm_var": 0.1490069071451823, + "learning_rate": 0.0001, + "loss": 4.2653, + "loss/crossentropy": 2.2611928582191467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2250279188156128, + "step": 15084 + }, + { + "epoch": 0.30172, + "grad_norm": 2.109375, + "grad_norm_var": 0.14735514322916668, + "learning_rate": 0.0001, + "loss": 4.043, + "loss/crossentropy": 2.340996742248535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2201436087489128, + "step": 15086 + }, + { + "epoch": 0.30176, + "grad_norm": 1.8515625, + "grad_norm_var": 0.01718724568684896, + "learning_rate": 0.0001, + "loss": 3.9232, + "loss/crossentropy": 2.045587956905365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2049652263522148, + "step": 15088 + }, + { + "epoch": 0.3018, + "grad_norm": 1.9453125, + "grad_norm_var": 0.015265909830729167, + "learning_rate": 0.0001, + "loss": 3.9266, + "loss/crossentropy": 1.6079826951026917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1584620103240013, + "step": 15090 + }, + { + "epoch": 0.30184, + "grad_norm": 2.046875, + "grad_norm_var": 0.013494618733723958, + "learning_rate": 0.0001, + "loss": 4.1049, + "loss/crossentropy": 2.122701048851013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2095094472169876, + "step": 15092 + }, + { + "epoch": 0.30188, + "grad_norm": 1.8828125, + "grad_norm_var": 0.008243560791015625, + "learning_rate": 0.0001, + "loss": 3.8602, + "loss/crossentropy": 1.8888922929763794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19282665848731995, + "step": 15094 + }, + { + "epoch": 0.30192, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0063873291015625, + "learning_rate": 0.0001, + "loss": 4.1279, + "loss/crossentropy": 2.121204972267151, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1894393339753151, + "step": 15096 + }, + { + "epoch": 0.30196, + "grad_norm": 1.890625, + "grad_norm_var": 0.005991363525390625, + "learning_rate": 0.0001, + "loss": 4.0433, + "loss/crossentropy": 1.836807131767273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1851387470960617, + "step": 15098 + }, + { + "epoch": 0.302, + "grad_norm": 1.96875, + "grad_norm_var": 0.005246734619140625, + "learning_rate": 0.0001, + "loss": 4.1723, + "loss/crossentropy": 2.170082688331604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20964065939188004, + "step": 15100 + }, + { + "epoch": 0.30204, + "grad_norm": 2.015625, + "grad_norm_var": 0.0037595113118489582, + "learning_rate": 0.0001, + "loss": 4.0642, + "loss/crossentropy": 2.128955125808716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20737067610025406, + "step": 15102 + }, + { + "epoch": 0.30208, + "grad_norm": 1.890625, + "grad_norm_var": 0.0062744140625, + "learning_rate": 0.0001, + "loss": 4.1561, + "loss/crossentropy": 1.970711886882782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19101983308792114, + "step": 15104 + }, + { + "epoch": 0.30212, + "grad_norm": 2.40625, + "grad_norm_var": 0.018040974934895832, + "learning_rate": 0.0001, + "loss": 4.5089, + "loss/crossentropy": 2.105665922164917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23741377890110016, + "step": 15106 + }, + { + "epoch": 0.30216, + "grad_norm": 2.109375, + "grad_norm_var": 0.01962865193684896, + "learning_rate": 0.0001, + "loss": 4.3766, + "loss/crossentropy": 2.274402379989624, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20633316040039062, + "step": 15108 + }, + { + "epoch": 0.3022, + "grad_norm": 2.046875, + "grad_norm_var": 0.017996978759765626, + "learning_rate": 0.0001, + "loss": 4.0393, + "loss/crossentropy": 2.134031891822815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20932414382696152, + "step": 15110 + }, + { + "epoch": 0.30224, + "grad_norm": 1.9140625, + "grad_norm_var": 0.01930720011393229, + "learning_rate": 0.0001, + "loss": 3.9084, + "loss/crossentropy": 2.015448212623596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18043135851621628, + "step": 15112 + }, + { + "epoch": 0.30228, + "grad_norm": 2.109375, + "grad_norm_var": 0.018232981363932293, + "learning_rate": 0.0001, + "loss": 4.2025, + "loss/crossentropy": 2.3585838079452515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2323882058262825, + "step": 15114 + }, + { + "epoch": 0.30232, + "grad_norm": 1.9140625, + "grad_norm_var": 0.018607584635416667, + "learning_rate": 0.0001, + "loss": 3.8586, + "loss/crossentropy": 1.790964961051941, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18150723725557327, + "step": 15116 + }, + { + "epoch": 0.30236, + "grad_norm": 2.109375, + "grad_norm_var": 0.018802642822265625, + "learning_rate": 0.0001, + "loss": 4.1433, + "loss/crossentropy": 2.0206486582756042, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2102368101477623, + "step": 15118 + }, + { + "epoch": 0.3024, + "grad_norm": 1.9921875, + "grad_norm_var": 0.016169230143229168, + "learning_rate": 0.0001, + "loss": 4.1773, + "loss/crossentropy": 2.21474289894104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22742661088705063, + "step": 15120 + }, + { + "epoch": 0.30244, + "grad_norm": 2.1875, + "grad_norm_var": 0.009781901041666667, + "learning_rate": 0.0001, + "loss": 3.9536, + "loss/crossentropy": 2.0452207922935486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19045353680849075, + "step": 15122 + }, + { + "epoch": 0.30248, + "grad_norm": 1.828125, + "grad_norm_var": 0.0105865478515625, + "learning_rate": 0.0001, + "loss": 3.7914, + "loss/crossentropy": 2.019612729549408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2032184973359108, + "step": 15124 + }, + { + "epoch": 0.30252, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010221099853515625, + "learning_rate": 0.0001, + "loss": 4.0303, + "loss/crossentropy": 2.1821396350860596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22832991182804108, + "step": 15126 + }, + { + "epoch": 0.30256, + "grad_norm": 1.875, + "grad_norm_var": 0.010088857014973958, + "learning_rate": 0.0001, + "loss": 3.7838, + "loss/crossentropy": 2.0395348072052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18255837261676788, + "step": 15128 + }, + { + "epoch": 0.3026, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009081013997395833, + "learning_rate": 0.0001, + "loss": 4.177, + "loss/crossentropy": 2.1724050045013428, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19672775268554688, + "step": 15130 + }, + { + "epoch": 0.30264, + "grad_norm": 1.96875, + "grad_norm_var": 0.008470662434895833, + "learning_rate": 0.0001, + "loss": 4.2438, + "loss/crossentropy": 2.107967436313629, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19564508646726608, + "step": 15132 + }, + { + "epoch": 0.30268, + "grad_norm": 1.953125, + "grad_norm_var": 0.007355753580729167, + "learning_rate": 0.0001, + "loss": 3.8674, + "loss/crossentropy": 1.7179370522499084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17135357856750488, + "step": 15134 + }, + { + "epoch": 0.30272, + "grad_norm": 2.078125, + "grad_norm_var": 0.008119455973307292, + "learning_rate": 0.0001, + "loss": 4.104, + "loss/crossentropy": 1.9004579186439514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19902431219816208, + "step": 15136 + }, + { + "epoch": 0.30276, + "grad_norm": 2.0625, + "grad_norm_var": 0.0052487691243489586, + "learning_rate": 0.0001, + "loss": 4.013, + "loss/crossentropy": 1.9554911255836487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19999338686466217, + "step": 15138 + }, + { + "epoch": 0.3028, + "grad_norm": 1.9140625, + "grad_norm_var": 0.004793294270833333, + "learning_rate": 0.0001, + "loss": 4.1051, + "loss/crossentropy": 1.8687474131584167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19250088185071945, + "step": 15140 + }, + { + "epoch": 0.30284, + "grad_norm": 2.0625, + "grad_norm_var": 0.004929351806640625, + "learning_rate": 0.0001, + "loss": 4.1217, + "loss/crossentropy": 2.1360538005828857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22316065430641174, + "step": 15142 + }, + { + "epoch": 0.30288, + "grad_norm": 2.125, + "grad_norm_var": 0.006776682535807292, + "learning_rate": 0.0001, + "loss": 3.7681, + "loss/crossentropy": 1.6100040078163147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16926150023937225, + "step": 15144 + }, + { + "epoch": 0.30292, + "grad_norm": 2.09375, + "grad_norm_var": 0.0070302327473958336, + "learning_rate": 0.0001, + "loss": 4.0459, + "loss/crossentropy": 1.983083188533783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19075944274663925, + "step": 15146 + }, + { + "epoch": 0.30296, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009110514322916667, + "learning_rate": 0.0001, + "loss": 3.8722, + "loss/crossentropy": 1.9027757048606873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2069152221083641, + "step": 15148 + }, + { + "epoch": 0.303, + "grad_norm": 1.953125, + "grad_norm_var": 0.009144846598307292, + "learning_rate": 0.0001, + "loss": 3.996, + "loss/crossentropy": 1.9141475558280945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19106873124837875, + "step": 15150 + }, + { + "epoch": 0.30304, + "grad_norm": 2.109375, + "grad_norm_var": 0.009821573893229166, + "learning_rate": 0.0001, + "loss": 4.0872, + "loss/crossentropy": 1.924220085144043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19870451837778091, + "step": 15152 + }, + { + "epoch": 0.30308, + "grad_norm": 1.9921875, + "grad_norm_var": 0.009169260660807291, + "learning_rate": 0.0001, + "loss": 4.1611, + "loss/crossentropy": 2.18874990940094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2040283977985382, + "step": 15154 + }, + { + "epoch": 0.30312, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0096588134765625, + "learning_rate": 0.0001, + "loss": 3.6905, + "loss/crossentropy": 2.0296109914779663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20384181290864944, + "step": 15156 + }, + { + "epoch": 0.30316, + "grad_norm": 2.03125, + "grad_norm_var": 0.0091949462890625, + "learning_rate": 0.0001, + "loss": 4.2475, + "loss/crossentropy": 2.1908326148986816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20775151252746582, + "step": 15158 + }, + { + "epoch": 0.3032, + "grad_norm": 2.0625, + "grad_norm_var": 0.007523345947265625, + "learning_rate": 0.0001, + "loss": 4.191, + "loss/crossentropy": 2.1348751187324524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21342730522155762, + "step": 15160 + }, + { + "epoch": 0.30324, + "grad_norm": 2.09375, + "grad_norm_var": 0.0075103759765625, + "learning_rate": 0.0001, + "loss": 3.9642, + "loss/crossentropy": 2.03944593667984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1934272050857544, + "step": 15162 + }, + { + "epoch": 0.30328, + "grad_norm": 1.9609375, + "grad_norm_var": 0.005475870768229167, + "learning_rate": 0.0001, + "loss": 3.9692, + "loss/crossentropy": 1.8574647307395935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1847086101770401, + "step": 15164 + }, + { + "epoch": 0.30332, + "grad_norm": 1.9140625, + "grad_norm_var": 0.005671183268229167, + "learning_rate": 0.0001, + "loss": 4.3025, + "loss/crossentropy": 2.219611406326294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1982950195670128, + "step": 15166 + }, + { + "epoch": 0.30336, + "grad_norm": 2.078125, + "grad_norm_var": 0.005159250895182292, + "learning_rate": 0.0001, + "loss": 4.1903, + "loss/crossentropy": 2.024726688861847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.217759907245636, + "step": 15168 + }, + { + "epoch": 0.3034, + "grad_norm": 2.046875, + "grad_norm_var": 0.005353800455729167, + "learning_rate": 0.0001, + "loss": 4.2273, + "loss/crossentropy": 2.196849226951599, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21290963143110275, + "step": 15170 + }, + { + "epoch": 0.30344, + "grad_norm": 1.7421875, + "grad_norm_var": 0.00750732421875, + "learning_rate": 0.0001, + "loss": 4.0096, + "loss/crossentropy": 1.9023171067237854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19199151545763016, + "step": 15172 + }, + { + "epoch": 0.30348, + "grad_norm": 2.03125, + "grad_norm_var": 0.008644358317057291, + "learning_rate": 0.0001, + "loss": 4.0798, + "loss/crossentropy": 2.1286264657974243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20934996753931046, + "step": 15174 + }, + { + "epoch": 0.30352, + "grad_norm": 2.046875, + "grad_norm_var": 0.008007558186848958, + "learning_rate": 0.0001, + "loss": 4.2621, + "loss/crossentropy": 2.2262455224990845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2091647908091545, + "step": 15176 + }, + { + "epoch": 0.30356, + "grad_norm": 2.140625, + "grad_norm_var": 0.04183349609375, + "learning_rate": 0.0001, + "loss": 4.2439, + "loss/crossentropy": 2.4126710891723633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22972645610570908, + "step": 15178 + }, + { + "epoch": 0.3036, + "grad_norm": 2.078125, + "grad_norm_var": 0.042569732666015624, + "learning_rate": 0.0001, + "loss": 4.4268, + "loss/crossentropy": 2.4708153009414673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23798953741788864, + "step": 15180 + }, + { + "epoch": 0.30364, + "grad_norm": 2.109375, + "grad_norm_var": 0.04053929646809896, + "learning_rate": 0.0001, + "loss": 4.1841, + "loss/crossentropy": 1.9924429655075073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19149084389209747, + "step": 15182 + }, + { + "epoch": 0.30368, + "grad_norm": 1.953125, + "grad_norm_var": 0.04182103474934896, + "learning_rate": 0.0001, + "loss": 4.1335, + "loss/crossentropy": 1.8018346428871155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1883997619152069, + "step": 15184 + }, + { + "epoch": 0.30372, + "grad_norm": 1.9296875, + "grad_norm_var": 0.04639867146809896, + "learning_rate": 0.0001, + "loss": 3.96, + "loss/crossentropy": 2.001536011695862, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2008758783340454, + "step": 15186 + }, + { + "epoch": 0.30376, + "grad_norm": 2.1875, + "grad_norm_var": 0.06099828084309896, + "learning_rate": 0.0001, + "loss": 4.2308, + "loss/crossentropy": 1.9239726066589355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2068573236465454, + "step": 15188 + }, + { + "epoch": 0.3038, + "grad_norm": 2.296875, + "grad_norm_var": 0.059845987955729166, + "learning_rate": 0.0001, + "loss": 4.303, + "loss/crossentropy": 2.252953827381134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22697407007217407, + "step": 15190 + }, + { + "epoch": 0.30384, + "grad_norm": 1.921875, + "grad_norm_var": 0.0631011962890625, + "learning_rate": 0.0001, + "loss": 4.174, + "loss/crossentropy": 2.135131001472473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2088206186890602, + "step": 15192 + }, + { + "epoch": 0.30388, + "grad_norm": 2.09375, + "grad_norm_var": 0.0391754150390625, + "learning_rate": 0.0001, + "loss": 4.22, + "loss/crossentropy": 2.394876003265381, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22017298638820648, + "step": 15194 + }, + { + "epoch": 0.30392, + "grad_norm": 2.078125, + "grad_norm_var": 0.03862711588541667, + "learning_rate": 0.0001, + "loss": 4.3951, + "loss/crossentropy": 2.3517301082611084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21459681540727615, + "step": 15196 + }, + { + "epoch": 0.30396, + "grad_norm": 2.015625, + "grad_norm_var": 0.038914998372395836, + "learning_rate": 0.0001, + "loss": 4.3068, + "loss/crossentropy": 1.956869900226593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19348695874214172, + "step": 15198 + }, + { + "epoch": 0.304, + "grad_norm": 1.9921875, + "grad_norm_var": 0.045660146077473956, + "learning_rate": 0.0001, + "loss": 3.5481, + "loss/crossentropy": 1.8349076509475708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19203926622867584, + "step": 15200 + }, + { + "epoch": 0.30404, + "grad_norm": 2.140625, + "grad_norm_var": 0.041265614827473956, + "learning_rate": 0.0001, + "loss": 4.1749, + "loss/crossentropy": 2.253283977508545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24825416505336761, + "step": 15202 + }, + { + "epoch": 0.30408, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0156646728515625, + "learning_rate": 0.0001, + "loss": 3.9585, + "loss/crossentropy": 1.7219101190567017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18302495777606964, + "step": 15204 + }, + { + "epoch": 0.30412, + "grad_norm": 2.046875, + "grad_norm_var": 0.010309855143229166, + "learning_rate": 0.0001, + "loss": 4.4398, + "loss/crossentropy": 2.2931004762649536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21909544616937637, + "step": 15206 + }, + { + "epoch": 0.30416, + "grad_norm": 2.125, + "grad_norm_var": 0.009883626302083334, + "learning_rate": 0.0001, + "loss": 4.0707, + "loss/crossentropy": 1.9331820011138916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19565977901220322, + "step": 15208 + }, + { + "epoch": 0.3042, + "grad_norm": 2.015625, + "grad_norm_var": 0.008568318684895833, + "learning_rate": 0.0001, + "loss": 4.0936, + "loss/crossentropy": 1.9863171577453613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20089460909366608, + "step": 15210 + }, + { + "epoch": 0.30424, + "grad_norm": 1.9765625, + "grad_norm_var": 0.010945383707682292, + "learning_rate": 0.0001, + "loss": 4.3762, + "loss/crossentropy": 2.326872229576111, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20695248246192932, + "step": 15212 + }, + { + "epoch": 0.30428, + "grad_norm": 2.171875, + "grad_norm_var": 0.012679036458333333, + "learning_rate": 0.0001, + "loss": 4.1684, + "loss/crossentropy": 2.0216678380966187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19882218539714813, + "step": 15214 + }, + { + "epoch": 0.30432, + "grad_norm": 2.171875, + "grad_norm_var": 0.0074503580729166664, + "learning_rate": 0.0001, + "loss": 4.3682, + "loss/crossentropy": 2.170402765274048, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22156177461147308, + "step": 15216 + }, + { + "epoch": 0.30436, + "grad_norm": 2.03125, + "grad_norm_var": 0.007608795166015625, + "learning_rate": 0.0001, + "loss": 4.1655, + "loss/crossentropy": 1.9652912616729736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20658399909734726, + "step": 15218 + }, + { + "epoch": 0.3044, + "grad_norm": 1.90625, + "grad_norm_var": 0.0090972900390625, + "learning_rate": 0.0001, + "loss": 4.1234, + "loss/crossentropy": 2.4518260955810547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20796608924865723, + "step": 15220 + }, + { + "epoch": 0.30444, + "grad_norm": 1.984375, + "grad_norm_var": 0.009952545166015625, + "learning_rate": 0.0001, + "loss": 4.0179, + "loss/crossentropy": 1.9519163370132446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19269376248121262, + "step": 15222 + }, + { + "epoch": 0.30448, + "grad_norm": 1.9375, + "grad_norm_var": 0.010758209228515624, + "learning_rate": 0.0001, + "loss": 3.9411, + "loss/crossentropy": 2.1493905782699585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19713661074638367, + "step": 15224 + }, + { + "epoch": 0.30452, + "grad_norm": 2.5, + "grad_norm_var": 0.026878865559895833, + "learning_rate": 0.0001, + "loss": 4.2124, + "loss/crossentropy": 2.2856662273406982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23240232467651367, + "step": 15226 + }, + { + "epoch": 0.30456, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0249176025390625, + "learning_rate": 0.0001, + "loss": 4.1723, + "loss/crossentropy": 2.3631181716918945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22731658816337585, + "step": 15228 + }, + { + "epoch": 0.3046, + "grad_norm": 2.015625, + "grad_norm_var": 0.027469635009765625, + "learning_rate": 0.0001, + "loss": 4.0952, + "loss/crossentropy": 2.1270273327827454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20736727863550186, + "step": 15230 + }, + { + "epoch": 0.30464, + "grad_norm": 2.09375, + "grad_norm_var": 0.029320271809895833, + "learning_rate": 0.0001, + "loss": 4.4801, + "loss/crossentropy": 2.1448079347610474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20350141823291779, + "step": 15232 + }, + { + "epoch": 0.30468, + "grad_norm": 2.71875, + "grad_norm_var": 0.059576161702473956, + "learning_rate": 0.0001, + "loss": 4.295, + "loss/crossentropy": 2.0582846999168396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19898280501365662, + "step": 15234 + }, + { + "epoch": 0.30472, + "grad_norm": 2.078125, + "grad_norm_var": 0.0597076416015625, + "learning_rate": 0.0001, + "loss": 4.0946, + "loss/crossentropy": 2.2282591462135315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22299205511808395, + "step": 15236 + }, + { + "epoch": 0.30476, + "grad_norm": 4.34375, + "grad_norm_var": 0.3757484436035156, + "learning_rate": 0.0001, + "loss": 4.4031, + "loss/crossentropy": 1.902436077594757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19752992689609528, + "step": 15238 + }, + { + "epoch": 0.3048, + "grad_norm": 2.09375, + "grad_norm_var": 0.3626177469889323, + "learning_rate": 0.0001, + "loss": 4.5476, + "loss/crossentropy": 2.5741195678710938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25522060692310333, + "step": 15240 + }, + { + "epoch": 0.30484, + "grad_norm": 2.078125, + "grad_norm_var": 0.3576942443847656, + "learning_rate": 0.0001, + "loss": 4.2319, + "loss/crossentropy": 2.261489987373352, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2379501387476921, + "step": 15242 + }, + { + "epoch": 0.30488, + "grad_norm": 1.921875, + "grad_norm_var": 0.3581451416015625, + "learning_rate": 0.0001, + "loss": 4.1728, + "loss/crossentropy": 2.3075523376464844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21830804646015167, + "step": 15244 + }, + { + "epoch": 0.30492, + "grad_norm": 1.9375, + "grad_norm_var": 0.36470133463541665, + "learning_rate": 0.0001, + "loss": 3.9585, + "loss/crossentropy": 2.096464157104492, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2073582485318184, + "step": 15246 + }, + { + "epoch": 0.30496, + "grad_norm": 2.078125, + "grad_norm_var": 0.41076558430989585, + "learning_rate": 0.0001, + "loss": 4.0121, + "loss/crossentropy": 1.9624161124229431, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19424450397491455, + "step": 15248 + }, + { + "epoch": 0.305, + "grad_norm": 2.0625, + "grad_norm_var": 0.38983154296875, + "learning_rate": 0.0001, + "loss": 4.1351, + "loss/crossentropy": 2.0838447213172913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22090255469083786, + "step": 15250 + }, + { + "epoch": 0.30504, + "grad_norm": 2.171875, + "grad_norm_var": 0.379644521077474, + "learning_rate": 0.0001, + "loss": 4.4244, + "loss/crossentropy": 2.349258542060852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20451825857162476, + "step": 15252 + }, + { + "epoch": 0.30508, + "grad_norm": 2.09375, + "grad_norm_var": 0.06938451131184896, + "learning_rate": 0.0001, + "loss": 4.328, + "loss/crossentropy": 2.3996351957321167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24550051987171173, + "step": 15254 + }, + { + "epoch": 0.30512, + "grad_norm": 1.859375, + "grad_norm_var": 0.07430191040039062, + "learning_rate": 0.0001, + "loss": 4.0098, + "loss/crossentropy": 2.325650215148926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2189955934882164, + "step": 15256 + }, + { + "epoch": 0.30516, + "grad_norm": 2.015625, + "grad_norm_var": 0.07815348307291667, + "learning_rate": 0.0001, + "loss": 4.0063, + "loss/crossentropy": 1.8568453788757324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18639766424894333, + "step": 15258 + }, + { + "epoch": 0.3052, + "grad_norm": 1.875, + "grad_norm_var": 0.08263346354166666, + "learning_rate": 0.0001, + "loss": 3.8955, + "loss/crossentropy": 1.8268811106681824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18669788539409637, + "step": 15260 + }, + { + "epoch": 0.30524, + "grad_norm": 2.125, + "grad_norm_var": 0.08479715983072916, + "learning_rate": 0.0001, + "loss": 4.1124, + "loss/crossentropy": 2.0555724501609802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2101765051484108, + "step": 15262 + }, + { + "epoch": 0.30528, + "grad_norm": 1.9296875, + "grad_norm_var": 0.018070475260416666, + "learning_rate": 0.0001, + "loss": 4.1075, + "loss/crossentropy": 2.080612599849701, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18955913186073303, + "step": 15264 + }, + { + "epoch": 0.30532, + "grad_norm": 1.890625, + "grad_norm_var": 0.0137939453125, + "learning_rate": 0.0001, + "loss": 4.2678, + "loss/crossentropy": 2.352793037891388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21857291460037231, + "step": 15266 + }, + { + "epoch": 0.30536, + "grad_norm": 1.8671875, + "grad_norm_var": 0.010434722900390625, + "learning_rate": 0.0001, + "loss": 4.0871, + "loss/crossentropy": 2.0258968472480774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18990560621023178, + "step": 15268 + }, + { + "epoch": 0.3054, + "grad_norm": 1.8046875, + "grad_norm_var": 0.010871378580729167, + "learning_rate": 0.0001, + "loss": 4.3126, + "loss/crossentropy": 2.228433310985565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20862557739019394, + "step": 15270 + }, + { + "epoch": 0.30544, + "grad_norm": 1.9765625, + "grad_norm_var": 0.012330881754557292, + "learning_rate": 0.0001, + "loss": 3.7731, + "loss/crossentropy": 1.9475300312042236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20644760876893997, + "step": 15272 + }, + { + "epoch": 0.30548, + "grad_norm": 1.9453125, + "grad_norm_var": 0.012166341145833334, + "learning_rate": 0.0001, + "loss": 4.1685, + "loss/crossentropy": 2.3502081632614136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22096982598304749, + "step": 15274 + }, + { + "epoch": 0.30552, + "grad_norm": 2.09375, + "grad_norm_var": 0.012064615885416666, + "learning_rate": 0.0001, + "loss": 4.294, + "loss/crossentropy": 2.2581464052200317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21158118546009064, + "step": 15276 + }, + { + "epoch": 0.30556, + "grad_norm": 2.171875, + "grad_norm_var": 0.014054361979166667, + "learning_rate": 0.0001, + "loss": 4.0301, + "loss/crossentropy": 1.9583097696304321, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19379162788391113, + "step": 15278 + }, + { + "epoch": 0.3056, + "grad_norm": 2.109375, + "grad_norm_var": 0.016527303059895835, + "learning_rate": 0.0001, + "loss": 4.4464, + "loss/crossentropy": 1.988980233669281, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19495020806789398, + "step": 15280 + }, + { + "epoch": 0.30564, + "grad_norm": 1.984375, + "grad_norm_var": 0.017350260416666666, + "learning_rate": 0.0001, + "loss": 4.1733, + "loss/crossentropy": 2.1816195249557495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2196531891822815, + "step": 15282 + }, + { + "epoch": 0.30568, + "grad_norm": 1.9375, + "grad_norm_var": 0.015264638264973958, + "learning_rate": 0.0001, + "loss": 3.7994, + "loss/crossentropy": 2.180557370185852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2106764316558838, + "step": 15284 + }, + { + "epoch": 0.30572, + "grad_norm": 2.046875, + "grad_norm_var": 0.011027018229166666, + "learning_rate": 0.0001, + "loss": 4.0655, + "loss/crossentropy": 1.901601493358612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18202877044677734, + "step": 15286 + }, + { + "epoch": 0.30576, + "grad_norm": 2.0625, + "grad_norm_var": 0.019301096598307293, + "learning_rate": 0.0001, + "loss": 4.1989, + "loss/crossentropy": 2.259449601173401, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23166191577911377, + "step": 15288 + }, + { + "epoch": 0.3058, + "grad_norm": 1.90625, + "grad_norm_var": 0.020514933268229167, + "learning_rate": 0.0001, + "loss": 3.9866, + "loss/crossentropy": 1.786646544933319, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19156906753778458, + "step": 15290 + }, + { + "epoch": 0.30584, + "grad_norm": 1.9453125, + "grad_norm_var": 0.02205174763997396, + "learning_rate": 0.0001, + "loss": 3.9258, + "loss/crossentropy": 1.8522255420684814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1828666776418686, + "step": 15292 + }, + { + "epoch": 0.30588, + "grad_norm": 1.9765625, + "grad_norm_var": 0.02127685546875, + "learning_rate": 0.0001, + "loss": 4.2543, + "loss/crossentropy": 2.195667266845703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21043212711811066, + "step": 15294 + }, + { + "epoch": 0.30592, + "grad_norm": 2.0, + "grad_norm_var": 0.020918528238932293, + "learning_rate": 0.0001, + "loss": 3.8389, + "loss/crossentropy": 1.8748087882995605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20358053594827652, + "step": 15296 + }, + { + "epoch": 0.30596, + "grad_norm": 1.9453125, + "grad_norm_var": 0.017536417643229166, + "learning_rate": 0.0001, + "loss": 4.0789, + "loss/crossentropy": 2.2658244371414185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2206317037343979, + "step": 15298 + }, + { + "epoch": 0.306, + "grad_norm": 2.140625, + "grad_norm_var": 0.018778483072916668, + "learning_rate": 0.0001, + "loss": 4.2882, + "loss/crossentropy": 1.7914378643035889, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20813927054405212, + "step": 15300 + }, + { + "epoch": 0.30604, + "grad_norm": 2.109375, + "grad_norm_var": 0.01895319620768229, + "learning_rate": 0.0001, + "loss": 4.1428, + "loss/crossentropy": 2.046573519706726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20681843161582947, + "step": 15302 + }, + { + "epoch": 0.30608, + "grad_norm": 2.03125, + "grad_norm_var": 0.005960845947265625, + "learning_rate": 0.0001, + "loss": 4.2538, + "loss/crossentropy": 2.0068886280059814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.201521098613739, + "step": 15304 + }, + { + "epoch": 0.30612, + "grad_norm": 2.03125, + "grad_norm_var": 0.005509440104166667, + "learning_rate": 0.0001, + "loss": 4.0672, + "loss/crossentropy": 2.2020740509033203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20413114875555038, + "step": 15306 + }, + { + "epoch": 0.30616, + "grad_norm": 2.09375, + "grad_norm_var": 0.0057769775390625, + "learning_rate": 0.0001, + "loss": 4.3654, + "loss/crossentropy": 2.3031119108200073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20878810435533524, + "step": 15308 + }, + { + "epoch": 0.3062, + "grad_norm": 1.9296875, + "grad_norm_var": 0.006204986572265625, + "learning_rate": 0.0001, + "loss": 3.7786, + "loss/crossentropy": 1.6445855498313904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1618219092488289, + "step": 15310 + }, + { + "epoch": 0.30624, + "grad_norm": 2.046875, + "grad_norm_var": 0.004572550455729167, + "learning_rate": 0.0001, + "loss": 4.1378, + "loss/crossentropy": 1.995127022266388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21076080948114395, + "step": 15312 + }, + { + "epoch": 0.30628, + "grad_norm": 1.90625, + "grad_norm_var": 0.005411783854166667, + "learning_rate": 0.0001, + "loss": 3.6278, + "loss/crossentropy": 1.8340229392051697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19418048858642578, + "step": 15314 + }, + { + "epoch": 0.30632, + "grad_norm": 2.0625, + "grad_norm_var": 0.007106272379557291, + "learning_rate": 0.0001, + "loss": 3.9496, + "loss/crossentropy": 1.713787317276001, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17608212679624557, + "step": 15316 + }, + { + "epoch": 0.30636, + "grad_norm": 2.0625, + "grad_norm_var": 0.006967926025390625, + "learning_rate": 0.0001, + "loss": 4.0223, + "loss/crossentropy": 1.7105762362480164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1760871261358261, + "step": 15318 + }, + { + "epoch": 0.3064, + "grad_norm": 1.9375, + "grad_norm_var": 0.0071441650390625, + "learning_rate": 0.0001, + "loss": 4.0326, + "loss/crossentropy": 2.337005376815796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2222321778535843, + "step": 15320 + }, + { + "epoch": 0.30644, + "grad_norm": 2.046875, + "grad_norm_var": 0.008868153889973958, + "learning_rate": 0.0001, + "loss": 4.3269, + "loss/crossentropy": 1.8426868915557861, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1880369558930397, + "step": 15322 + }, + { + "epoch": 0.30648, + "grad_norm": 1.9765625, + "grad_norm_var": 0.009162394205729167, + "learning_rate": 0.0001, + "loss": 3.7798, + "loss/crossentropy": 2.300028443336487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22529813647270203, + "step": 15324 + }, + { + "epoch": 0.30652, + "grad_norm": 1.953125, + "grad_norm_var": 0.009193674723307291, + "learning_rate": 0.0001, + "loss": 3.9838, + "loss/crossentropy": 2.3558120727539062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2000514343380928, + "step": 15326 + }, + { + "epoch": 0.30656, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007879384358723958, + "learning_rate": 0.0001, + "loss": 3.6921, + "loss/crossentropy": 1.681145966053009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1756032332777977, + "step": 15328 + }, + { + "epoch": 0.3066, + "grad_norm": 2.0, + "grad_norm_var": 0.010298665364583333, + "learning_rate": 0.0001, + "loss": 4.41, + "loss/crossentropy": 2.1566712260246277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21067694574594498, + "step": 15330 + }, + { + "epoch": 0.30664, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0075347900390625, + "learning_rate": 0.0001, + "loss": 3.993, + "loss/crossentropy": 2.125000774860382, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20768606662750244, + "step": 15332 + }, + { + "epoch": 0.30668, + "grad_norm": 1.9921875, + "grad_norm_var": 0.007328033447265625, + "learning_rate": 0.0001, + "loss": 4.0805, + "loss/crossentropy": 2.168649673461914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20393794029951096, + "step": 15334 + }, + { + "epoch": 0.30672, + "grad_norm": 2.09375, + "grad_norm_var": 0.008072662353515624, + "learning_rate": 0.0001, + "loss": 3.9154, + "loss/crossentropy": 1.657529592514038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19351129233837128, + "step": 15336 + }, + { + "epoch": 0.30676, + "grad_norm": 2.0625, + "grad_norm_var": 0.006780751546223958, + "learning_rate": 0.0001, + "loss": 4.2166, + "loss/crossentropy": 2.115979850292206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20223377645015717, + "step": 15338 + }, + { + "epoch": 0.3068, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005833943684895833, + "learning_rate": 0.0001, + "loss": 4.1288, + "loss/crossentropy": 1.699661135673523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17160461097955704, + "step": 15340 + }, + { + "epoch": 0.30684, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008180491129557292, + "learning_rate": 0.0001, + "loss": 3.8418, + "loss/crossentropy": 2.141602098941803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19209860265254974, + "step": 15342 + }, + { + "epoch": 0.30688, + "grad_norm": 2.0625, + "grad_norm_var": 0.007806142171223958, + "learning_rate": 0.0001, + "loss": 4.3619, + "loss/crossentropy": 2.1502809524536133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21510163694620132, + "step": 15344 + }, + { + "epoch": 0.30692, + "grad_norm": 1.984375, + "grad_norm_var": 0.006078847249348958, + "learning_rate": 0.0001, + "loss": 4.0684, + "loss/crossentropy": 2.1354172825813293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19959942996501923, + "step": 15346 + }, + { + "epoch": 0.30696, + "grad_norm": 2.015625, + "grad_norm_var": 0.006095377604166666, + "learning_rate": 0.0001, + "loss": 4.1196, + "loss/crossentropy": 2.0634223222732544, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2073688805103302, + "step": 15348 + }, + { + "epoch": 0.307, + "grad_norm": 2.125, + "grad_norm_var": 0.008385976155598959, + "learning_rate": 0.0001, + "loss": 4.1489, + "loss/crossentropy": 1.9851733446121216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2060396894812584, + "step": 15350 + }, + { + "epoch": 0.30704, + "grad_norm": 1.9921875, + "grad_norm_var": 0.007490793863932292, + "learning_rate": 0.0001, + "loss": 4.1196, + "loss/crossentropy": 2.3790550231933594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23205619305372238, + "step": 15352 + }, + { + "epoch": 0.30708, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008552042643229167, + "learning_rate": 0.0001, + "loss": 3.9085, + "loss/crossentropy": 1.943642258644104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18238085508346558, + "step": 15354 + }, + { + "epoch": 0.30712, + "grad_norm": 2.0, + "grad_norm_var": 0.008988189697265624, + "learning_rate": 0.0001, + "loss": 4.112, + "loss/crossentropy": 1.9680217504501343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18599005788564682, + "step": 15356 + }, + { + "epoch": 0.30716, + "grad_norm": 1.890625, + "grad_norm_var": 0.0077898661295572914, + "learning_rate": 0.0001, + "loss": 3.7402, + "loss/crossentropy": 1.9464871287345886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18814007937908173, + "step": 15358 + }, + { + "epoch": 0.3072, + "grad_norm": 2.265625, + "grad_norm_var": 0.013179524739583334, + "learning_rate": 0.0001, + "loss": 4.0231, + "loss/crossentropy": 2.0038596987724304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19203800708055496, + "step": 15360 + }, + { + "epoch": 0.30724, + "grad_norm": 1.9921875, + "grad_norm_var": 0.013071441650390625, + "learning_rate": 0.0001, + "loss": 4.0843, + "loss/crossentropy": 1.93240225315094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20595912635326385, + "step": 15362 + }, + { + "epoch": 0.30728, + "grad_norm": 2.0, + "grad_norm_var": 0.012938435872395833, + "learning_rate": 0.0001, + "loss": 4.2042, + "loss/crossentropy": 2.067903220653534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21503040939569473, + "step": 15364 + }, + { + "epoch": 0.30732, + "grad_norm": 2.140625, + "grad_norm_var": 0.012674713134765625, + "learning_rate": 0.0001, + "loss": 3.9056, + "loss/crossentropy": 2.179181218147278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20675741881132126, + "step": 15366 + }, + { + "epoch": 0.30736, + "grad_norm": 2.078125, + "grad_norm_var": 0.02471923828125, + "learning_rate": 0.0001, + "loss": 4.4657, + "loss/crossentropy": 2.106004774570465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23683416098356247, + "step": 15368 + }, + { + "epoch": 0.3074, + "grad_norm": 2.0, + "grad_norm_var": 0.023266347249348958, + "learning_rate": 0.0001, + "loss": 4.135, + "loss/crossentropy": 2.2843246459960938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24139763414859772, + "step": 15370 + }, + { + "epoch": 0.30744, + "grad_norm": 2.0, + "grad_norm_var": 0.028484853108723958, + "learning_rate": 0.0001, + "loss": 4.122, + "loss/crossentropy": 2.2482924461364746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2242894545197487, + "step": 15372 + }, + { + "epoch": 0.30748, + "grad_norm": 1.9765625, + "grad_norm_var": 0.025321451822916667, + "learning_rate": 0.0001, + "loss": 4.221, + "loss/crossentropy": 2.1573593616485596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22176649421453476, + "step": 15374 + }, + { + "epoch": 0.30752, + "grad_norm": 1.8828125, + "grad_norm_var": 0.022705078125, + "learning_rate": 0.0001, + "loss": 4.219, + "loss/crossentropy": 1.9243710041046143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19137389957904816, + "step": 15376 + }, + { + "epoch": 0.30756, + "grad_norm": 1.984375, + "grad_norm_var": 0.02211481730143229, + "learning_rate": 0.0001, + "loss": 4.0903, + "loss/crossentropy": 2.004511833190918, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20657775551080704, + "step": 15378 + }, + { + "epoch": 0.3076, + "grad_norm": 1.859375, + "grad_norm_var": 0.022956339518229167, + "learning_rate": 0.0001, + "loss": 4.0174, + "loss/crossentropy": 1.8409560918807983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19139550626277924, + "step": 15380 + }, + { + "epoch": 0.30764, + "grad_norm": 2.09375, + "grad_norm_var": 0.019974772135416666, + "learning_rate": 0.0001, + "loss": 4.2722, + "loss/crossentropy": 2.1442220211029053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23278377205133438, + "step": 15382 + }, + { + "epoch": 0.30768, + "grad_norm": 1.9765625, + "grad_norm_var": 0.013651275634765625, + "learning_rate": 0.0001, + "loss": 4.2915, + "loss/crossentropy": 2.086996078491211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20602282136678696, + "step": 15384 + }, + { + "epoch": 0.30772, + "grad_norm": 2.125, + "grad_norm_var": 0.014404042561848959, + "learning_rate": 0.0001, + "loss": 4.4884, + "loss/crossentropy": 2.479863405227661, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23815791308879852, + "step": 15386 + }, + { + "epoch": 0.30776, + "grad_norm": 1.953125, + "grad_norm_var": 0.009034983317057292, + "learning_rate": 0.0001, + "loss": 4.0648, + "loss/crossentropy": 1.9218324422836304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22571419924497604, + "step": 15388 + }, + { + "epoch": 0.3078, + "grad_norm": 2.078125, + "grad_norm_var": 0.009004720052083333, + "learning_rate": 0.0001, + "loss": 4.0315, + "loss/crossentropy": 1.9488168954849243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2004484310746193, + "step": 15390 + }, + { + "epoch": 0.30784, + "grad_norm": 2.09375, + "grad_norm_var": 0.006585439046223958, + "learning_rate": 0.0001, + "loss": 4.3221, + "loss/crossentropy": 2.307602286338806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23881366103887558, + "step": 15392 + }, + { + "epoch": 0.30788, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007283528645833333, + "learning_rate": 0.0001, + "loss": 4.1025, + "loss/crossentropy": 2.1380842328071594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2019854038953781, + "step": 15394 + }, + { + "epoch": 0.30792, + "grad_norm": 2.125, + "grad_norm_var": 0.0072509765625, + "learning_rate": 0.0001, + "loss": 4.1245, + "loss/crossentropy": 2.0551159977912903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18669020384550095, + "step": 15396 + }, + { + "epoch": 0.30796, + "grad_norm": 1.8828125, + "grad_norm_var": 0.009250640869140625, + "learning_rate": 0.0001, + "loss": 3.959, + "loss/crossentropy": 1.9213807582855225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1843273714184761, + "step": 15398 + }, + { + "epoch": 0.308, + "grad_norm": 2.015625, + "grad_norm_var": 0.008628082275390626, + "learning_rate": 0.0001, + "loss": 3.9724, + "loss/crossentropy": 2.0028855204582214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2215728610754013, + "step": 15400 + }, + { + "epoch": 0.30804, + "grad_norm": 2.0, + "grad_norm_var": 0.0064776102701822914, + "learning_rate": 0.0001, + "loss": 4.0287, + "loss/crossentropy": 1.957477331161499, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20727651566267014, + "step": 15402 + }, + { + "epoch": 0.30808, + "grad_norm": 1.8515625, + "grad_norm_var": 0.0080718994140625, + "learning_rate": 0.0001, + "loss": 4.0255, + "loss/crossentropy": 2.2241835594177246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25234874337911606, + "step": 15404 + }, + { + "epoch": 0.30812, + "grad_norm": 2.046875, + "grad_norm_var": 0.006852213541666667, + "learning_rate": 0.0001, + "loss": 4.2677, + "loss/crossentropy": 2.108901560306549, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20304366201162338, + "step": 15406 + }, + { + "epoch": 0.30816, + "grad_norm": 2.015625, + "grad_norm_var": 0.0075032552083333336, + "learning_rate": 0.0001, + "loss": 4.302, + "loss/crossentropy": 2.1005085110664368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21205321699380875, + "step": 15408 + }, + { + "epoch": 0.3082, + "grad_norm": 2.03125, + "grad_norm_var": 0.008139801025390626, + "learning_rate": 0.0001, + "loss": 4.2505, + "loss/crossentropy": 2.1565412282943726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22271832078695297, + "step": 15410 + }, + { + "epoch": 0.30824, + "grad_norm": 2.015625, + "grad_norm_var": 0.006241607666015625, + "learning_rate": 0.0001, + "loss": 4.1236, + "loss/crossentropy": 1.906798243522644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21911519020795822, + "step": 15412 + }, + { + "epoch": 0.30828, + "grad_norm": 1.8828125, + "grad_norm_var": 0.006089019775390625, + "learning_rate": 0.0001, + "loss": 4.1679, + "loss/crossentropy": 2.0200547575950623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19892341643571854, + "step": 15414 + }, + { + "epoch": 0.30832, + "grad_norm": 2.03125, + "grad_norm_var": 0.006037394205729167, + "learning_rate": 0.0001, + "loss": 4.302, + "loss/crossentropy": 2.14735209941864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21503761410713196, + "step": 15416 + }, + { + "epoch": 0.30836, + "grad_norm": 2.078125, + "grad_norm_var": 0.007405344645182292, + "learning_rate": 0.0001, + "loss": 4.2943, + "loss/crossentropy": 2.0479459166526794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21268057823181152, + "step": 15418 + }, + { + "epoch": 0.3084, + "grad_norm": 2.015625, + "grad_norm_var": 0.006831614176432291, + "learning_rate": 0.0001, + "loss": 4.1969, + "loss/crossentropy": 1.4835808873176575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16378673166036606, + "step": 15420 + }, + { + "epoch": 0.30844, + "grad_norm": 1.984375, + "grad_norm_var": 0.007802073160807292, + "learning_rate": 0.0001, + "loss": 4.058, + "loss/crossentropy": 1.7942256331443787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18387632071971893, + "step": 15422 + }, + { + "epoch": 0.30848, + "grad_norm": 2.046875, + "grad_norm_var": 0.006534576416015625, + "learning_rate": 0.0001, + "loss": 4.1352, + "loss/crossentropy": 1.8660341501235962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20189601182937622, + "step": 15424 + }, + { + "epoch": 0.30852, + "grad_norm": 1.9453125, + "grad_norm_var": 0.006257120768229167, + "learning_rate": 0.0001, + "loss": 4.0359, + "loss/crossentropy": 1.8545112013816833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21422524750232697, + "step": 15426 + }, + { + "epoch": 0.30856, + "grad_norm": 1.890625, + "grad_norm_var": 0.0069163004557291664, + "learning_rate": 0.0001, + "loss": 4.1411, + "loss/crossentropy": 2.204701781272888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21733464300632477, + "step": 15428 + }, + { + "epoch": 0.3086, + "grad_norm": 1.9609375, + "grad_norm_var": 0.006087239583333333, + "learning_rate": 0.0001, + "loss": 4.0865, + "loss/crossentropy": 1.9295769929885864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20931877195835114, + "step": 15430 + }, + { + "epoch": 0.30864, + "grad_norm": 2.140625, + "grad_norm_var": 0.008534495035807292, + "learning_rate": 0.0001, + "loss": 3.9165, + "loss/crossentropy": 2.0090243220329285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20928232371807098, + "step": 15432 + }, + { + "epoch": 0.30868, + "grad_norm": 2.0625, + "grad_norm_var": 0.009326171875, + "learning_rate": 0.0001, + "loss": 4.0517, + "loss/crossentropy": 2.1884257793426514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20887935161590576, + "step": 15434 + }, + { + "epoch": 0.30872, + "grad_norm": 1.8828125, + "grad_norm_var": 0.009553019205729167, + "learning_rate": 0.0001, + "loss": 4.0101, + "loss/crossentropy": 2.023981511592865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1893448904156685, + "step": 15436 + }, + { + "epoch": 0.30876, + "grad_norm": 1.890625, + "grad_norm_var": 0.00909423828125, + "learning_rate": 0.0001, + "loss": 4.0168, + "loss/crossentropy": 2.427197813987732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21649108827114105, + "step": 15438 + }, + { + "epoch": 0.3088, + "grad_norm": 2.3125, + "grad_norm_var": 0.015478515625, + "learning_rate": 0.0001, + "loss": 4.1243, + "loss/crossentropy": 2.2578816413879395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21297947317361832, + "step": 15440 + }, + { + "epoch": 0.30884, + "grad_norm": 2.0625, + "grad_norm_var": 0.015248362223307292, + "learning_rate": 0.0001, + "loss": 4.2977, + "loss/crossentropy": 1.8909358382225037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1924509033560753, + "step": 15442 + }, + { + "epoch": 0.30888, + "grad_norm": 2.046875, + "grad_norm_var": 0.016228993733723957, + "learning_rate": 0.0001, + "loss": 4.0142, + "loss/crossentropy": 2.344139575958252, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20414643734693527, + "step": 15444 + }, + { + "epoch": 0.30892, + "grad_norm": 1.96875, + "grad_norm_var": 0.01624755859375, + "learning_rate": 0.0001, + "loss": 4.2373, + "loss/crossentropy": 2.268660068511963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22332928329706192, + "step": 15446 + }, + { + "epoch": 0.30896, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0148193359375, + "learning_rate": 0.0001, + "loss": 4.0944, + "loss/crossentropy": 2.1276373267173767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2123691812157631, + "step": 15448 + }, + { + "epoch": 0.309, + "grad_norm": 1.8828125, + "grad_norm_var": 0.012975819905598958, + "learning_rate": 0.0001, + "loss": 4.1025, + "loss/crossentropy": 2.2471970319747925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2228669673204422, + "step": 15450 + }, + { + "epoch": 0.30904, + "grad_norm": 2.0625, + "grad_norm_var": 0.0124908447265625, + "learning_rate": 0.0001, + "loss": 4.2153, + "loss/crossentropy": 2.122064530849457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22981297969818115, + "step": 15452 + }, + { + "epoch": 0.30908, + "grad_norm": 1.7734375, + "grad_norm_var": 0.014823150634765626, + "learning_rate": 0.0001, + "loss": 3.815, + "loss/crossentropy": 2.016224443912506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19105292856693268, + "step": 15454 + }, + { + "epoch": 0.30912, + "grad_norm": 1.8359375, + "grad_norm_var": 0.016193644205729166, + "learning_rate": 0.0001, + "loss": 4.1518, + "loss/crossentropy": 1.9021474719047546, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23519159853458405, + "step": 15456 + }, + { + "epoch": 0.30916, + "grad_norm": 2.09375, + "grad_norm_var": 0.016617838541666666, + "learning_rate": 0.0001, + "loss": 4.0971, + "loss/crossentropy": 2.067293703556061, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20525048673152924, + "step": 15458 + }, + { + "epoch": 0.3092, + "grad_norm": 2.3125, + "grad_norm_var": 0.021930948893229166, + "learning_rate": 0.0001, + "loss": 4.3147, + "loss/crossentropy": 2.0720648169517517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20188772678375244, + "step": 15460 + }, + { + "epoch": 0.30924, + "grad_norm": 2.015625, + "grad_norm_var": 0.030890909830729167, + "learning_rate": 0.0001, + "loss": 4.212, + "loss/crossentropy": 2.1377363204956055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20943745225667953, + "step": 15462 + }, + { + "epoch": 0.30928, + "grad_norm": 2.078125, + "grad_norm_var": 0.028595987955729166, + "learning_rate": 0.0001, + "loss": 4.0799, + "loss/crossentropy": 1.9101733565330505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18436391651630402, + "step": 15464 + }, + { + "epoch": 0.30932, + "grad_norm": 2.015625, + "grad_norm_var": 0.026775868733723958, + "learning_rate": 0.0001, + "loss": 4.0945, + "loss/crossentropy": 2.205660581588745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2431517392396927, + "step": 15466 + }, + { + "epoch": 0.30936, + "grad_norm": 2.078125, + "grad_norm_var": 0.02670262654622396, + "learning_rate": 0.0001, + "loss": 4.502, + "loss/crossentropy": 2.3487337827682495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22333069890737534, + "step": 15468 + }, + { + "epoch": 0.3094, + "grad_norm": 1.9609375, + "grad_norm_var": 0.021971638997395834, + "learning_rate": 0.0001, + "loss": 3.9717, + "loss/crossentropy": 1.953830897808075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20369011163711548, + "step": 15470 + }, + { + "epoch": 0.30944, + "grad_norm": 2.0625, + "grad_norm_var": 0.0156402587890625, + "learning_rate": 0.0001, + "loss": 4.2282, + "loss/crossentropy": 2.1175807118415833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23428738862276077, + "step": 15472 + }, + { + "epoch": 0.30948, + "grad_norm": 1.9375, + "grad_norm_var": 0.016585032145182293, + "learning_rate": 0.0001, + "loss": 4.0257, + "loss/crossentropy": 2.114426612854004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20113752037286758, + "step": 15474 + }, + { + "epoch": 0.30952, + "grad_norm": 2.03125, + "grad_norm_var": 0.01676203409830729, + "learning_rate": 0.0001, + "loss": 3.8239, + "loss/crossentropy": 2.121155083179474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23428727686405182, + "step": 15476 + }, + { + "epoch": 0.30956, + "grad_norm": 1.9375, + "grad_norm_var": 0.007061513264973959, + "learning_rate": 0.0001, + "loss": 3.9982, + "loss/crossentropy": 2.1271342635154724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1993299275636673, + "step": 15478 + }, + { + "epoch": 0.3096, + "grad_norm": 1.921875, + "grad_norm_var": 0.007502237955729167, + "learning_rate": 0.0001, + "loss": 4.0211, + "loss/crossentropy": 2.132619023323059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20428313314914703, + "step": 15480 + }, + { + "epoch": 0.30964, + "grad_norm": 2.296875, + "grad_norm_var": 0.013720703125, + "learning_rate": 0.0001, + "loss": 4.2644, + "loss/crossentropy": 1.8355163931846619, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1969365030527115, + "step": 15482 + }, + { + "epoch": 0.30968, + "grad_norm": 2.21875, + "grad_norm_var": 0.016695149739583335, + "learning_rate": 0.0001, + "loss": 4.0001, + "loss/crossentropy": 1.6765839457511902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19648748636245728, + "step": 15484 + }, + { + "epoch": 0.30972, + "grad_norm": 2.03125, + "grad_norm_var": 0.0168609619140625, + "learning_rate": 0.0001, + "loss": 4.0806, + "loss/crossentropy": 1.7628436088562012, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1850176900625229, + "step": 15486 + }, + { + "epoch": 0.30976, + "grad_norm": 2.125, + "grad_norm_var": 0.017014312744140624, + "learning_rate": 0.0001, + "loss": 4.5902, + "loss/crossentropy": 2.407894253730774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2243446335196495, + "step": 15488 + }, + { + "epoch": 0.3098, + "grad_norm": 1.8984375, + "grad_norm_var": 0.017409006754557293, + "learning_rate": 0.0001, + "loss": 4.0274, + "loss/crossentropy": 2.0359743237495422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2140478640794754, + "step": 15490 + }, + { + "epoch": 0.30984, + "grad_norm": 2.078125, + "grad_norm_var": 0.012111155192057292, + "learning_rate": 0.0001, + "loss": 4.086, + "loss/crossentropy": 2.1705892086029053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2108132392168045, + "step": 15492 + }, + { + "epoch": 0.30988, + "grad_norm": 1.984375, + "grad_norm_var": 0.011126454671223958, + "learning_rate": 0.0001, + "loss": 3.9782, + "loss/crossentropy": 1.836803376674652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19436977803707123, + "step": 15494 + }, + { + "epoch": 0.30992, + "grad_norm": 2.015625, + "grad_norm_var": 0.010432688395182292, + "learning_rate": 0.0001, + "loss": 4.1018, + "loss/crossentropy": 2.1445683240890503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21950577944517136, + "step": 15496 + }, + { + "epoch": 0.30996, + "grad_norm": 2.09375, + "grad_norm_var": 0.007299550374348958, + "learning_rate": 0.0001, + "loss": 4.1382, + "loss/crossentropy": 2.258531093597412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.212137870490551, + "step": 15498 + }, + { + "epoch": 0.31, + "grad_norm": 1.96875, + "grad_norm_var": 0.004052480061848958, + "learning_rate": 0.0001, + "loss": 4.2473, + "loss/crossentropy": 1.9310460686683655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18316251039505005, + "step": 15500 + }, + { + "epoch": 0.31004, + "grad_norm": 1.9375, + "grad_norm_var": 0.004788970947265625, + "learning_rate": 0.0001, + "loss": 3.8656, + "loss/crossentropy": 1.867807149887085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18478462100028992, + "step": 15502 + }, + { + "epoch": 0.31008, + "grad_norm": 2.21875, + "grad_norm_var": 0.008042144775390624, + "learning_rate": 0.0001, + "loss": 4.4691, + "loss/crossentropy": 2.0724986791610718, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20986932516098022, + "step": 15504 + }, + { + "epoch": 0.31012, + "grad_norm": 2.203125, + "grad_norm_var": 0.010138956705729167, + "learning_rate": 0.0001, + "loss": 4.5188, + "loss/crossentropy": 2.1599953174591064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2250317633152008, + "step": 15506 + }, + { + "epoch": 0.31016, + "grad_norm": 1.8125, + "grad_norm_var": 0.015608469645182291, + "learning_rate": 0.0001, + "loss": 3.7842, + "loss/crossentropy": 1.975223183631897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2104134038090706, + "step": 15508 + }, + { + "epoch": 0.3102, + "grad_norm": 2.21875, + "grad_norm_var": 0.019539388020833333, + "learning_rate": 0.0001, + "loss": 4.0028, + "loss/crossentropy": 2.133773148059845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20153696089982986, + "step": 15510 + }, + { + "epoch": 0.31024, + "grad_norm": 1.984375, + "grad_norm_var": 0.02014948527018229, + "learning_rate": 0.0001, + "loss": 3.9948, + "loss/crossentropy": 1.994078278541565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20573429763317108, + "step": 15512 + }, + { + "epoch": 0.31028, + "grad_norm": 1.953125, + "grad_norm_var": 0.019280751546223957, + "learning_rate": 0.0001, + "loss": 3.8209, + "loss/crossentropy": 1.7885381579399109, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1889629364013672, + "step": 15514 + }, + { + "epoch": 0.31032, + "grad_norm": 2.0, + "grad_norm_var": 0.019191233317057292, + "learning_rate": 0.0001, + "loss": 4.2765, + "loss/crossentropy": 2.495113968849182, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22110097110271454, + "step": 15516 + }, + { + "epoch": 0.31036, + "grad_norm": 1.953125, + "grad_norm_var": 0.019160715738932292, + "learning_rate": 0.0001, + "loss": 4.0465, + "loss/crossentropy": 2.0757648944854736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20948036015033722, + "step": 15518 + }, + { + "epoch": 0.3104, + "grad_norm": 1.9765625, + "grad_norm_var": 0.015793609619140624, + "learning_rate": 0.0001, + "loss": 4.1369, + "loss/crossentropy": 2.151356875896454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22598887234926224, + "step": 15520 + }, + { + "epoch": 0.31044, + "grad_norm": 2.0, + "grad_norm_var": 0.010933176676432291, + "learning_rate": 0.0001, + "loss": 4.2193, + "loss/crossentropy": 2.1210484504699707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23365220427513123, + "step": 15522 + }, + { + "epoch": 0.31048, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008414459228515626, + "learning_rate": 0.0001, + "loss": 4.0584, + "loss/crossentropy": 2.3766754865646362, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22161870449781418, + "step": 15524 + }, + { + "epoch": 0.31052, + "grad_norm": 1.9375, + "grad_norm_var": 0.0043853759765625, + "learning_rate": 0.0001, + "loss": 4.0968, + "loss/crossentropy": 2.33975088596344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21598009765148163, + "step": 15526 + }, + { + "epoch": 0.31056, + "grad_norm": 2.1875, + "grad_norm_var": 0.006468709309895833, + "learning_rate": 0.0001, + "loss": 4.4643, + "loss/crossentropy": 2.3024542331695557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20656803250312805, + "step": 15528 + }, + { + "epoch": 0.3106, + "grad_norm": 2.078125, + "grad_norm_var": 0.007666015625, + "learning_rate": 0.0001, + "loss": 4.1137, + "loss/crossentropy": 2.009307861328125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19054396450519562, + "step": 15530 + }, + { + "epoch": 0.31064, + "grad_norm": 1.90625, + "grad_norm_var": 0.008902994791666667, + "learning_rate": 0.0001, + "loss": 3.9413, + "loss/crossentropy": 1.8119140267372131, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19199557602405548, + "step": 15532 + }, + { + "epoch": 0.31068, + "grad_norm": 1.9140625, + "grad_norm_var": 0.009614817301432292, + "learning_rate": 0.0001, + "loss": 3.8753, + "loss/crossentropy": 1.9586234092712402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18981865793466568, + "step": 15534 + }, + { + "epoch": 0.31072, + "grad_norm": 1.96875, + "grad_norm_var": 0.010798899332682292, + "learning_rate": 0.0001, + "loss": 4.4319, + "loss/crossentropy": 2.1208658814430237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20590776205062866, + "step": 15536 + }, + { + "epoch": 0.31076, + "grad_norm": 1.84375, + "grad_norm_var": 0.012336222330729167, + "learning_rate": 0.0001, + "loss": 3.7806, + "loss/crossentropy": 1.539306402206421, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1615876704454422, + "step": 15538 + }, + { + "epoch": 0.3108, + "grad_norm": 1.9921875, + "grad_norm_var": 0.013768513997395834, + "learning_rate": 0.0001, + "loss": 4.0686, + "loss/crossentropy": 2.1376689672470093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21377786993980408, + "step": 15540 + }, + { + "epoch": 0.31084, + "grad_norm": 2.140625, + "grad_norm_var": 0.016249338785807293, + "learning_rate": 0.0001, + "loss": 4.368, + "loss/crossentropy": 1.954556941986084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19270780682563782, + "step": 15542 + }, + { + "epoch": 0.31088, + "grad_norm": 2.015625, + "grad_norm_var": 0.013575998942057292, + "learning_rate": 0.0001, + "loss": 4.1313, + "loss/crossentropy": 2.0225048661231995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1937834918498993, + "step": 15544 + }, + { + "epoch": 0.31092, + "grad_norm": 2.0625, + "grad_norm_var": 0.011310831705729166, + "learning_rate": 0.0001, + "loss": 4.0919, + "loss/crossentropy": 2.149065852165222, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21962979435920715, + "step": 15546 + }, + { + "epoch": 0.31096, + "grad_norm": 2.171875, + "grad_norm_var": 0.013073476155598958, + "learning_rate": 0.0001, + "loss": 4.0192, + "loss/crossentropy": 1.9546958804130554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20022059231996536, + "step": 15548 + }, + { + "epoch": 0.311, + "grad_norm": 1.90625, + "grad_norm_var": 0.013728841145833334, + "learning_rate": 0.0001, + "loss": 4.2815, + "loss/crossentropy": 2.3048110008239746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2232206016778946, + "step": 15550 + }, + { + "epoch": 0.31104, + "grad_norm": 1.953125, + "grad_norm_var": 0.014525349934895833, + "learning_rate": 0.0001, + "loss": 3.9263, + "loss/crossentropy": 2.0329134464263916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20285522937774658, + "step": 15552 + }, + { + "epoch": 0.31108, + "grad_norm": 2.0, + "grad_norm_var": 0.013138580322265624, + "learning_rate": 0.0001, + "loss": 4.1654, + "loss/crossentropy": 2.0910086631774902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20308854430913925, + "step": 15554 + }, + { + "epoch": 0.31112, + "grad_norm": 1.96875, + "grad_norm_var": 0.012361399332682292, + "learning_rate": 0.0001, + "loss": 4.1922, + "loss/crossentropy": 2.3473485708236694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2138630449771881, + "step": 15556 + }, + { + "epoch": 0.31116, + "grad_norm": 1.984375, + "grad_norm_var": 0.0098297119140625, + "learning_rate": 0.0001, + "loss": 4.3456, + "loss/crossentropy": 2.4176318645477295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23624707758426666, + "step": 15558 + }, + { + "epoch": 0.3112, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009943644205729166, + "learning_rate": 0.0001, + "loss": 4.0898, + "loss/crossentropy": 2.190505266189575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21235676854848862, + "step": 15560 + }, + { + "epoch": 0.31124, + "grad_norm": 2.28125, + "grad_norm_var": 0.1935198465983073, + "learning_rate": 0.0001, + "loss": 4.3688, + "loss/crossentropy": 2.1774531602859497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2568993717432022, + "step": 15562 + }, + { + "epoch": 0.31128, + "grad_norm": 1.9609375, + "grad_norm_var": 0.1934832255045573, + "learning_rate": 0.0001, + "loss": 4.004, + "loss/crossentropy": 2.0320950150489807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21587081998586655, + "step": 15564 + }, + { + "epoch": 0.31132, + "grad_norm": 2.125, + "grad_norm_var": 0.19068781534830728, + "learning_rate": 0.0001, + "loss": 4.3037, + "loss/crossentropy": 2.0477761030197144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21909283846616745, + "step": 15566 + }, + { + "epoch": 0.31136, + "grad_norm": 2.09375, + "grad_norm_var": 0.18454996744791666, + "learning_rate": 0.0001, + "loss": 4.2288, + "loss/crossentropy": 2.3907222747802734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21334309875965118, + "step": 15568 + }, + { + "epoch": 0.3114, + "grad_norm": 1.984375, + "grad_norm_var": 0.1816973368326823, + "learning_rate": 0.0001, + "loss": 4.28, + "loss/crossentropy": 2.2539732456207275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21502045542001724, + "step": 15570 + }, + { + "epoch": 0.31144, + "grad_norm": 1.96875, + "grad_norm_var": 0.17766520182291667, + "learning_rate": 0.0001, + "loss": 3.9675, + "loss/crossentropy": 2.036223292350769, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1959664672613144, + "step": 15572 + }, + { + "epoch": 0.31148, + "grad_norm": 1.984375, + "grad_norm_var": 0.17722880045572917, + "learning_rate": 0.0001, + "loss": 4.0387, + "loss/crossentropy": 1.7659193873405457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19535844773054123, + "step": 15574 + }, + { + "epoch": 0.31152, + "grad_norm": 2.015625, + "grad_norm_var": 0.17946751912434897, + "learning_rate": 0.0001, + "loss": 3.8892, + "loss/crossentropy": 2.0053369402885437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24516403675079346, + "step": 15576 + }, + { + "epoch": 0.31156, + "grad_norm": 2.171875, + "grad_norm_var": 0.0060384114583333336, + "learning_rate": 0.0001, + "loss": 3.9851, + "loss/crossentropy": 2.1804131269454956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2349890023469925, + "step": 15578 + }, + { + "epoch": 0.3116, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008552042643229167, + "learning_rate": 0.0001, + "loss": 3.9369, + "loss/crossentropy": 2.314085602760315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21073052287101746, + "step": 15580 + }, + { + "epoch": 0.31164, + "grad_norm": 1.8203125, + "grad_norm_var": 0.008973948160807292, + "learning_rate": 0.0001, + "loss": 3.7951, + "loss/crossentropy": 2.1868577003479004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21520060300827026, + "step": 15582 + }, + { + "epoch": 0.31168, + "grad_norm": 1.953125, + "grad_norm_var": 0.007957967122395833, + "learning_rate": 0.0001, + "loss": 4.1618, + "loss/crossentropy": 1.9785407185554504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18188606202602386, + "step": 15584 + }, + { + "epoch": 0.31172, + "grad_norm": 2.109375, + "grad_norm_var": 0.009598795572916667, + "learning_rate": 0.0001, + "loss": 3.9173, + "loss/crossentropy": 1.9320110082626343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2022753208875656, + "step": 15586 + }, + { + "epoch": 0.31176, + "grad_norm": 2.28125, + "grad_norm_var": 0.015653483072916665, + "learning_rate": 0.0001, + "loss": 4.0995, + "loss/crossentropy": 1.7925443649291992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17203447967767715, + "step": 15588 + }, + { + "epoch": 0.3118, + "grad_norm": 1.9609375, + "grad_norm_var": 0.016556803385416666, + "learning_rate": 0.0001, + "loss": 3.8876, + "loss/crossentropy": 2.048543393611908, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18696243315935135, + "step": 15590 + }, + { + "epoch": 0.31184, + "grad_norm": 1.921875, + "grad_norm_var": 0.01658935546875, + "learning_rate": 0.0001, + "loss": 4.125, + "loss/crossentropy": 2.376060724258423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20351988822221756, + "step": 15592 + }, + { + "epoch": 0.31188, + "grad_norm": 2.171875, + "grad_norm_var": 0.0164947509765625, + "learning_rate": 0.0001, + "loss": 4.1778, + "loss/crossentropy": 1.5962707996368408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1538907140493393, + "step": 15594 + }, + { + "epoch": 0.31192, + "grad_norm": 1.9453125, + "grad_norm_var": 0.016281890869140624, + "learning_rate": 0.0001, + "loss": 3.7529, + "loss/crossentropy": 1.7845428586006165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18107154965400696, + "step": 15596 + }, + { + "epoch": 0.31196, + "grad_norm": 2.109375, + "grad_norm_var": 0.01536865234375, + "learning_rate": 0.0001, + "loss": 4.0385, + "loss/crossentropy": 1.8557100296020508, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2010527327656746, + "step": 15598 + }, + { + "epoch": 0.312, + "grad_norm": 1.921875, + "grad_norm_var": 0.015372721354166667, + "learning_rate": 0.0001, + "loss": 3.9735, + "loss/crossentropy": 1.8659257888793945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19198190420866013, + "step": 15600 + }, + { + "epoch": 0.31204, + "grad_norm": 2.046875, + "grad_norm_var": 0.014078521728515625, + "learning_rate": 0.0001, + "loss": 3.9842, + "loss/crossentropy": 2.0340747833251953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19963373243808746, + "step": 15602 + }, + { + "epoch": 0.31208, + "grad_norm": 2.09375, + "grad_norm_var": 0.00931396484375, + "learning_rate": 0.0001, + "loss": 4.1192, + "loss/crossentropy": 2.0676557421684265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20130064338445663, + "step": 15604 + }, + { + "epoch": 0.31212, + "grad_norm": 1.84375, + "grad_norm_var": 0.008397420247395834, + "learning_rate": 0.0001, + "loss": 3.8069, + "loss/crossentropy": 2.038145124912262, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2034953236579895, + "step": 15606 + }, + { + "epoch": 0.31216, + "grad_norm": 2.65625, + "grad_norm_var": 0.03449605305989583, + "learning_rate": 0.0001, + "loss": 4.3615, + "loss/crossentropy": 2.3371706008911133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21727421879768372, + "step": 15608 + }, + { + "epoch": 0.3122, + "grad_norm": 2.03125, + "grad_norm_var": 0.03327611287434896, + "learning_rate": 0.0001, + "loss": 4.1988, + "loss/crossentropy": 2.3314318656921387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2064831778407097, + "step": 15610 + }, + { + "epoch": 0.31224, + "grad_norm": 1.9375, + "grad_norm_var": 0.03206558227539062, + "learning_rate": 0.0001, + "loss": 3.9634, + "loss/crossentropy": 1.8074566721916199, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1755329668521881, + "step": 15612 + }, + { + "epoch": 0.31228, + "grad_norm": 1.96875, + "grad_norm_var": 0.032138824462890625, + "learning_rate": 0.0001, + "loss": 4.1849, + "loss/crossentropy": 2.1219228506088257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23183195292949677, + "step": 15614 + }, + { + "epoch": 0.31232, + "grad_norm": 2.03125, + "grad_norm_var": 0.031434885660807294, + "learning_rate": 0.0001, + "loss": 4.1418, + "loss/crossentropy": 2.3573983907699585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20713336020708084, + "step": 15616 + }, + { + "epoch": 0.31236, + "grad_norm": 1.859375, + "grad_norm_var": 0.033933258056640624, + "learning_rate": 0.0001, + "loss": 4.048, + "loss/crossentropy": 2.0748316049575806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19384868443012238, + "step": 15618 + }, + { + "epoch": 0.3124, + "grad_norm": 2.03125, + "grad_norm_var": 0.033882395426432295, + "learning_rate": 0.0001, + "loss": 4.1834, + "loss/crossentropy": 1.7933568358421326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19711802154779434, + "step": 15620 + }, + { + "epoch": 0.31244, + "grad_norm": 2.046875, + "grad_norm_var": 0.03115819295247396, + "learning_rate": 0.0001, + "loss": 4.2023, + "loss/crossentropy": 1.8922778367996216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19847699999809265, + "step": 15622 + }, + { + "epoch": 0.31248, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0048004150390625, + "learning_rate": 0.0001, + "loss": 4.2326, + "loss/crossentropy": 2.3432679176330566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22804179042577744, + "step": 15624 + }, + { + "epoch": 0.31252, + "grad_norm": 2.203125, + "grad_norm_var": 0.007469685872395834, + "learning_rate": 0.0001, + "loss": 4.1215, + "loss/crossentropy": 2.340356707572937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22454006224870682, + "step": 15626 + }, + { + "epoch": 0.31256, + "grad_norm": 1.8046875, + "grad_norm_var": 0.010737864176432292, + "learning_rate": 0.0001, + "loss": 4.0874, + "loss/crossentropy": 2.037777841091156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20374249666929245, + "step": 15628 + }, + { + "epoch": 0.3126, + "grad_norm": 2.078125, + "grad_norm_var": 0.010562896728515625, + "learning_rate": 0.0001, + "loss": 4.3193, + "loss/crossentropy": 2.2576274275779724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19910124689340591, + "step": 15630 + }, + { + "epoch": 0.31264, + "grad_norm": 1.8828125, + "grad_norm_var": 0.011302693684895834, + "learning_rate": 0.0001, + "loss": 4.1561, + "loss/crossentropy": 2.1354891061782837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20646820217370987, + "step": 15632 + }, + { + "epoch": 0.31268, + "grad_norm": 2.046875, + "grad_norm_var": 0.011136627197265625, + "learning_rate": 0.0001, + "loss": 3.906, + "loss/crossentropy": 1.8888733386993408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20730097591876984, + "step": 15634 + }, + { + "epoch": 0.31272, + "grad_norm": 1.921875, + "grad_norm_var": 0.012474568684895833, + "learning_rate": 0.0001, + "loss": 3.9522, + "loss/crossentropy": 2.1037773489952087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20308485627174377, + "step": 15636 + }, + { + "epoch": 0.31276, + "grad_norm": 2.21875, + "grad_norm_var": 0.017085774739583334, + "learning_rate": 0.0001, + "loss": 4.244, + "loss/crossentropy": 1.996969223022461, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2080031782388687, + "step": 15638 + }, + { + "epoch": 0.3128, + "grad_norm": 2.671875, + "grad_norm_var": 0.0469146728515625, + "learning_rate": 0.0001, + "loss": 3.915, + "loss/crossentropy": 2.0528018474578857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2248053327202797, + "step": 15640 + }, + { + "epoch": 0.31284, + "grad_norm": 1.9765625, + "grad_norm_var": 0.044864654541015625, + "learning_rate": 0.0001, + "loss": 4.1718, + "loss/crossentropy": 2.270516335964203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19578035175800323, + "step": 15642 + }, + { + "epoch": 0.31288, + "grad_norm": 2.046875, + "grad_norm_var": 0.0400299072265625, + "learning_rate": 0.0001, + "loss": 4.1463, + "loss/crossentropy": 2.146915912628174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21282948553562164, + "step": 15644 + }, + { + "epoch": 0.31292, + "grad_norm": 2.125, + "grad_norm_var": 0.041562652587890624, + "learning_rate": 0.0001, + "loss": 4.3275, + "loss/crossentropy": 2.058235287666321, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20604285597801208, + "step": 15646 + }, + { + "epoch": 0.31296, + "grad_norm": 2.015625, + "grad_norm_var": 0.0414215087890625, + "learning_rate": 0.0001, + "loss": 4.4304, + "loss/crossentropy": 2.3217705488204956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24202678352594376, + "step": 15648 + }, + { + "epoch": 0.313, + "grad_norm": 2.0625, + "grad_norm_var": 0.041257476806640624, + "learning_rate": 0.0001, + "loss": 4.1372, + "loss/crossentropy": 2.108501672744751, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20358111709356308, + "step": 15650 + }, + { + "epoch": 0.31304, + "grad_norm": 1.859375, + "grad_norm_var": 0.041025543212890626, + "learning_rate": 0.0001, + "loss": 4.1224, + "loss/crossentropy": 2.0875802636146545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20186328142881393, + "step": 15652 + }, + { + "epoch": 0.31308, + "grad_norm": 2.0625, + "grad_norm_var": 0.036321767171223956, + "learning_rate": 0.0001, + "loss": 4.1994, + "loss/crossentropy": 2.0159415006637573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2053827941417694, + "step": 15654 + }, + { + "epoch": 0.31312, + "grad_norm": 2.015625, + "grad_norm_var": 0.008402252197265625, + "learning_rate": 0.0001, + "loss": 3.9711, + "loss/crossentropy": 1.8664127588272095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18666206300258636, + "step": 15656 + }, + { + "epoch": 0.31316, + "grad_norm": 1.9609375, + "grad_norm_var": 0.008548736572265625, + "learning_rate": 0.0001, + "loss": 3.9399, + "loss/crossentropy": 1.9351357221603394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.189142145216465, + "step": 15658 + }, + { + "epoch": 0.3132, + "grad_norm": 1.96875, + "grad_norm_var": 0.010872141520182291, + "learning_rate": 0.0001, + "loss": 4.3288, + "loss/crossentropy": 1.9937176704406738, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18212904781103134, + "step": 15660 + }, + { + "epoch": 0.31324, + "grad_norm": 2.03125, + "grad_norm_var": 0.009801991780598958, + "learning_rate": 0.0001, + "loss": 4.1238, + "loss/crossentropy": 2.0626447796821594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20908795297145844, + "step": 15662 + }, + { + "epoch": 0.31328, + "grad_norm": 1.9375, + "grad_norm_var": 0.008506011962890626, + "learning_rate": 0.0001, + "loss": 4.0436, + "loss/crossentropy": 1.8491488695144653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18241358548402786, + "step": 15664 + }, + { + "epoch": 0.31332, + "grad_norm": 1.8828125, + "grad_norm_var": 0.0084625244140625, + "learning_rate": 0.0001, + "loss": 3.9885, + "loss/crossentropy": 2.2744827270507812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2295297086238861, + "step": 15666 + }, + { + "epoch": 0.31336, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0078521728515625, + "learning_rate": 0.0001, + "loss": 3.9213, + "loss/crossentropy": 2.30733585357666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23149366676807404, + "step": 15668 + }, + { + "epoch": 0.3134, + "grad_norm": 2.140625, + "grad_norm_var": 0.00892333984375, + "learning_rate": 0.0001, + "loss": 4.1357, + "loss/crossentropy": 2.038204550743103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19608831405639648, + "step": 15670 + }, + { + "epoch": 0.31344, + "grad_norm": 1.953125, + "grad_norm_var": 0.008676910400390625, + "learning_rate": 0.0001, + "loss": 4.1235, + "loss/crossentropy": 2.076035261154175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21801196038722992, + "step": 15672 + }, + { + "epoch": 0.31348, + "grad_norm": 1.7734375, + "grad_norm_var": 0.011310831705729166, + "learning_rate": 0.0001, + "loss": 4.0923, + "loss/crossentropy": 1.9234278202056885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18380460888147354, + "step": 15674 + }, + { + "epoch": 0.31352, + "grad_norm": 2.125, + "grad_norm_var": 0.00931396484375, + "learning_rate": 0.0001, + "loss": 4.1002, + "loss/crossentropy": 1.9312421679496765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18474044650793076, + "step": 15676 + }, + { + "epoch": 0.31356, + "grad_norm": 2.0625, + "grad_norm_var": 0.009716542561848958, + "learning_rate": 0.0001, + "loss": 4.079, + "loss/crossentropy": 2.014132261276245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2045312151312828, + "step": 15678 + }, + { + "epoch": 0.3136, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0099365234375, + "learning_rate": 0.0001, + "loss": 4.2325, + "loss/crossentropy": 1.925381362438202, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18306680023670197, + "step": 15680 + }, + { + "epoch": 0.31364, + "grad_norm": 1.8828125, + "grad_norm_var": 0.009346516927083333, + "learning_rate": 0.0001, + "loss": 4.0766, + "loss/crossentropy": 1.932526171207428, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20741897821426392, + "step": 15682 + }, + { + "epoch": 0.31368, + "grad_norm": 2.015625, + "grad_norm_var": 0.008963775634765626, + "learning_rate": 0.0001, + "loss": 4.2234, + "loss/crossentropy": 2.1144750714302063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19398797303438187, + "step": 15684 + }, + { + "epoch": 0.31372, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0076983133951822914, + "learning_rate": 0.0001, + "loss": 3.8411, + "loss/crossentropy": 1.8115296363830566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1869400143623352, + "step": 15686 + }, + { + "epoch": 0.31376, + "grad_norm": 2.296875, + "grad_norm_var": 0.014686838785807291, + "learning_rate": 0.0001, + "loss": 4.0935, + "loss/crossentropy": 2.0550562739372253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2093418911099434, + "step": 15688 + }, + { + "epoch": 0.3138, + "grad_norm": 1.8046875, + "grad_norm_var": 0.013541412353515626, + "learning_rate": 0.0001, + "loss": 3.8549, + "loss/crossentropy": 2.144737482070923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20025025308132172, + "step": 15690 + }, + { + "epoch": 0.31384, + "grad_norm": 1.8984375, + "grad_norm_var": 0.012629191080729166, + "learning_rate": 0.0001, + "loss": 3.9911, + "loss/crossentropy": 2.2495052814483643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21178698539733887, + "step": 15692 + }, + { + "epoch": 0.31388, + "grad_norm": 2.03125, + "grad_norm_var": 0.012740071614583333, + "learning_rate": 0.0001, + "loss": 4.2208, + "loss/crossentropy": 2.1858155727386475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20891498774290085, + "step": 15694 + }, + { + "epoch": 0.31392, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0131744384765625, + "learning_rate": 0.0001, + "loss": 4.1841, + "loss/crossentropy": 2.1480127573013306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2149479240179062, + "step": 15696 + }, + { + "epoch": 0.31396, + "grad_norm": 1.9921875, + "grad_norm_var": 0.013329060872395833, + "learning_rate": 0.0001, + "loss": 3.9218, + "loss/crossentropy": 2.1585733294487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20317788422107697, + "step": 15698 + }, + { + "epoch": 0.314, + "grad_norm": 1.9609375, + "grad_norm_var": 0.2103179931640625, + "learning_rate": 0.0001, + "loss": 4.06, + "loss/crossentropy": 2.1281010508537292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2032032608985901, + "step": 15700 + }, + { + "epoch": 0.31404, + "grad_norm": 1.9375, + "grad_norm_var": 0.2096588134765625, + "learning_rate": 0.0001, + "loss": 4.0189, + "loss/crossentropy": 2.160776972770691, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20999448001384735, + "step": 15702 + }, + { + "epoch": 0.31408, + "grad_norm": 1.9140625, + "grad_norm_var": 0.20640869140625, + "learning_rate": 0.0001, + "loss": 3.961, + "loss/crossentropy": 2.1508986949920654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21675271540880203, + "step": 15704 + }, + { + "epoch": 0.31412, + "grad_norm": 2.109375, + "grad_norm_var": 0.2011871337890625, + "learning_rate": 0.0001, + "loss": 4.1573, + "loss/crossentropy": 2.083084225654602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21110595017671585, + "step": 15706 + }, + { + "epoch": 0.31416, + "grad_norm": 1.8046875, + "grad_norm_var": 0.20275472005208334, + "learning_rate": 0.0001, + "loss": 3.9785, + "loss/crossentropy": 2.1268805265426636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20728368312120438, + "step": 15708 + }, + { + "epoch": 0.3142, + "grad_norm": 2.078125, + "grad_norm_var": 0.20265706380208334, + "learning_rate": 0.0001, + "loss": 4.4291, + "loss/crossentropy": 2.156775116920471, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22175253927707672, + "step": 15710 + }, + { + "epoch": 0.31424, + "grad_norm": 1.984375, + "grad_norm_var": 0.20224583943684896, + "learning_rate": 0.0001, + "loss": 4.1537, + "loss/crossentropy": 2.2541415691375732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22773633152246475, + "step": 15712 + }, + { + "epoch": 0.31428, + "grad_norm": 2.0625, + "grad_norm_var": 0.20350723266601561, + "learning_rate": 0.0001, + "loss": 3.9166, + "loss/crossentropy": 1.6463716626167297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18172992020845413, + "step": 15714 + }, + { + "epoch": 0.31432, + "grad_norm": 2.03125, + "grad_norm_var": 0.011669921875, + "learning_rate": 0.0001, + "loss": 4.0144, + "loss/crossentropy": 1.9325580596923828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19147639721632004, + "step": 15716 + }, + { + "epoch": 0.31436, + "grad_norm": 2.109375, + "grad_norm_var": 0.014115142822265624, + "learning_rate": 0.0001, + "loss": 3.9952, + "loss/crossentropy": 2.4016847610473633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2058410421013832, + "step": 15718 + }, + { + "epoch": 0.3144, + "grad_norm": 2.125, + "grad_norm_var": 0.014338175455729166, + "learning_rate": 0.0001, + "loss": 4.1847, + "loss/crossentropy": 2.3456791639328003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22646376490592957, + "step": 15720 + }, + { + "epoch": 0.31444, + "grad_norm": 2.046875, + "grad_norm_var": 0.013785807291666667, + "learning_rate": 0.0001, + "loss": 4.0551, + "loss/crossentropy": 1.6703909635543823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1818019300699234, + "step": 15722 + }, + { + "epoch": 0.31448, + "grad_norm": 1.953125, + "grad_norm_var": 0.011065419514973958, + "learning_rate": 0.0001, + "loss": 4.0561, + "loss/crossentropy": 2.0004162192344666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2033773437142372, + "step": 15724 + }, + { + "epoch": 0.31452, + "grad_norm": 2.0, + "grad_norm_var": 0.009905751546223958, + "learning_rate": 0.0001, + "loss": 4.1799, + "loss/crossentropy": 2.018653154373169, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20320426672697067, + "step": 15726 + }, + { + "epoch": 0.31456, + "grad_norm": 2.046875, + "grad_norm_var": 0.009592437744140625, + "learning_rate": 0.0001, + "loss": 4.0991, + "loss/crossentropy": 2.0881033539772034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2126169577240944, + "step": 15728 + }, + { + "epoch": 0.3146, + "grad_norm": 2.046875, + "grad_norm_var": 0.0067942301432291664, + "learning_rate": 0.0001, + "loss": 4.2159, + "loss/crossentropy": 2.1103954911231995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2163369059562683, + "step": 15730 + }, + { + "epoch": 0.31464, + "grad_norm": 2.015625, + "grad_norm_var": 0.007868448893229166, + "learning_rate": 0.0001, + "loss": 4.142, + "loss/crossentropy": 1.9757474660873413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21398526430130005, + "step": 15732 + }, + { + "epoch": 0.31468, + "grad_norm": 1.921875, + "grad_norm_var": 0.006304677327473958, + "learning_rate": 0.0001, + "loss": 3.8965, + "loss/crossentropy": 2.114749312400818, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21620750427246094, + "step": 15734 + }, + { + "epoch": 0.31472, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007163238525390625, + "learning_rate": 0.0001, + "loss": 3.9688, + "loss/crossentropy": 1.9022989869117737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1895657405257225, + "step": 15736 + }, + { + "epoch": 0.31476, + "grad_norm": 2.296875, + "grad_norm_var": 0.013059234619140625, + "learning_rate": 0.0001, + "loss": 4.2254, + "loss/crossentropy": 2.0457635521888733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21281428635120392, + "step": 15738 + }, + { + "epoch": 0.3148, + "grad_norm": 2.015625, + "grad_norm_var": 0.011954752604166667, + "learning_rate": 0.0001, + "loss": 3.8985, + "loss/crossentropy": 1.9585834741592407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2003040909767151, + "step": 15740 + }, + { + "epoch": 0.31484, + "grad_norm": 2.046875, + "grad_norm_var": 0.014745076497395834, + "learning_rate": 0.0001, + "loss": 4.1682, + "loss/crossentropy": 2.06454074382782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22020584344863892, + "step": 15742 + }, + { + "epoch": 0.31488, + "grad_norm": 1.921875, + "grad_norm_var": 0.015665690104166668, + "learning_rate": 0.0001, + "loss": 4.2384, + "loss/crossentropy": 2.2651820182800293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22998705506324768, + "step": 15744 + }, + { + "epoch": 0.31492, + "grad_norm": 1.9375, + "grad_norm_var": 0.0172607421875, + "learning_rate": 0.0001, + "loss": 4.258, + "loss/crossentropy": 2.269286036491394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21012098342180252, + "step": 15746 + }, + { + "epoch": 0.31496, + "grad_norm": 1.953125, + "grad_norm_var": 0.017891438802083333, + "learning_rate": 0.0001, + "loss": 4.0604, + "loss/crossentropy": 1.6778026223182678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17330823838710785, + "step": 15748 + }, + { + "epoch": 0.315, + "grad_norm": 1.78125, + "grad_norm_var": 0.020775349934895833, + "learning_rate": 0.0001, + "loss": 3.9785, + "loss/crossentropy": 1.8929405808448792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18170948326587677, + "step": 15750 + }, + { + "epoch": 0.31504, + "grad_norm": 1.953125, + "grad_norm_var": 0.01980768839518229, + "learning_rate": 0.0001, + "loss": 4.0874, + "loss/crossentropy": 1.8814340233802795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2097821682691574, + "step": 15752 + }, + { + "epoch": 0.31508, + "grad_norm": 2.09375, + "grad_norm_var": 0.014465077718098959, + "learning_rate": 0.0001, + "loss": 3.8622, + "loss/crossentropy": 1.7101264595985413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1834770292043686, + "step": 15754 + }, + { + "epoch": 0.31512, + "grad_norm": 1.828125, + "grad_norm_var": 0.01607233683268229, + "learning_rate": 0.0001, + "loss": 3.8498, + "loss/crossentropy": 2.0287395119667053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19603880494832993, + "step": 15756 + }, + { + "epoch": 0.31516, + "grad_norm": 1.875, + "grad_norm_var": 0.010467274983723959, + "learning_rate": 0.0001, + "loss": 3.7485, + "loss/crossentropy": 2.0119568705558777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.212313212454319, + "step": 15758 + }, + { + "epoch": 0.3152, + "grad_norm": 2.015625, + "grad_norm_var": 0.011441802978515625, + "learning_rate": 0.0001, + "loss": 4.5141, + "loss/crossentropy": 1.8937278985977173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1774902269244194, + "step": 15760 + }, + { + "epoch": 0.31524, + "grad_norm": 2.015625, + "grad_norm_var": 0.009104156494140625, + "learning_rate": 0.0001, + "loss": 4.3562, + "loss/crossentropy": 2.557780146598816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22111114859580994, + "step": 15762 + }, + { + "epoch": 0.31528, + "grad_norm": 2.1875, + "grad_norm_var": 0.011262003580729167, + "learning_rate": 0.0001, + "loss": 4.1646, + "loss/crossentropy": 2.1467400789260864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2134222611784935, + "step": 15764 + }, + { + "epoch": 0.31532, + "grad_norm": 2.03125, + "grad_norm_var": 0.00814208984375, + "learning_rate": 0.0001, + "loss": 3.9631, + "loss/crossentropy": 1.9758468270301819, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1853625252842903, + "step": 15766 + }, + { + "epoch": 0.31536, + "grad_norm": 2.171875, + "grad_norm_var": 0.009699503580729166, + "learning_rate": 0.0001, + "loss": 4.2144, + "loss/crossentropy": 1.8620794415473938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18791437149047852, + "step": 15768 + }, + { + "epoch": 0.3154, + "grad_norm": 2.03125, + "grad_norm_var": 0.008893839518229167, + "learning_rate": 0.0001, + "loss": 4.2066, + "loss/crossentropy": 2.187020480632782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2102060467004776, + "step": 15770 + }, + { + "epoch": 0.31544, + "grad_norm": 2.03125, + "grad_norm_var": 0.006522369384765625, + "learning_rate": 0.0001, + "loss": 4.1828, + "loss/crossentropy": 2.0049954652786255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19090547412633896, + "step": 15772 + }, + { + "epoch": 0.31548, + "grad_norm": 2.03125, + "grad_norm_var": 0.010357411702473958, + "learning_rate": 0.0001, + "loss": 4.195, + "loss/crossentropy": 2.0030736327171326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20134840160608292, + "step": 15774 + }, + { + "epoch": 0.31552, + "grad_norm": 2.0, + "grad_norm_var": 0.010687001546223958, + "learning_rate": 0.0001, + "loss": 4.3092, + "loss/crossentropy": 1.8941256999969482, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20826192945241928, + "step": 15776 + }, + { + "epoch": 0.31556, + "grad_norm": 1.8046875, + "grad_norm_var": 0.015596516927083333, + "learning_rate": 0.0001, + "loss": 4.0767, + "loss/crossentropy": 2.0645129680633545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19137012213468552, + "step": 15778 + }, + { + "epoch": 0.3156, + "grad_norm": 1.9609375, + "grad_norm_var": 0.014240519205729166, + "learning_rate": 0.0001, + "loss": 4.0349, + "loss/crossentropy": 2.198352813720703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21606199443340302, + "step": 15780 + }, + { + "epoch": 0.31564, + "grad_norm": 2.125, + "grad_norm_var": 0.014020792643229167, + "learning_rate": 0.0001, + "loss": 4.1544, + "loss/crossentropy": 1.9041627049446106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19438787549734116, + "step": 15782 + }, + { + "epoch": 0.31568, + "grad_norm": 2.078125, + "grad_norm_var": 0.027741495768229166, + "learning_rate": 0.0001, + "loss": 4.2615, + "loss/crossentropy": 2.110167443752289, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2026640698313713, + "step": 15784 + }, + { + "epoch": 0.31572, + "grad_norm": 2.171875, + "grad_norm_var": 0.02655029296875, + "learning_rate": 0.0001, + "loss": 4.1932, + "loss/crossentropy": 2.5422832369804382, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20274986326694489, + "step": 15786 + }, + { + "epoch": 0.31576, + "grad_norm": 1.921875, + "grad_norm_var": 0.02840576171875, + "learning_rate": 0.0001, + "loss": 4.2038, + "loss/crossentropy": 2.106445074081421, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20275159180164337, + "step": 15788 + }, + { + "epoch": 0.3158, + "grad_norm": 2.375, + "grad_norm_var": 0.029857381184895834, + "learning_rate": 0.0001, + "loss": 4.2718, + "loss/crossentropy": 2.341952681541443, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22970512509346008, + "step": 15790 + }, + { + "epoch": 0.31584, + "grad_norm": 1.96875, + "grad_norm_var": 0.030228678385416666, + "learning_rate": 0.0001, + "loss": 4.238, + "loss/crossentropy": 2.120057761669159, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21171308308839798, + "step": 15792 + }, + { + "epoch": 0.31588, + "grad_norm": 1.8671875, + "grad_norm_var": 0.027644856770833334, + "learning_rate": 0.0001, + "loss": 3.8819, + "loss/crossentropy": 1.5601414442062378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20278310775756836, + "step": 15794 + }, + { + "epoch": 0.31592, + "grad_norm": 1.984375, + "grad_norm_var": 0.027596028645833333, + "learning_rate": 0.0001, + "loss": 4.0743, + "loss/crossentropy": 2.0634138584136963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1815958246588707, + "step": 15796 + }, + { + "epoch": 0.31596, + "grad_norm": 1.953125, + "grad_norm_var": 0.0312652587890625, + "learning_rate": 0.0001, + "loss": 3.8183, + "loss/crossentropy": 1.8437300324440002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18099966645240784, + "step": 15798 + }, + { + "epoch": 0.316, + "grad_norm": 1.984375, + "grad_norm_var": 0.015897369384765624, + "learning_rate": 0.0001, + "loss": 4.1102, + "loss/crossentropy": 1.9323074221611023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1884680539369583, + "step": 15800 + }, + { + "epoch": 0.31604, + "grad_norm": 2.265625, + "grad_norm_var": 0.018027496337890626, + "learning_rate": 0.0001, + "loss": 4.3145, + "loss/crossentropy": 2.158499598503113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23394601047039032, + "step": 15802 + }, + { + "epoch": 0.31608, + "grad_norm": 1.9921875, + "grad_norm_var": 0.018062337239583334, + "learning_rate": 0.0001, + "loss": 4.0894, + "loss/crossentropy": 1.9635908007621765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20692527294158936, + "step": 15804 + }, + { + "epoch": 0.31612, + "grad_norm": 1.8671875, + "grad_norm_var": 0.010503896077473958, + "learning_rate": 0.0001, + "loss": 4.1912, + "loss/crossentropy": 1.96099454164505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1971232146024704, + "step": 15806 + }, + { + "epoch": 0.31616, + "grad_norm": 2.09375, + "grad_norm_var": 0.010643513997395833, + "learning_rate": 0.0001, + "loss": 4.2205, + "loss/crossentropy": 1.9690999388694763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19965435564517975, + "step": 15808 + }, + { + "epoch": 0.3162, + "grad_norm": 2.015625, + "grad_norm_var": 0.011354319254557292, + "learning_rate": 0.0001, + "loss": 4.113, + "loss/crossentropy": 2.3924723863601685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2192596197128296, + "step": 15810 + }, + { + "epoch": 0.31624, + "grad_norm": 1.7890625, + "grad_norm_var": 0.013999176025390626, + "learning_rate": 0.0001, + "loss": 3.8794, + "loss/crossentropy": 2.137382209300995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20949169248342514, + "step": 15812 + }, + { + "epoch": 0.31628, + "grad_norm": 2.0625, + "grad_norm_var": 0.013103993733723958, + "learning_rate": 0.0001, + "loss": 4.0187, + "loss/crossentropy": 1.986245334148407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19678276032209396, + "step": 15814 + }, + { + "epoch": 0.31632, + "grad_norm": 1.8984375, + "grad_norm_var": 0.013492584228515625, + "learning_rate": 0.0001, + "loss": 3.9623, + "loss/crossentropy": 1.9210307598114014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18748773634433746, + "step": 15816 + }, + { + "epoch": 0.31636, + "grad_norm": 2.0625, + "grad_norm_var": 0.0087066650390625, + "learning_rate": 0.0001, + "loss": 4.1328, + "loss/crossentropy": 1.9330076575279236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21187862008810043, + "step": 15818 + }, + { + "epoch": 0.3164, + "grad_norm": 2.015625, + "grad_norm_var": 0.007306925455729167, + "learning_rate": 0.0001, + "loss": 4.2475, + "loss/crossentropy": 2.275822639465332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2303369864821434, + "step": 15820 + }, + { + "epoch": 0.31644, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0082275390625, + "learning_rate": 0.0001, + "loss": 4.3641, + "loss/crossentropy": 1.9317327737808228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19543734192848206, + "step": 15822 + }, + { + "epoch": 0.31648, + "grad_norm": 2.109375, + "grad_norm_var": 0.008715565999348958, + "learning_rate": 0.0001, + "loss": 4.3209, + "loss/crossentropy": 2.2146100997924805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22172697633504868, + "step": 15824 + }, + { + "epoch": 0.31652, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007173411051432292, + "learning_rate": 0.0001, + "loss": 4.0226, + "loss/crossentropy": 2.296873092651367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22240595519542694, + "step": 15826 + }, + { + "epoch": 0.31656, + "grad_norm": 1.953125, + "grad_norm_var": 0.005900065104166667, + "learning_rate": 0.0001, + "loss": 4.2914, + "loss/crossentropy": 2.130843997001648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21288827806711197, + "step": 15828 + }, + { + "epoch": 0.3166, + "grad_norm": 1.9609375, + "grad_norm_var": 0.017856597900390625, + "learning_rate": 0.0001, + "loss": 4.1067, + "loss/crossentropy": 2.042823553085327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1972874104976654, + "step": 15830 + }, + { + "epoch": 0.31664, + "grad_norm": 1.9453125, + "grad_norm_var": 0.01701838175455729, + "learning_rate": 0.0001, + "loss": 3.8878, + "loss/crossentropy": 1.8040228486061096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19943677634000778, + "step": 15832 + }, + { + "epoch": 0.31668, + "grad_norm": 2.1875, + "grad_norm_var": 0.018400065104166665, + "learning_rate": 0.0001, + "loss": 4.2647, + "loss/crossentropy": 2.085720181465149, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2592329904437065, + "step": 15834 + }, + { + "epoch": 0.31672, + "grad_norm": 2.046875, + "grad_norm_var": 0.022334798177083334, + "learning_rate": 0.0001, + "loss": 3.7694, + "loss/crossentropy": 1.9271164536476135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1766722872853279, + "step": 15836 + }, + { + "epoch": 0.31676, + "grad_norm": 2.09375, + "grad_norm_var": 0.022106679280598958, + "learning_rate": 0.0001, + "loss": 4.0742, + "loss/crossentropy": 2.188960611820221, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19409415870904922, + "step": 15838 + }, + { + "epoch": 0.3168, + "grad_norm": 2.0, + "grad_norm_var": 0.020949045817057293, + "learning_rate": 0.0001, + "loss": 3.7821, + "loss/crossentropy": 1.5765752792358398, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16517025232315063, + "step": 15840 + }, + { + "epoch": 0.31684, + "grad_norm": 2.203125, + "grad_norm_var": 0.020804595947265626, + "learning_rate": 0.0001, + "loss": 4.3356, + "loss/crossentropy": 1.9913761019706726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20838972181081772, + "step": 15842 + }, + { + "epoch": 0.31688, + "grad_norm": 1.984375, + "grad_norm_var": 0.034366607666015625, + "learning_rate": 0.0001, + "loss": 4.3525, + "loss/crossentropy": 1.8618363738059998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19304241985082626, + "step": 15844 + }, + { + "epoch": 0.31692, + "grad_norm": 1.9375, + "grad_norm_var": 0.026911417643229168, + "learning_rate": 0.0001, + "loss": 3.8087, + "loss/crossentropy": 1.9099775552749634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2251715511083603, + "step": 15846 + }, + { + "epoch": 0.31696, + "grad_norm": 1.953125, + "grad_norm_var": 0.026486968994140624, + "learning_rate": 0.0001, + "loss": 4.0781, + "loss/crossentropy": 2.3534491062164307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19920790195465088, + "step": 15848 + }, + { + "epoch": 0.317, + "grad_norm": 2.203125, + "grad_norm_var": 0.02804743448893229, + "learning_rate": 0.0001, + "loss": 4.1676, + "loss/crossentropy": 2.1890534162521362, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2140251249074936, + "step": 15850 + }, + { + "epoch": 0.31704, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0298980712890625, + "learning_rate": 0.0001, + "loss": 3.6303, + "loss/crossentropy": 1.880380094051361, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18094944953918457, + "step": 15852 + }, + { + "epoch": 0.31708, + "grad_norm": 2.4375, + "grad_norm_var": 0.03880106608072917, + "learning_rate": 0.0001, + "loss": 4.3276, + "loss/crossentropy": 1.8723257184028625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2101321741938591, + "step": 15854 + }, + { + "epoch": 0.31712, + "grad_norm": 1.953125, + "grad_norm_var": 0.039582316080729166, + "learning_rate": 0.0001, + "loss": 4.077, + "loss/crossentropy": 2.0442943572998047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1947987675666809, + "step": 15856 + }, + { + "epoch": 0.31716, + "grad_norm": 2.0625, + "grad_norm_var": 0.03819071451822917, + "learning_rate": 0.0001, + "loss": 4.2969, + "loss/crossentropy": 2.0415892601013184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2022911012172699, + "step": 15858 + }, + { + "epoch": 0.3172, + "grad_norm": 1.875, + "grad_norm_var": 0.02176513671875, + "learning_rate": 0.0001, + "loss": 3.8801, + "loss/crossentropy": 1.8290876150131226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1914537101984024, + "step": 15860 + }, + { + "epoch": 0.31724, + "grad_norm": 1.859375, + "grad_norm_var": 0.022904459635416666, + "learning_rate": 0.0001, + "loss": 3.8257, + "loss/crossentropy": 1.6169148087501526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21117067337036133, + "step": 15862 + }, + { + "epoch": 0.31728, + "grad_norm": 2.0, + "grad_norm_var": 0.022648111979166666, + "learning_rate": 0.0001, + "loss": 4.2553, + "loss/crossentropy": 2.3701289892196655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2557762861251831, + "step": 15864 + }, + { + "epoch": 0.31732, + "grad_norm": 2.03125, + "grad_norm_var": 0.019657135009765625, + "learning_rate": 0.0001, + "loss": 4.2299, + "loss/crossentropy": 2.2519643306732178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21390919387340546, + "step": 15866 + }, + { + "epoch": 0.31736, + "grad_norm": 2.015625, + "grad_norm_var": 0.015409088134765625, + "learning_rate": 0.0001, + "loss": 4.266, + "loss/crossentropy": 2.3287068009376526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22211260348558426, + "step": 15868 + }, + { + "epoch": 0.3174, + "grad_norm": 1.8515625, + "grad_norm_var": 0.004401652018229166, + "learning_rate": 0.0001, + "loss": 3.9622, + "loss/crossentropy": 2.144728899002075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2148151397705078, + "step": 15870 + }, + { + "epoch": 0.31744, + "grad_norm": 1.8984375, + "grad_norm_var": 0.004847971598307291, + "learning_rate": 0.0001, + "loss": 3.9042, + "loss/crossentropy": 2.120876669883728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21692734956741333, + "step": 15872 + }, + { + "epoch": 0.31748, + "grad_norm": 1.8828125, + "grad_norm_var": 0.004703521728515625, + "learning_rate": 0.0001, + "loss": 3.9254, + "loss/crossentropy": 2.2827231884002686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20714152604341507, + "step": 15874 + }, + { + "epoch": 0.31752, + "grad_norm": 1.84375, + "grad_norm_var": 0.004788970947265625, + "learning_rate": 0.0001, + "loss": 3.8742, + "loss/crossentropy": 1.8011438846588135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19294942915439606, + "step": 15876 + }, + { + "epoch": 0.31756, + "grad_norm": 1.8515625, + "grad_norm_var": 0.0048258463541666664, + "learning_rate": 0.0001, + "loss": 3.9082, + "loss/crossentropy": 1.9359464049339294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19418440759181976, + "step": 15878 + }, + { + "epoch": 0.3176, + "grad_norm": 2.03125, + "grad_norm_var": 0.006371053059895834, + "learning_rate": 0.0001, + "loss": 4.4035, + "loss/crossentropy": 2.220693826675415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2583991438150406, + "step": 15880 + }, + { + "epoch": 0.31764, + "grad_norm": 1.9140625, + "grad_norm_var": 0.006254069010416667, + "learning_rate": 0.0001, + "loss": 3.9762, + "loss/crossentropy": 1.7859990000724792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1756511777639389, + "step": 15882 + }, + { + "epoch": 0.31768, + "grad_norm": 1.9140625, + "grad_norm_var": 0.00587158203125, + "learning_rate": 0.0001, + "loss": 3.7008, + "loss/crossentropy": 1.9461398720741272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18634959310293198, + "step": 15884 + }, + { + "epoch": 0.31772, + "grad_norm": 2.234375, + "grad_norm_var": 0.010833485921223959, + "learning_rate": 0.0001, + "loss": 4.3732, + "loss/crossentropy": 2.1066328287124634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2553827613592148, + "step": 15886 + }, + { + "epoch": 0.31776, + "grad_norm": 2.03125, + "grad_norm_var": 0.011087799072265625, + "learning_rate": 0.0001, + "loss": 3.8751, + "loss/crossentropy": 1.7809287309646606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19362767785787582, + "step": 15888 + }, + { + "epoch": 0.3178, + "grad_norm": 1.921875, + "grad_norm_var": 0.01077880859375, + "learning_rate": 0.0001, + "loss": 4.0424, + "loss/crossentropy": 1.9554992318153381, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18544971197843552, + "step": 15890 + }, + { + "epoch": 0.31784, + "grad_norm": 1.84375, + "grad_norm_var": 0.0327392578125, + "learning_rate": 0.0001, + "loss": 3.9083, + "loss/crossentropy": 1.5843109488487244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15569136291742325, + "step": 15892 + }, + { + "epoch": 0.31788, + "grad_norm": 2.109375, + "grad_norm_var": 0.0307525634765625, + "learning_rate": 0.0001, + "loss": 4.1213, + "loss/crossentropy": 2.2000880241394043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2167157679796219, + "step": 15894 + }, + { + "epoch": 0.31792, + "grad_norm": 1.9765625, + "grad_norm_var": 0.031870269775390626, + "learning_rate": 0.0001, + "loss": 4.265, + "loss/crossentropy": 2.2597590684890747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21171507239341736, + "step": 15896 + }, + { + "epoch": 0.31796, + "grad_norm": 1.9296875, + "grad_norm_var": 0.03181330362955729, + "learning_rate": 0.0001, + "loss": 3.9516, + "loss/crossentropy": 2.0513627529144287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21095634251832962, + "step": 15898 + }, + { + "epoch": 0.318, + "grad_norm": 1.9765625, + "grad_norm_var": 0.029271443684895832, + "learning_rate": 0.0001, + "loss": 4.0667, + "loss/crossentropy": 2.1822222471237183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21117551624774933, + "step": 15900 + }, + { + "epoch": 0.31804, + "grad_norm": 2.125, + "grad_norm_var": 0.027784983317057293, + "learning_rate": 0.0001, + "loss": 4.3526, + "loss/crossentropy": 1.9500460624694824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20769241452217102, + "step": 15902 + }, + { + "epoch": 0.31808, + "grad_norm": 2.0625, + "grad_norm_var": 0.02684326171875, + "learning_rate": 0.0001, + "loss": 4.2862, + "loss/crossentropy": 2.179791212081909, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20147693157196045, + "step": 15904 + }, + { + "epoch": 0.31812, + "grad_norm": 2.03125, + "grad_norm_var": 0.025217437744140626, + "learning_rate": 0.0001, + "loss": 4.0485, + "loss/crossentropy": 1.939364731311798, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18659047782421112, + "step": 15906 + }, + { + "epoch": 0.31816, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0059478759765625, + "learning_rate": 0.0001, + "loss": 3.9782, + "loss/crossentropy": 2.219409167766571, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19967788457870483, + "step": 15908 + }, + { + "epoch": 0.3182, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007741038004557292, + "learning_rate": 0.0001, + "loss": 3.829, + "loss/crossentropy": 1.8363550901412964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18773943930864334, + "step": 15910 + }, + { + "epoch": 0.31824, + "grad_norm": 2.0625, + "grad_norm_var": 0.006151326497395833, + "learning_rate": 0.0001, + "loss": 4.2196, + "loss/crossentropy": 1.779226541519165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20676184445619583, + "step": 15912 + }, + { + "epoch": 0.31828, + "grad_norm": 1.953125, + "grad_norm_var": 0.006172688802083334, + "learning_rate": 0.0001, + "loss": 3.9532, + "loss/crossentropy": 2.1175760626792908, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20498017221689224, + "step": 15914 + }, + { + "epoch": 0.31832, + "grad_norm": 1.9609375, + "grad_norm_var": 0.005893707275390625, + "learning_rate": 0.0001, + "loss": 4.1601, + "loss/crossentropy": 2.0964329838752747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1898122876882553, + "step": 15916 + }, + { + "epoch": 0.31836, + "grad_norm": 1.90625, + "grad_norm_var": 0.005680084228515625, + "learning_rate": 0.0001, + "loss": 3.9093, + "loss/crossentropy": 1.9687228798866272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1931748390197754, + "step": 15918 + }, + { + "epoch": 0.3184, + "grad_norm": 2.015625, + "grad_norm_var": 0.005197906494140625, + "learning_rate": 0.0001, + "loss": 4.0003, + "loss/crossentropy": 1.8344767689704895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19926829636096954, + "step": 15920 + }, + { + "epoch": 0.31844, + "grad_norm": 1.984375, + "grad_norm_var": 0.004443359375, + "learning_rate": 0.0001, + "loss": 4.0426, + "loss/crossentropy": 2.154146194458008, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21284651011228561, + "step": 15922 + }, + { + "epoch": 0.31848, + "grad_norm": 2.03125, + "grad_norm_var": 0.004223378499348959, + "learning_rate": 0.0001, + "loss": 4.2112, + "loss/crossentropy": 1.877986490726471, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18935512751340866, + "step": 15924 + }, + { + "epoch": 0.31852, + "grad_norm": 2.171875, + "grad_norm_var": 0.009297434488932292, + "learning_rate": 0.0001, + "loss": 4.4296, + "loss/crossentropy": 2.361938714981079, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23210977017879486, + "step": 15926 + }, + { + "epoch": 0.31856, + "grad_norm": 2.109375, + "grad_norm_var": 0.01043701171875, + "learning_rate": 0.0001, + "loss": 4.1209, + "loss/crossentropy": 2.3886083364486694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22222661972045898, + "step": 15928 + }, + { + "epoch": 0.3186, + "grad_norm": 2.25, + "grad_norm_var": 0.014806874593098958, + "learning_rate": 0.0001, + "loss": 4.1164, + "loss/crossentropy": 2.073238492012024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20130325853824615, + "step": 15930 + }, + { + "epoch": 0.31864, + "grad_norm": 2.125, + "grad_norm_var": 0.015569814046223958, + "learning_rate": 0.0001, + "loss": 4.1692, + "loss/crossentropy": 1.9937713742256165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.199419766664505, + "step": 15932 + }, + { + "epoch": 0.31868, + "grad_norm": 1.96875, + "grad_norm_var": 0.0135894775390625, + "learning_rate": 0.0001, + "loss": 4.0594, + "loss/crossentropy": 2.003056764602661, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18682510405778885, + "step": 15934 + }, + { + "epoch": 0.31872, + "grad_norm": 1.984375, + "grad_norm_var": 0.0137359619140625, + "learning_rate": 0.0001, + "loss": 3.9555, + "loss/crossentropy": 1.7363762259483337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17377281934022903, + "step": 15936 + }, + { + "epoch": 0.31876, + "grad_norm": 2.0, + "grad_norm_var": 0.014412180582682291, + "learning_rate": 0.0001, + "loss": 4.1321, + "loss/crossentropy": 1.9791433811187744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24057063460350037, + "step": 15938 + }, + { + "epoch": 0.3188, + "grad_norm": 2.0625, + "grad_norm_var": 0.011378733317057292, + "learning_rate": 0.0001, + "loss": 3.9662, + "loss/crossentropy": 1.9962583780288696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19619113206863403, + "step": 15940 + }, + { + "epoch": 0.31884, + "grad_norm": 1.9140625, + "grad_norm_var": 0.01053466796875, + "learning_rate": 0.0001, + "loss": 4.2093, + "loss/crossentropy": 1.8568945527076721, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1979837343096733, + "step": 15942 + }, + { + "epoch": 0.31888, + "grad_norm": 2.09375, + "grad_norm_var": 0.008991495768229166, + "learning_rate": 0.0001, + "loss": 4.0148, + "loss/crossentropy": 2.031070291996002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1951117366552353, + "step": 15944 + }, + { + "epoch": 0.31892, + "grad_norm": 1.9375, + "grad_norm_var": 0.007916005452473958, + "learning_rate": 0.0001, + "loss": 4.0139, + "loss/crossentropy": 2.2226544618606567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21066214889287949, + "step": 15946 + }, + { + "epoch": 0.31896, + "grad_norm": 1.890625, + "grad_norm_var": 0.007783762613932292, + "learning_rate": 0.0001, + "loss": 4.0275, + "loss/crossentropy": 2.090229034423828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.225972518324852, + "step": 15948 + }, + { + "epoch": 0.319, + "grad_norm": 1.90625, + "grad_norm_var": 0.0080078125, + "learning_rate": 0.0001, + "loss": 3.9792, + "loss/crossentropy": 1.8562734723091125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17633049190044403, + "step": 15950 + }, + { + "epoch": 0.31904, + "grad_norm": 2.046875, + "grad_norm_var": 0.00992431640625, + "learning_rate": 0.0001, + "loss": 4.0227, + "loss/crossentropy": 1.6677230596542358, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16521989554166794, + "step": 15952 + }, + { + "epoch": 0.31908, + "grad_norm": 1.8828125, + "grad_norm_var": 0.008939361572265625, + "learning_rate": 0.0001, + "loss": 4.2391, + "loss/crossentropy": 2.1918782591819763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21977706253528595, + "step": 15954 + }, + { + "epoch": 0.31912, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008701324462890625, + "learning_rate": 0.0001, + "loss": 3.8932, + "loss/crossentropy": 1.8623137474060059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1937999278306961, + "step": 15956 + }, + { + "epoch": 0.31916, + "grad_norm": 2.296875, + "grad_norm_var": 0.014839426676432291, + "learning_rate": 0.0001, + "loss": 4.2378, + "loss/crossentropy": 2.262889266014099, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19410346448421478, + "step": 15958 + }, + { + "epoch": 0.3192, + "grad_norm": 1.9609375, + "grad_norm_var": 0.014654286702473958, + "learning_rate": 0.0001, + "loss": 4.0487, + "loss/crossentropy": 1.9620846509933472, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1978064849972725, + "step": 15960 + }, + { + "epoch": 0.31924, + "grad_norm": 1.9375, + "grad_norm_var": 0.014207967122395833, + "learning_rate": 0.0001, + "loss": 4.2746, + "loss/crossentropy": 2.453945517539978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21578459441661835, + "step": 15962 + }, + { + "epoch": 0.31928, + "grad_norm": 1.984375, + "grad_norm_var": 0.013529459635416666, + "learning_rate": 0.0001, + "loss": 3.9692, + "loss/crossentropy": 1.9565781354904175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17789364606142044, + "step": 15964 + }, + { + "epoch": 0.31932, + "grad_norm": 1.9765625, + "grad_norm_var": 0.012589518229166667, + "learning_rate": 0.0001, + "loss": 4.3504, + "loss/crossentropy": 2.1175976991653442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2239348292350769, + "step": 15966 + }, + { + "epoch": 0.31936, + "grad_norm": 1.984375, + "grad_norm_var": 0.010554758707682292, + "learning_rate": 0.0001, + "loss": 4.0815, + "loss/crossentropy": 1.8919953107833862, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19832338392734528, + "step": 15968 + }, + { + "epoch": 0.3194, + "grad_norm": 2.015625, + "grad_norm_var": 0.0126220703125, + "learning_rate": 0.0001, + "loss": 4.1282, + "loss/crossentropy": 1.9757861495018005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19513867795467377, + "step": 15970 + }, + { + "epoch": 0.31944, + "grad_norm": 1.90625, + "grad_norm_var": 0.012630208333333334, + "learning_rate": 0.0001, + "loss": 4.1045, + "loss/crossentropy": 1.8716764450073242, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18571900576353073, + "step": 15972 + }, + { + "epoch": 0.31948, + "grad_norm": 1.984375, + "grad_norm_var": 0.006575520833333333, + "learning_rate": 0.0001, + "loss": 4.2572, + "loss/crossentropy": 2.251496374607086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2199217826128006, + "step": 15974 + }, + { + "epoch": 0.31952, + "grad_norm": 2.140625, + "grad_norm_var": 0.008424631754557292, + "learning_rate": 0.0001, + "loss": 4.1033, + "loss/crossentropy": 1.8639826774597168, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18889226019382477, + "step": 15976 + }, + { + "epoch": 0.31956, + "grad_norm": 1.890625, + "grad_norm_var": 0.008499908447265624, + "learning_rate": 0.0001, + "loss": 3.8386, + "loss/crossentropy": 1.824375331401825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1809096783399582, + "step": 15978 + }, + { + "epoch": 0.3196, + "grad_norm": 2.0625, + "grad_norm_var": 0.010361480712890624, + "learning_rate": 0.0001, + "loss": 4.4576, + "loss/crossentropy": 2.357889413833618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21762560307979584, + "step": 15980 + }, + { + "epoch": 0.31964, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011527252197265626, + "learning_rate": 0.0001, + "loss": 4.0706, + "loss/crossentropy": 2.0944892168045044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18413915485143661, + "step": 15982 + }, + { + "epoch": 0.31968, + "grad_norm": 1.8984375, + "grad_norm_var": 0.011736806233723958, + "learning_rate": 0.0001, + "loss": 3.9458, + "loss/crossentropy": 1.8600184321403503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18580162525177002, + "step": 15984 + }, + { + "epoch": 0.31972, + "grad_norm": 2.03125, + "grad_norm_var": 0.008697255452473959, + "learning_rate": 0.0001, + "loss": 4.1152, + "loss/crossentropy": 1.9213852882385254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20599720627069473, + "step": 15986 + }, + { + "epoch": 0.31976, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0119537353515625, + "learning_rate": 0.0001, + "loss": 4.2187, + "loss/crossentropy": 2.0295584201812744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2125483900308609, + "step": 15988 + }, + { + "epoch": 0.3198, + "grad_norm": 2.09375, + "grad_norm_var": 0.012198638916015626, + "learning_rate": 0.0001, + "loss": 4.2448, + "loss/crossentropy": 2.148836612701416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22027160972356796, + "step": 15990 + }, + { + "epoch": 0.31984, + "grad_norm": 2.03125, + "grad_norm_var": 0.010201009114583333, + "learning_rate": 0.0001, + "loss": 4.122, + "loss/crossentropy": 1.9927499294281006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20413245260715485, + "step": 15992 + }, + { + "epoch": 0.31988, + "grad_norm": 2.21875, + "grad_norm_var": 0.0113433837890625, + "learning_rate": 0.0001, + "loss": 4.2675, + "loss/crossentropy": 2.090071678161621, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21670149266719818, + "step": 15994 + }, + { + "epoch": 0.31992, + "grad_norm": 1.953125, + "grad_norm_var": 0.010074869791666666, + "learning_rate": 0.0001, + "loss": 4.1333, + "loss/crossentropy": 1.8375160098075867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1938413679599762, + "step": 15996 + }, + { + "epoch": 0.31996, + "grad_norm": 1.9140625, + "grad_norm_var": 0.010498046875, + "learning_rate": 0.0001, + "loss": 3.8915, + "loss/crossentropy": 1.5196507573127747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16407200694084167, + "step": 15998 + }, + { + "epoch": 0.32, + "grad_norm": 1.859375, + "grad_norm_var": 0.011344146728515626, + "learning_rate": 0.0001, + "loss": 4.0965, + "loss/crossentropy": 2.0738461017608643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21532364934682846, + "step": 16000 + }, + { + "epoch": 0.32004, + "grad_norm": 1.984375, + "grad_norm_var": 0.011948394775390624, + "learning_rate": 0.0001, + "loss": 4.0687, + "loss/crossentropy": 2.3243744373321533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23544494807720184, + "step": 16002 + }, + { + "epoch": 0.32008, + "grad_norm": 2.046875, + "grad_norm_var": 0.009862263997395834, + "learning_rate": 0.0001, + "loss": 3.9637, + "loss/crossentropy": 1.7145931124687195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1816524863243103, + "step": 16004 + }, + { + "epoch": 0.32012, + "grad_norm": 2.0625, + "grad_norm_var": 0.010506184895833333, + "learning_rate": 0.0001, + "loss": 4.2317, + "loss/crossentropy": 2.369017481803894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21978877484798431, + "step": 16006 + }, + { + "epoch": 0.32016, + "grad_norm": 1.984375, + "grad_norm_var": 0.010276031494140626, + "learning_rate": 0.0001, + "loss": 4.2283, + "loss/crossentropy": 2.2246848344802856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21983492374420166, + "step": 16008 + }, + { + "epoch": 0.3202, + "grad_norm": 2.671875, + "grad_norm_var": 0.03774185180664062, + "learning_rate": 0.0001, + "loss": 4.4077, + "loss/crossentropy": 2.0235647559165955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22780708968639374, + "step": 16010 + }, + { + "epoch": 0.32024, + "grad_norm": 2.015625, + "grad_norm_var": 0.03724543253580729, + "learning_rate": 0.0001, + "loss": 4.1692, + "loss/crossentropy": 2.3185853958129883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22308620065450668, + "step": 16012 + }, + { + "epoch": 0.32028, + "grad_norm": 2.109375, + "grad_norm_var": 0.0337890625, + "learning_rate": 0.0001, + "loss": 4.1314, + "loss/crossentropy": 1.8504603505134583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1909484937787056, + "step": 16014 + }, + { + "epoch": 0.32032, + "grad_norm": 1.859375, + "grad_norm_var": 0.03502604166666667, + "learning_rate": 0.0001, + "loss": 3.8247, + "loss/crossentropy": 1.7095224857330322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18588167428970337, + "step": 16016 + }, + { + "epoch": 0.32036, + "grad_norm": 1.8515625, + "grad_norm_var": 0.03749974568684896, + "learning_rate": 0.0001, + "loss": 4.1015, + "loss/crossentropy": 1.8398523330688477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18085161596536636, + "step": 16018 + }, + { + "epoch": 0.3204, + "grad_norm": 1.921875, + "grad_norm_var": 0.03569717407226562, + "learning_rate": 0.0001, + "loss": 4.1463, + "loss/crossentropy": 2.297620415687561, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20937249064445496, + "step": 16020 + }, + { + "epoch": 0.32044, + "grad_norm": 1.984375, + "grad_norm_var": 0.03618545532226562, + "learning_rate": 0.0001, + "loss": 4.3518, + "loss/crossentropy": 1.9124106764793396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2072141021490097, + "step": 16022 + }, + { + "epoch": 0.32048, + "grad_norm": 1.921875, + "grad_norm_var": 0.03752415974934896, + "learning_rate": 0.0001, + "loss": 4.2695, + "loss/crossentropy": 2.119450092315674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20875975489616394, + "step": 16024 + }, + { + "epoch": 0.32052, + "grad_norm": 2.125, + "grad_norm_var": 0.008125559488932291, + "learning_rate": 0.0001, + "loss": 4.35, + "loss/crossentropy": 1.722611904144287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1751066967844963, + "step": 16026 + }, + { + "epoch": 0.32056, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008760579427083333, + "learning_rate": 0.0001, + "loss": 4.0689, + "loss/crossentropy": 2.246767997741699, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1999046877026558, + "step": 16028 + }, + { + "epoch": 0.3206, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0083404541015625, + "learning_rate": 0.0001, + "loss": 3.9949, + "loss/crossentropy": 2.0449349880218506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1969577819108963, + "step": 16030 + }, + { + "epoch": 0.32064, + "grad_norm": 2.140625, + "grad_norm_var": 0.010050201416015625, + "learning_rate": 0.0001, + "loss": 4.1257, + "loss/crossentropy": 2.125577926635742, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22252507507801056, + "step": 16032 + }, + { + "epoch": 0.32068, + "grad_norm": 1.984375, + "grad_norm_var": 0.009079742431640624, + "learning_rate": 0.0001, + "loss": 3.639, + "loss/crossentropy": 1.8161216974258423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17801867425441742, + "step": 16034 + }, + { + "epoch": 0.32072, + "grad_norm": 2.0, + "grad_norm_var": 0.009422810872395833, + "learning_rate": 0.0001, + "loss": 3.9475, + "loss/crossentropy": 2.0116894841194153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18890849500894547, + "step": 16036 + }, + { + "epoch": 0.32076, + "grad_norm": 1.875, + "grad_norm_var": 0.0100982666015625, + "learning_rate": 0.0001, + "loss": 3.8274, + "loss/crossentropy": 1.505588710308075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17151623219251633, + "step": 16038 + }, + { + "epoch": 0.3208, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008585357666015625, + "learning_rate": 0.0001, + "loss": 4.2162, + "loss/crossentropy": 2.1405014991760254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2036641389131546, + "step": 16040 + }, + { + "epoch": 0.32084, + "grad_norm": 1.953125, + "grad_norm_var": 0.006628163655598958, + "learning_rate": 0.0001, + "loss": 4.0743, + "loss/crossentropy": 2.1562809348106384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21301954984664917, + "step": 16042 + }, + { + "epoch": 0.32088, + "grad_norm": 1.8984375, + "grad_norm_var": 0.006628163655598958, + "learning_rate": 0.0001, + "loss": 3.9773, + "loss/crossentropy": 1.9317167401313782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19167031347751617, + "step": 16044 + }, + { + "epoch": 0.32092, + "grad_norm": 1.796875, + "grad_norm_var": 0.007330067952473958, + "learning_rate": 0.0001, + "loss": 4.0523, + "loss/crossentropy": 1.6257455348968506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1676744669675827, + "step": 16046 + }, + { + "epoch": 0.32096, + "grad_norm": 2.109375, + "grad_norm_var": 0.006640625, + "learning_rate": 0.0001, + "loss": 4.2454, + "loss/crossentropy": 2.0711347460746765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2268730029463768, + "step": 16048 + }, + { + "epoch": 0.321, + "grad_norm": 2.03125, + "grad_norm_var": 0.0071489969889322914, + "learning_rate": 0.0001, + "loss": 4.3333, + "loss/crossentropy": 2.173452377319336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1981828361749649, + "step": 16050 + }, + { + "epoch": 0.32104, + "grad_norm": 1.953125, + "grad_norm_var": 0.009593709309895834, + "learning_rate": 0.0001, + "loss": 4.4601, + "loss/crossentropy": 2.348356604576111, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2538425847887993, + "step": 16052 + }, + { + "epoch": 0.32108, + "grad_norm": 1.984375, + "grad_norm_var": 0.0089263916015625, + "learning_rate": 0.0001, + "loss": 4.3408, + "loss/crossentropy": 2.5441235303878784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22638535499572754, + "step": 16054 + }, + { + "epoch": 0.32112, + "grad_norm": 1.8515625, + "grad_norm_var": 0.0098876953125, + "learning_rate": 0.0001, + "loss": 4.08, + "loss/crossentropy": 2.129339337348938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20430771261453629, + "step": 16056 + }, + { + "epoch": 0.32116, + "grad_norm": 1.9296875, + "grad_norm_var": 0.01142578125, + "learning_rate": 0.0001, + "loss": 3.9048, + "loss/crossentropy": 2.1599501371383667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20718347281217575, + "step": 16058 + }, + { + "epoch": 0.3212, + "grad_norm": 1.984375, + "grad_norm_var": 0.010296376546223958, + "learning_rate": 0.0001, + "loss": 4.2746, + "loss/crossentropy": 2.2704890966415405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22025802731513977, + "step": 16060 + }, + { + "epoch": 0.32124, + "grad_norm": 1.796875, + "grad_norm_var": 0.011702473958333333, + "learning_rate": 0.0001, + "loss": 3.7282, + "loss/crossentropy": 1.7704021334648132, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17811128497123718, + "step": 16062 + }, + { + "epoch": 0.32128, + "grad_norm": 2.1875, + "grad_norm_var": 0.015705362955729166, + "learning_rate": 0.0001, + "loss": 4.2366, + "loss/crossentropy": 1.8030991554260254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20078134536743164, + "step": 16064 + }, + { + "epoch": 0.32132, + "grad_norm": 2.015625, + "grad_norm_var": 0.0161041259765625, + "learning_rate": 0.0001, + "loss": 3.9832, + "loss/crossentropy": 2.14484703540802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21000836789608002, + "step": 16066 + }, + { + "epoch": 0.32136, + "grad_norm": 2.203125, + "grad_norm_var": 0.016521962483723958, + "learning_rate": 0.0001, + "loss": 4.0812, + "loss/crossentropy": 2.1680833101272583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22287416458129883, + "step": 16068 + }, + { + "epoch": 0.3214, + "grad_norm": 2.078125, + "grad_norm_var": 0.01708958943684896, + "learning_rate": 0.0001, + "loss": 4.1355, + "loss/crossentropy": 1.9971441626548767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20709602534770966, + "step": 16070 + }, + { + "epoch": 0.32144, + "grad_norm": 2.09375, + "grad_norm_var": 0.016462198893229165, + "learning_rate": 0.0001, + "loss": 4.2735, + "loss/crossentropy": 2.180538833141327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2097538262605667, + "step": 16072 + }, + { + "epoch": 0.32148, + "grad_norm": 2.046875, + "grad_norm_var": 0.013529459635416666, + "learning_rate": 0.0001, + "loss": 4.0725, + "loss/crossentropy": 2.074296534061432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20961299538612366, + "step": 16074 + }, + { + "epoch": 0.32152, + "grad_norm": 1.9375, + "grad_norm_var": 0.014998372395833333, + "learning_rate": 0.0001, + "loss": 4.3, + "loss/crossentropy": 2.0936360359191895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18253444135189056, + "step": 16076 + }, + { + "epoch": 0.32156, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0121337890625, + "learning_rate": 0.0001, + "loss": 3.9992, + "loss/crossentropy": 2.1265164613723755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2127028927206993, + "step": 16078 + }, + { + "epoch": 0.3216, + "grad_norm": 2.0, + "grad_norm_var": 0.008410390218098958, + "learning_rate": 0.0001, + "loss": 4.1907, + "loss/crossentropy": 2.0804547667503357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23104286193847656, + "step": 16080 + }, + { + "epoch": 0.32164, + "grad_norm": 1.9921875, + "grad_norm_var": 0.007759602864583334, + "learning_rate": 0.0001, + "loss": 3.9653, + "loss/crossentropy": 1.9458459615707397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18746595829725266, + "step": 16082 + }, + { + "epoch": 0.32168, + "grad_norm": 1.96875, + "grad_norm_var": 0.0056793212890625, + "learning_rate": 0.0001, + "loss": 4.0581, + "loss/crossentropy": 2.186868667602539, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20350559055805206, + "step": 16084 + }, + { + "epoch": 0.32172, + "grad_norm": 1.953125, + "grad_norm_var": 0.007340494791666667, + "learning_rate": 0.0001, + "loss": 4.2364, + "loss/crossentropy": 1.5468108654022217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17710693180561066, + "step": 16086 + }, + { + "epoch": 0.32176, + "grad_norm": 2.0, + "grad_norm_var": 0.006180572509765625, + "learning_rate": 0.0001, + "loss": 4.2635, + "loss/crossentropy": 2.222296118736267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22016968578100204, + "step": 16088 + }, + { + "epoch": 0.3218, + "grad_norm": 1.984375, + "grad_norm_var": 0.005619049072265625, + "learning_rate": 0.0001, + "loss": 4.0659, + "loss/crossentropy": 1.7950996160507202, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1892552226781845, + "step": 16090 + }, + { + "epoch": 0.32184, + "grad_norm": 2.15625, + "grad_norm_var": 0.007120768229166667, + "learning_rate": 0.0001, + "loss": 4.2899, + "loss/crossentropy": 2.293117642402649, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2351122871041298, + "step": 16092 + }, + { + "epoch": 0.32188, + "grad_norm": 1.890625, + "grad_norm_var": 0.006208292643229167, + "learning_rate": 0.0001, + "loss": 3.8235, + "loss/crossentropy": 1.6602438688278198, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18052196502685547, + "step": 16094 + }, + { + "epoch": 0.32192, + "grad_norm": 2.140625, + "grad_norm_var": 0.008501942952473958, + "learning_rate": 0.0001, + "loss": 4.114, + "loss/crossentropy": 1.9335945844650269, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20585428178310394, + "step": 16096 + }, + { + "epoch": 0.32196, + "grad_norm": 1.9140625, + "grad_norm_var": 0.011378733317057292, + "learning_rate": 0.0001, + "loss": 4.3911, + "loss/crossentropy": 2.0107831358909607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19297368824481964, + "step": 16098 + }, + { + "epoch": 0.322, + "grad_norm": 2.0, + "grad_norm_var": 0.011801910400390626, + "learning_rate": 0.0001, + "loss": 4.0153, + "loss/crossentropy": 1.9804013967514038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17699319869279861, + "step": 16100 + }, + { + "epoch": 0.32204, + "grad_norm": 1.9375, + "grad_norm_var": 0.01048583984375, + "learning_rate": 0.0001, + "loss": 4.1069, + "loss/crossentropy": 1.9009913206100464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1931428462266922, + "step": 16102 + }, + { + "epoch": 0.32208, + "grad_norm": 2.140625, + "grad_norm_var": 0.011358388264973958, + "learning_rate": 0.0001, + "loss": 4.5134, + "loss/crossentropy": 1.9558063745498657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2139815017580986, + "step": 16104 + }, + { + "epoch": 0.32212, + "grad_norm": 2.1875, + "grad_norm_var": 0.013323720296223958, + "learning_rate": 0.0001, + "loss": 4.0989, + "loss/crossentropy": 2.0626373887062073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2022782266139984, + "step": 16106 + }, + { + "epoch": 0.32216, + "grad_norm": 1.8515625, + "grad_norm_var": 0.013814036051432292, + "learning_rate": 0.0001, + "loss": 3.9408, + "loss/crossentropy": 2.166835308074951, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22070696204900742, + "step": 16108 + }, + { + "epoch": 0.3222, + "grad_norm": 1.9453125, + "grad_norm_var": 0.012963612874348959, + "learning_rate": 0.0001, + "loss": 4.1382, + "loss/crossentropy": 2.2970025539398193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23291311413049698, + "step": 16110 + }, + { + "epoch": 0.32224, + "grad_norm": 1.921875, + "grad_norm_var": 0.012174224853515625, + "learning_rate": 0.0001, + "loss": 3.9577, + "loss/crossentropy": 1.9400970935821533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.199058435857296, + "step": 16112 + }, + { + "epoch": 0.32228, + "grad_norm": 1.9453125, + "grad_norm_var": 0.009110260009765624, + "learning_rate": 0.0001, + "loss": 3.9659, + "loss/crossentropy": 1.6467864513397217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19500216841697693, + "step": 16114 + }, + { + "epoch": 0.32232, + "grad_norm": 2.03125, + "grad_norm_var": 0.010546875, + "learning_rate": 0.0001, + "loss": 4.1791, + "loss/crossentropy": 2.477790355682373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22540342807769775, + "step": 16116 + }, + { + "epoch": 0.32236, + "grad_norm": 1.875, + "grad_norm_var": 0.011066691080729166, + "learning_rate": 0.0001, + "loss": 4.0661, + "loss/crossentropy": 1.946933627128601, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17866672575473785, + "step": 16118 + }, + { + "epoch": 0.3224, + "grad_norm": 1.859375, + "grad_norm_var": 0.01153564453125, + "learning_rate": 0.0001, + "loss": 4.1416, + "loss/crossentropy": 2.4484145641326904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21314629167318344, + "step": 16120 + }, + { + "epoch": 0.32244, + "grad_norm": 1.8984375, + "grad_norm_var": 0.009155019124348959, + "learning_rate": 0.0001, + "loss": 3.9754, + "loss/crossentropy": 1.9663755893707275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19814299046993256, + "step": 16122 + }, + { + "epoch": 0.32248, + "grad_norm": 2.125, + "grad_norm_var": 0.5887278238932292, + "learning_rate": 0.0001, + "loss": 3.7417, + "loss/crossentropy": 1.9786240458488464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19666218757629395, + "step": 16124 + }, + { + "epoch": 0.32252, + "grad_norm": 1.9453125, + "grad_norm_var": 0.5901079813639323, + "learning_rate": 0.0001, + "loss": 3.9504, + "loss/crossentropy": 1.90617835521698, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1803947538137436, + "step": 16126 + }, + { + "epoch": 0.32256, + "grad_norm": 2.03125, + "grad_norm_var": 0.5877764383951823, + "learning_rate": 0.0001, + "loss": 4.3267, + "loss/crossentropy": 2.1199004650115967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21409574151039124, + "step": 16128 + }, + { + "epoch": 0.3226, + "grad_norm": 1.8984375, + "grad_norm_var": 0.5875445048014323, + "learning_rate": 0.0001, + "loss": 4.184, + "loss/crossentropy": 2.081290364265442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20103489607572556, + "step": 16130 + }, + { + "epoch": 0.32264, + "grad_norm": 1.9375, + "grad_norm_var": 0.5902565002441407, + "learning_rate": 0.0001, + "loss": 4.216, + "loss/crossentropy": 1.9839922785758972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20679790526628494, + "step": 16132 + }, + { + "epoch": 0.32268, + "grad_norm": 2.078125, + "grad_norm_var": 0.5830800374348958, + "learning_rate": 0.0001, + "loss": 4.3441, + "loss/crossentropy": 2.2765486240386963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21012597531080246, + "step": 16134 + }, + { + "epoch": 0.32272, + "grad_norm": 2.125, + "grad_norm_var": 0.57388916015625, + "learning_rate": 0.0001, + "loss": 4.0003, + "loss/crossentropy": 1.8353520035743713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19367212802171707, + "step": 16136 + }, + { + "epoch": 0.32276, + "grad_norm": 1.8984375, + "grad_norm_var": 0.5781572977701823, + "learning_rate": 0.0001, + "loss": 3.8104, + "loss/crossentropy": 1.8009640574455261, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18030469119548798, + "step": 16138 + }, + { + "epoch": 0.3228, + "grad_norm": 2.140625, + "grad_norm_var": 0.020975494384765626, + "learning_rate": 0.0001, + "loss": 4.1914, + "loss/crossentropy": 1.865959644317627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20606407523155212, + "step": 16140 + }, + { + "epoch": 0.32284, + "grad_norm": 2.109375, + "grad_norm_var": 0.021445465087890626, + "learning_rate": 0.0001, + "loss": 4.0253, + "loss/crossentropy": 2.238165020942688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22822897136211395, + "step": 16142 + }, + { + "epoch": 0.32288, + "grad_norm": 1.90625, + "grad_norm_var": 0.02393366495768229, + "learning_rate": 0.0001, + "loss": 4.0425, + "loss/crossentropy": 2.175424814224243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2260037586092949, + "step": 16144 + }, + { + "epoch": 0.32292, + "grad_norm": 1.9453125, + "grad_norm_var": 0.025050608317057292, + "learning_rate": 0.0001, + "loss": 4.1493, + "loss/crossentropy": 2.2071722745895386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23026321083307266, + "step": 16146 + }, + { + "epoch": 0.32296, + "grad_norm": 2.109375, + "grad_norm_var": 0.024468739827473957, + "learning_rate": 0.0001, + "loss": 4.2384, + "loss/crossentropy": 2.211812973022461, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20882735401391983, + "step": 16148 + }, + { + "epoch": 0.323, + "grad_norm": 2.046875, + "grad_norm_var": 0.02444636027018229, + "learning_rate": 0.0001, + "loss": 4.0356, + "loss/crossentropy": 1.6503818035125732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1640765517950058, + "step": 16150 + }, + { + "epoch": 0.32304, + "grad_norm": 2.015625, + "grad_norm_var": 0.02520726521809896, + "learning_rate": 0.0001, + "loss": 4.0692, + "loss/crossentropy": 1.9449399709701538, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1876515969634056, + "step": 16152 + }, + { + "epoch": 0.32308, + "grad_norm": 1.8984375, + "grad_norm_var": 0.022581990559895834, + "learning_rate": 0.0001, + "loss": 4.1665, + "loss/crossentropy": 2.001387894153595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21191075444221497, + "step": 16154 + }, + { + "epoch": 0.32312, + "grad_norm": 2.234375, + "grad_norm_var": 0.0143218994140625, + "learning_rate": 0.0001, + "loss": 4.2115, + "loss/crossentropy": 2.1146541833877563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2692951038479805, + "step": 16156 + }, + { + "epoch": 0.32316, + "grad_norm": 1.9609375, + "grad_norm_var": 0.015602366129557291, + "learning_rate": 0.0001, + "loss": 4.1534, + "loss/crossentropy": 2.0765844583511353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19518104195594788, + "step": 16158 + }, + { + "epoch": 0.3232, + "grad_norm": 2.0, + "grad_norm_var": 0.0128814697265625, + "learning_rate": 0.0001, + "loss": 4.1432, + "loss/crossentropy": 2.5394046306610107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22651594877243042, + "step": 16160 + }, + { + "epoch": 0.32324, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011702473958333333, + "learning_rate": 0.0001, + "loss": 4.0412, + "loss/crossentropy": 2.0140721797943115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18245946615934372, + "step": 16162 + }, + { + "epoch": 0.32328, + "grad_norm": 1.9375, + "grad_norm_var": 0.012303670247395834, + "learning_rate": 0.0001, + "loss": 4.3029, + "loss/crossentropy": 2.472638249397278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23127944022417068, + "step": 16164 + }, + { + "epoch": 0.32332, + "grad_norm": 2.03125, + "grad_norm_var": 0.014924112955729167, + "learning_rate": 0.0001, + "loss": 4.1974, + "loss/crossentropy": 2.1551883220672607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2093031033873558, + "step": 16166 + }, + { + "epoch": 0.32336, + "grad_norm": 2.25, + "grad_norm_var": 0.01641845703125, + "learning_rate": 0.0001, + "loss": 4.1221, + "loss/crossentropy": 2.042950928211212, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2049141526222229, + "step": 16168 + }, + { + "epoch": 0.3234, + "grad_norm": 2.03125, + "grad_norm_var": 0.014849599202473958, + "learning_rate": 0.0001, + "loss": 4.4795, + "loss/crossentropy": 2.0262559056282043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18649922311306, + "step": 16170 + }, + { + "epoch": 0.32344, + "grad_norm": 1.984375, + "grad_norm_var": 0.011675771077473958, + "learning_rate": 0.0001, + "loss": 4.0477, + "loss/crossentropy": 2.3014276027679443, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21770215779542923, + "step": 16172 + }, + { + "epoch": 0.32348, + "grad_norm": 2.078125, + "grad_norm_var": 0.009349568684895834, + "learning_rate": 0.0001, + "loss": 4.2802, + "loss/crossentropy": 2.213370680809021, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18508044630289078, + "step": 16174 + }, + { + "epoch": 0.32352, + "grad_norm": 2.03125, + "grad_norm_var": 0.009075673421223958, + "learning_rate": 0.0001, + "loss": 4.2529, + "loss/crossentropy": 1.973215639591217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1937488242983818, + "step": 16176 + }, + { + "epoch": 0.32356, + "grad_norm": 2.09375, + "grad_norm_var": 0.018553670247395834, + "learning_rate": 0.0001, + "loss": 4.3534, + "loss/crossentropy": 2.1585731506347656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2029733881354332, + "step": 16178 + }, + { + "epoch": 0.3236, + "grad_norm": 2.359375, + "grad_norm_var": 0.021647135416666668, + "learning_rate": 0.0001, + "loss": 4.3324, + "loss/crossentropy": 2.3878796100616455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19985762238502502, + "step": 16180 + }, + { + "epoch": 0.32364, + "grad_norm": 1.84375, + "grad_norm_var": 0.027860514322916665, + "learning_rate": 0.0001, + "loss": 3.8043, + "loss/crossentropy": 2.07839834690094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1994347795844078, + "step": 16182 + }, + { + "epoch": 0.32368, + "grad_norm": 1.953125, + "grad_norm_var": 0.02607421875, + "learning_rate": 0.0001, + "loss": 4.3147, + "loss/crossentropy": 2.1800806522369385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21444538235664368, + "step": 16184 + }, + { + "epoch": 0.32372, + "grad_norm": 1.8984375, + "grad_norm_var": 0.029352823893229168, + "learning_rate": 0.0001, + "loss": 3.9927, + "loss/crossentropy": 2.0900917053222656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20476030558347702, + "step": 16186 + }, + { + "epoch": 0.32376, + "grad_norm": 2.0625, + "grad_norm_var": 0.02899169921875, + "learning_rate": 0.0001, + "loss": 4.209, + "loss/crossentropy": 2.1523303985595703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20984850078821182, + "step": 16188 + }, + { + "epoch": 0.3238, + "grad_norm": 1.765625, + "grad_norm_var": 0.03593648274739583, + "learning_rate": 0.0001, + "loss": 3.799, + "loss/crossentropy": 1.8808646202087402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18587414175271988, + "step": 16190 + }, + { + "epoch": 0.32384, + "grad_norm": 1.953125, + "grad_norm_var": 0.03593648274739583, + "learning_rate": 0.0001, + "loss": 4.1002, + "loss/crossentropy": 1.981968104839325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20611849427223206, + "step": 16192 + }, + { + "epoch": 0.32388, + "grad_norm": 2.03125, + "grad_norm_var": 0.020951334635416666, + "learning_rate": 0.0001, + "loss": 4.2811, + "loss/crossentropy": 2.1454352140426636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21696189790964127, + "step": 16194 + }, + { + "epoch": 0.32392, + "grad_norm": 2.078125, + "grad_norm_var": 0.011905924479166666, + "learning_rate": 0.0001, + "loss": 4.159, + "loss/crossentropy": 2.026822328567505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2112257331609726, + "step": 16196 + }, + { + "epoch": 0.32396, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009706370035807292, + "learning_rate": 0.0001, + "loss": 4.0838, + "loss/crossentropy": 2.332329034805298, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20129209011793137, + "step": 16198 + }, + { + "epoch": 0.324, + "grad_norm": 2.046875, + "grad_norm_var": 0.007260894775390625, + "learning_rate": 0.0001, + "loss": 4.11, + "loss/crossentropy": 2.271657705307007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2460309863090515, + "step": 16200 + }, + { + "epoch": 0.32404, + "grad_norm": 2.0, + "grad_norm_var": 0.0064084370930989586, + "learning_rate": 0.0001, + "loss": 4.0835, + "loss/crossentropy": 2.0890920162200928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2132192626595497, + "step": 16202 + }, + { + "epoch": 0.32408, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007226308186848958, + "learning_rate": 0.0001, + "loss": 4.0271, + "loss/crossentropy": 1.8368538618087769, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1891050636768341, + "step": 16204 + }, + { + "epoch": 0.32412, + "grad_norm": 2.0625, + "grad_norm_var": 0.004375966389973959, + "learning_rate": 0.0001, + "loss": 4.2859, + "loss/crossentropy": 2.3123772144317627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22660605609416962, + "step": 16206 + }, + { + "epoch": 0.32416, + "grad_norm": 2.125, + "grad_norm_var": 0.005411529541015625, + "learning_rate": 0.0001, + "loss": 4.0989, + "loss/crossentropy": 2.21794331073761, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21767260879278183, + "step": 16208 + }, + { + "epoch": 0.3242, + "grad_norm": 2.0625, + "grad_norm_var": 0.010591379801432292, + "learning_rate": 0.0001, + "loss": 4.5407, + "loss/crossentropy": 2.169134736061096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2172672376036644, + "step": 16210 + }, + { + "epoch": 0.32424, + "grad_norm": 1.921875, + "grad_norm_var": 0.011331939697265625, + "learning_rate": 0.0001, + "loss": 4.0327, + "loss/crossentropy": 2.2830699682235718, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22920189797878265, + "step": 16212 + }, + { + "epoch": 0.32428, + "grad_norm": 1.90625, + "grad_norm_var": 0.012770334879557291, + "learning_rate": 0.0001, + "loss": 3.9278, + "loss/crossentropy": 2.024571657180786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21096136420965195, + "step": 16214 + }, + { + "epoch": 0.32432, + "grad_norm": 1.9375, + "grad_norm_var": 0.013057200113932292, + "learning_rate": 0.0001, + "loss": 4.0244, + "loss/crossentropy": 2.060720980167389, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21043655276298523, + "step": 16216 + }, + { + "epoch": 0.32436, + "grad_norm": 2.078125, + "grad_norm_var": 0.014134724934895834, + "learning_rate": 0.0001, + "loss": 4.074, + "loss/crossentropy": 2.0332056283950806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19506454467773438, + "step": 16218 + }, + { + "epoch": 0.3244, + "grad_norm": 1.8515625, + "grad_norm_var": 0.014021555582682291, + "learning_rate": 0.0001, + "loss": 3.8173, + "loss/crossentropy": 1.811396062374115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19359800219535828, + "step": 16220 + }, + { + "epoch": 0.32444, + "grad_norm": 1.9453125, + "grad_norm_var": 0.013117472330729166, + "learning_rate": 0.0001, + "loss": 4.1846, + "loss/crossentropy": 2.074379801750183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21763048321008682, + "step": 16222 + }, + { + "epoch": 0.32448, + "grad_norm": 2.203125, + "grad_norm_var": 0.014682769775390625, + "learning_rate": 0.0001, + "loss": 4.1984, + "loss/crossentropy": 2.1836538314819336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20814575254917145, + "step": 16224 + }, + { + "epoch": 0.32452, + "grad_norm": 1.953125, + "grad_norm_var": 0.010794830322265626, + "learning_rate": 0.0001, + "loss": 4.2866, + "loss/crossentropy": 2.365849494934082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24219170212745667, + "step": 16226 + }, + { + "epoch": 0.32456, + "grad_norm": 2.0625, + "grad_norm_var": 0.010343424479166667, + "learning_rate": 0.0001, + "loss": 4.1976, + "loss/crossentropy": 1.8738153576850891, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22152718901634216, + "step": 16228 + }, + { + "epoch": 0.3246, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010209147135416667, + "learning_rate": 0.0001, + "loss": 4.1458, + "loss/crossentropy": 2.2568124532699585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21668314933776855, + "step": 16230 + }, + { + "epoch": 0.32464, + "grad_norm": 2.109375, + "grad_norm_var": 0.010900624593098958, + "learning_rate": 0.0001, + "loss": 4.2889, + "loss/crossentropy": 2.4037723541259766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2225075364112854, + "step": 16232 + }, + { + "epoch": 0.32468, + "grad_norm": 1.765625, + "grad_norm_var": 0.01375732421875, + "learning_rate": 0.0001, + "loss": 3.7288, + "loss/crossentropy": 2.2305864095687866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19381006807088852, + "step": 16234 + }, + { + "epoch": 0.32472, + "grad_norm": 2.03125, + "grad_norm_var": 0.013862864176432291, + "learning_rate": 0.0001, + "loss": 4.186, + "loss/crossentropy": 2.1115931272506714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20033665746450424, + "step": 16236 + }, + { + "epoch": 0.32476, + "grad_norm": 1.9765625, + "grad_norm_var": 0.014054361979166667, + "learning_rate": 0.0001, + "loss": 4.1602, + "loss/crossentropy": 2.0450429916381836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2080780565738678, + "step": 16238 + }, + { + "epoch": 0.3248, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011888631184895833, + "learning_rate": 0.0001, + "loss": 4.056, + "loss/crossentropy": 2.003769636154175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20560404658317566, + "step": 16240 + }, + { + "epoch": 0.32484, + "grad_norm": 2.125, + "grad_norm_var": 0.010383860270182291, + "learning_rate": 0.0001, + "loss": 4.3504, + "loss/crossentropy": 1.9452934265136719, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2063707485795021, + "step": 16242 + }, + { + "epoch": 0.32488, + "grad_norm": 1.7734375, + "grad_norm_var": 0.012497711181640624, + "learning_rate": 0.0001, + "loss": 3.9172, + "loss/crossentropy": 2.1412742137908936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20292682200670242, + "step": 16244 + }, + { + "epoch": 0.32492, + "grad_norm": 1.96875, + "grad_norm_var": 0.011140950520833333, + "learning_rate": 0.0001, + "loss": 4.3025, + "loss/crossentropy": 2.2285404205322266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22571790218353271, + "step": 16246 + }, + { + "epoch": 0.32496, + "grad_norm": 1.8046875, + "grad_norm_var": 0.010713704427083333, + "learning_rate": 0.0001, + "loss": 3.6582, + "loss/crossentropy": 1.82416570186615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1771545708179474, + "step": 16248 + }, + { + "epoch": 0.325, + "grad_norm": 2.109375, + "grad_norm_var": 0.010422515869140624, + "learning_rate": 0.0001, + "loss": 4.2533, + "loss/crossentropy": 2.1432559490203857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20037438720464706, + "step": 16250 + }, + { + "epoch": 0.32504, + "grad_norm": 2.234375, + "grad_norm_var": 0.015337880452473958, + "learning_rate": 0.0001, + "loss": 4.162, + "loss/crossentropy": 2.1962517499923706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22743911296129227, + "step": 16252 + }, + { + "epoch": 0.32508, + "grad_norm": 2.125, + "grad_norm_var": 0.01636530558268229, + "learning_rate": 0.0001, + "loss": 4.2223, + "loss/crossentropy": 1.932455599308014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2043299823999405, + "step": 16254 + }, + { + "epoch": 0.32512, + "grad_norm": 1.9453125, + "grad_norm_var": 0.015824127197265624, + "learning_rate": 0.0001, + "loss": 3.9024, + "loss/crossentropy": 1.9702956676483154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19528628885746002, + "step": 16256 + }, + { + "epoch": 0.32516, + "grad_norm": 2.296875, + "grad_norm_var": 0.020475260416666665, + "learning_rate": 0.0001, + "loss": 4.0628, + "loss/crossentropy": 2.0042436718940735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19861597567796707, + "step": 16258 + }, + { + "epoch": 0.3252, + "grad_norm": 2.28125, + "grad_norm_var": 0.019573720296223958, + "learning_rate": 0.0001, + "loss": 4.1876, + "loss/crossentropy": 2.3544296622276306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22385136783123016, + "step": 16260 + }, + { + "epoch": 0.32524, + "grad_norm": 1.984375, + "grad_norm_var": 0.01942723592122396, + "learning_rate": 0.0001, + "loss": 4.1435, + "loss/crossentropy": 2.1482596397399902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21662700176239014, + "step": 16262 + }, + { + "epoch": 0.32528, + "grad_norm": 1.96875, + "grad_norm_var": 0.01395263671875, + "learning_rate": 0.0001, + "loss": 4.3441, + "loss/crossentropy": 2.243022322654724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22729884833097458, + "step": 16264 + }, + { + "epoch": 0.32532, + "grad_norm": 1.875, + "grad_norm_var": 0.018656158447265626, + "learning_rate": 0.0001, + "loss": 3.9013, + "loss/crossentropy": 2.2499197721481323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20558901876211166, + "step": 16266 + }, + { + "epoch": 0.32536, + "grad_norm": 1.9140625, + "grad_norm_var": 0.022907511393229166, + "learning_rate": 0.0001, + "loss": 4.2313, + "loss/crossentropy": 1.6232356429100037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18864237517118454, + "step": 16268 + }, + { + "epoch": 0.3254, + "grad_norm": 1.96875, + "grad_norm_var": 0.02402521769205729, + "learning_rate": 0.0001, + "loss": 4.0435, + "loss/crossentropy": 1.559517502784729, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1704516038298607, + "step": 16270 + }, + { + "epoch": 0.32544, + "grad_norm": 1.890625, + "grad_norm_var": 0.02484308878580729, + "learning_rate": 0.0001, + "loss": 3.9018, + "loss/crossentropy": 1.7346046566963196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17380736768245697, + "step": 16272 + }, + { + "epoch": 0.32548, + "grad_norm": 2.21875, + "grad_norm_var": 0.02237523396809896, + "learning_rate": 0.0001, + "loss": 4.2894, + "loss/crossentropy": 2.1451315879821777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21778792887926102, + "step": 16274 + }, + { + "epoch": 0.32552, + "grad_norm": 2.25, + "grad_norm_var": 0.022684733072916668, + "learning_rate": 0.0001, + "loss": 3.8737, + "loss/crossentropy": 1.8013367056846619, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18631768971681595, + "step": 16276 + }, + { + "epoch": 0.32556, + "grad_norm": 1.9609375, + "grad_norm_var": 0.025655110677083332, + "learning_rate": 0.0001, + "loss": 3.8573, + "loss/crossentropy": 1.8858280181884766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1940329447388649, + "step": 16278 + }, + { + "epoch": 0.3256, + "grad_norm": 1.9375, + "grad_norm_var": 0.0259429931640625, + "learning_rate": 0.0001, + "loss": 4.1268, + "loss/crossentropy": 1.8475046157836914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18596403300762177, + "step": 16280 + }, + { + "epoch": 0.32564, + "grad_norm": 2.0, + "grad_norm_var": 0.023636881510416666, + "learning_rate": 0.0001, + "loss": 4.2602, + "loss/crossentropy": 2.2585566639900208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22430463135242462, + "step": 16282 + }, + { + "epoch": 0.32568, + "grad_norm": 1.8984375, + "grad_norm_var": 0.015900675455729166, + "learning_rate": 0.0001, + "loss": 3.7224, + "loss/crossentropy": 1.7233783602714539, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17318468540906906, + "step": 16284 + }, + { + "epoch": 0.32572, + "grad_norm": 1.9921875, + "grad_norm_var": 0.022172037760416666, + "learning_rate": 0.0001, + "loss": 4.3042, + "loss/crossentropy": 2.174088716506958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21329404413700104, + "step": 16286 + }, + { + "epoch": 0.32576, + "grad_norm": 2.015625, + "grad_norm_var": 0.022345987955729167, + "learning_rate": 0.0001, + "loss": 3.8932, + "loss/crossentropy": 1.8016705513000488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20395102351903915, + "step": 16288 + }, + { + "epoch": 0.3258, + "grad_norm": 2.03125, + "grad_norm_var": 0.020182037353515626, + "learning_rate": 0.0001, + "loss": 3.8634, + "loss/crossentropy": 2.166901111602783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21237251162528992, + "step": 16290 + }, + { + "epoch": 0.32584, + "grad_norm": 1.9921875, + "grad_norm_var": 0.014766184488932292, + "learning_rate": 0.0001, + "loss": 4.07, + "loss/crossentropy": 2.024773359298706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1973796784877777, + "step": 16292 + }, + { + "epoch": 0.32588, + "grad_norm": 2.203125, + "grad_norm_var": 0.015466054280598959, + "learning_rate": 0.0001, + "loss": 4.173, + "loss/crossentropy": 2.457033157348633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21896487474441528, + "step": 16294 + }, + { + "epoch": 0.32592, + "grad_norm": 2.03125, + "grad_norm_var": 0.014180501302083334, + "learning_rate": 0.0001, + "loss": 4.11, + "loss/crossentropy": 2.4872595071792603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2348102256655693, + "step": 16296 + }, + { + "epoch": 0.32596, + "grad_norm": 2.03125, + "grad_norm_var": 0.014249420166015625, + "learning_rate": 0.0001, + "loss": 4.0756, + "loss/crossentropy": 2.110987067222595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21079359203577042, + "step": 16298 + }, + { + "epoch": 0.326, + "grad_norm": 1.9375, + "grad_norm_var": 0.0133941650390625, + "learning_rate": 0.0001, + "loss": 4.2334, + "loss/crossentropy": 2.2166510820388794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19656993448734283, + "step": 16300 + }, + { + "epoch": 0.32604, + "grad_norm": 2.046875, + "grad_norm_var": 0.008125813802083333, + "learning_rate": 0.0001, + "loss": 4.0212, + "loss/crossentropy": 2.014153838157654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1956234946846962, + "step": 16302 + }, + { + "epoch": 0.32608, + "grad_norm": 2.671875, + "grad_norm_var": 0.03588231404622396, + "learning_rate": 0.0001, + "loss": 4.3045, + "loss/crossentropy": 2.298485517501831, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21710477769374847, + "step": 16304 + }, + { + "epoch": 0.32612, + "grad_norm": 1.9921875, + "grad_norm_var": 0.033599599202473955, + "learning_rate": 0.0001, + "loss": 4.3134, + "loss/crossentropy": 2.233761191368103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21609390527009964, + "step": 16306 + }, + { + "epoch": 0.32616, + "grad_norm": 1.8515625, + "grad_norm_var": 0.03675918579101563, + "learning_rate": 0.0001, + "loss": 3.9141, + "loss/crossentropy": 1.9988956451416016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1924634352326393, + "step": 16308 + }, + { + "epoch": 0.3262, + "grad_norm": 1.96875, + "grad_norm_var": 0.035194651285807295, + "learning_rate": 0.0001, + "loss": 3.9726, + "loss/crossentropy": 1.9185590147972107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20295168459415436, + "step": 16310 + }, + { + "epoch": 0.32624, + "grad_norm": 2.0625, + "grad_norm_var": 0.03476155598958333, + "learning_rate": 0.0001, + "loss": 4.0805, + "loss/crossentropy": 2.219870448112488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21425887942314148, + "step": 16312 + }, + { + "epoch": 0.32628, + "grad_norm": 2.21875, + "grad_norm_var": 0.03612442016601562, + "learning_rate": 0.0001, + "loss": 4.5119, + "loss/crossentropy": 2.219307541847229, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22380203753709793, + "step": 16314 + }, + { + "epoch": 0.32632, + "grad_norm": 1.875, + "grad_norm_var": 0.03746515909830729, + "learning_rate": 0.0001, + "loss": 3.9652, + "loss/crossentropy": 1.7907224893569946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18037232011556625, + "step": 16316 + }, + { + "epoch": 0.32636, + "grad_norm": 1.8203125, + "grad_norm_var": 0.03876927693684896, + "learning_rate": 0.0001, + "loss": 4.1337, + "loss/crossentropy": 1.9702708721160889, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18266448378562927, + "step": 16318 + }, + { + "epoch": 0.3264, + "grad_norm": 1.90625, + "grad_norm_var": 0.012481435139973959, + "learning_rate": 0.0001, + "loss": 4.0551, + "loss/crossentropy": 2.1748844385147095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20725325495004654, + "step": 16320 + }, + { + "epoch": 0.32644, + "grad_norm": 1.953125, + "grad_norm_var": 0.0119781494140625, + "learning_rate": 0.0001, + "loss": 4.2882, + "loss/crossentropy": 2.2458006143569946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21462788432836533, + "step": 16322 + }, + { + "epoch": 0.32648, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009276326497395833, + "learning_rate": 0.0001, + "loss": 4.1231, + "loss/crossentropy": 1.923665463924408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.223393976688385, + "step": 16324 + }, + { + "epoch": 0.32652, + "grad_norm": 1.9296875, + "grad_norm_var": 0.010129547119140625, + "learning_rate": 0.0001, + "loss": 4.1643, + "loss/crossentropy": 2.36092209815979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22316965460777283, + "step": 16326 + }, + { + "epoch": 0.32656, + "grad_norm": 2.046875, + "grad_norm_var": 0.010456339518229166, + "learning_rate": 0.0001, + "loss": 4.0778, + "loss/crossentropy": 1.7765440344810486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1736016422510147, + "step": 16328 + }, + { + "epoch": 0.3266, + "grad_norm": 1.9609375, + "grad_norm_var": 0.006638336181640625, + "learning_rate": 0.0001, + "loss": 4.4885, + "loss/crossentropy": 2.2588138580322266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21762139350175858, + "step": 16330 + }, + { + "epoch": 0.32664, + "grad_norm": 1.921875, + "grad_norm_var": 0.0064084370930989586, + "learning_rate": 0.0001, + "loss": 4.2476, + "loss/crossentropy": 2.177221417427063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19594799727201462, + "step": 16332 + }, + { + "epoch": 0.32668, + "grad_norm": 2.0625, + "grad_norm_var": 0.0051422119140625, + "learning_rate": 0.0001, + "loss": 3.941, + "loss/crossentropy": 1.8424673676490784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19562739878892899, + "step": 16334 + }, + { + "epoch": 0.32672, + "grad_norm": 1.8515625, + "grad_norm_var": 0.006473541259765625, + "learning_rate": 0.0001, + "loss": 3.861, + "loss/crossentropy": 1.8677524328231812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19708558171987534, + "step": 16336 + }, + { + "epoch": 0.32676, + "grad_norm": 1.859375, + "grad_norm_var": 0.007081858317057292, + "learning_rate": 0.0001, + "loss": 3.9222, + "loss/crossentropy": 1.9559081196784973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19585052132606506, + "step": 16338 + }, + { + "epoch": 0.3268, + "grad_norm": 1.9765625, + "grad_norm_var": 0.006892649332682291, + "learning_rate": 0.0001, + "loss": 4.144, + "loss/crossentropy": 1.979565978050232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2045053392648697, + "step": 16340 + }, + { + "epoch": 0.32684, + "grad_norm": 1.953125, + "grad_norm_var": 0.00714111328125, + "learning_rate": 0.0001, + "loss": 4.2517, + "loss/crossentropy": 2.296669840812683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.207671657204628, + "step": 16342 + }, + { + "epoch": 0.32688, + "grad_norm": 2.0, + "grad_norm_var": 0.0065305074055989586, + "learning_rate": 0.0001, + "loss": 4.0279, + "loss/crossentropy": 1.9962339401245117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19833384454250336, + "step": 16344 + }, + { + "epoch": 0.32692, + "grad_norm": 2.015625, + "grad_norm_var": 0.006078084309895833, + "learning_rate": 0.0001, + "loss": 4.1779, + "loss/crossentropy": 2.020018517971039, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19721734523773193, + "step": 16346 + }, + { + "epoch": 0.32696, + "grad_norm": 1.8828125, + "grad_norm_var": 0.006815592447916667, + "learning_rate": 0.0001, + "loss": 3.7614, + "loss/crossentropy": 2.112026810646057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20460353046655655, + "step": 16348 + }, + { + "epoch": 0.327, + "grad_norm": 1.9453125, + "grad_norm_var": 0.006540679931640625, + "learning_rate": 0.0001, + "loss": 4.1509, + "loss/crossentropy": 2.2069387435913086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2233232483267784, + "step": 16350 + }, + { + "epoch": 0.32704, + "grad_norm": 1.9375, + "grad_norm_var": 0.004792277018229167, + "learning_rate": 0.0001, + "loss": 3.8675, + "loss/crossentropy": 1.9365113377571106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19625376164913177, + "step": 16352 + }, + { + "epoch": 0.32708, + "grad_norm": 2.125, + "grad_norm_var": 0.0052154541015625, + "learning_rate": 0.0001, + "loss": 4.1831, + "loss/crossentropy": 2.2316179871559143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22712747752666473, + "step": 16354 + }, + { + "epoch": 0.32712, + "grad_norm": 2.0, + "grad_norm_var": 0.035672760009765624, + "learning_rate": 0.0001, + "loss": 3.9148, + "loss/crossentropy": 1.801742434501648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1831187978386879, + "step": 16356 + }, + { + "epoch": 0.32716, + "grad_norm": 1.9921875, + "grad_norm_var": 0.03590087890625, + "learning_rate": 0.0001, + "loss": 3.9819, + "loss/crossentropy": 2.157936453819275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21520215272903442, + "step": 16358 + }, + { + "epoch": 0.3272, + "grad_norm": 2.03125, + "grad_norm_var": 0.035835520426432295, + "learning_rate": 0.0001, + "loss": 4.0743, + "loss/crossentropy": 1.6054654717445374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1769917830824852, + "step": 16360 + }, + { + "epoch": 0.32724, + "grad_norm": 2.15625, + "grad_norm_var": 0.039613596598307294, + "learning_rate": 0.0001, + "loss": 4.0751, + "loss/crossentropy": 2.0573782324790955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22550390660762787, + "step": 16362 + }, + { + "epoch": 0.32728, + "grad_norm": 2.03125, + "grad_norm_var": 0.035131581624348956, + "learning_rate": 0.0001, + "loss": 4.281, + "loss/crossentropy": 2.0643117427825928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21060281991958618, + "step": 16364 + }, + { + "epoch": 0.32732, + "grad_norm": 1.8984375, + "grad_norm_var": 0.042525227864583334, + "learning_rate": 0.0001, + "loss": 4.0884, + "loss/crossentropy": 2.1980225443840027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19246292114257812, + "step": 16366 + }, + { + "epoch": 0.32736, + "grad_norm": 2.078125, + "grad_norm_var": 0.042577107747395836, + "learning_rate": 0.0001, + "loss": 4.05, + "loss/crossentropy": 1.8917307257652283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17911095917224884, + "step": 16368 + }, + { + "epoch": 0.3274, + "grad_norm": 2.15625, + "grad_norm_var": 0.0429595947265625, + "learning_rate": 0.0001, + "loss": 4.1415, + "loss/crossentropy": 2.0520957708358765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19265015423297882, + "step": 16370 + }, + { + "epoch": 0.32744, + "grad_norm": 2.28125, + "grad_norm_var": 0.019510904947916668, + "learning_rate": 0.0001, + "loss": 4.3026, + "loss/crossentropy": 2.4500341415405273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21713324636220932, + "step": 16372 + }, + { + "epoch": 0.32748, + "grad_norm": 1.8828125, + "grad_norm_var": 0.02032648722330729, + "learning_rate": 0.0001, + "loss": 4.0754, + "loss/crossentropy": 1.8100037574768066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16967355459928513, + "step": 16374 + }, + { + "epoch": 0.32752, + "grad_norm": 1.9375, + "grad_norm_var": 0.021174875895182292, + "learning_rate": 0.0001, + "loss": 4.0244, + "loss/crossentropy": 2.092597723007202, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1998964473605156, + "step": 16376 + }, + { + "epoch": 0.32756, + "grad_norm": 2.03125, + "grad_norm_var": 0.016076405843098957, + "learning_rate": 0.0001, + "loss": 4.3627, + "loss/crossentropy": 2.151292622089386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19120946526527405, + "step": 16378 + }, + { + "epoch": 0.3276, + "grad_norm": 1.796875, + "grad_norm_var": 0.020906575520833335, + "learning_rate": 0.0001, + "loss": 3.6949, + "loss/crossentropy": 1.7384315729141235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18261852860450745, + "step": 16380 + }, + { + "epoch": 0.32764, + "grad_norm": 1.953125, + "grad_norm_var": 0.019147745768229165, + "learning_rate": 0.0001, + "loss": 3.9341, + "loss/crossentropy": 1.662436068058014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16703163087368011, + "step": 16382 + }, + { + "epoch": 0.32768, + "grad_norm": 1.984375, + "grad_norm_var": 0.018871053059895834, + "learning_rate": 0.0001, + "loss": 4.3202, + "loss/crossentropy": 2.2057151794433594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20989537239074707, + "step": 16384 + }, + { + "epoch": 0.32772, + "grad_norm": 2.03125, + "grad_norm_var": 0.068359375, + "learning_rate": 0.0001, + "loss": 4.3033, + "loss/crossentropy": 2.1689382791519165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20804189145565033, + "step": 16386 + }, + { + "epoch": 0.32776, + "grad_norm": 2.171875, + "grad_norm_var": 0.06450169881184896, + "learning_rate": 0.0001, + "loss": 4.2241, + "loss/crossentropy": 2.3121412992477417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20552029460668564, + "step": 16388 + }, + { + "epoch": 0.3278, + "grad_norm": 2.046875, + "grad_norm_var": 0.06318333943684896, + "learning_rate": 0.0001, + "loss": 4.1396, + "loss/crossentropy": 2.0784353017807007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20427466928958893, + "step": 16390 + }, + { + "epoch": 0.32784, + "grad_norm": 1.9609375, + "grad_norm_var": 0.06282145182291667, + "learning_rate": 0.0001, + "loss": 4.1132, + "loss/crossentropy": 2.147130608558655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2181989550590515, + "step": 16392 + }, + { + "epoch": 0.32788, + "grad_norm": 1.9296875, + "grad_norm_var": 0.06468505859375, + "learning_rate": 0.0001, + "loss": 3.812, + "loss/crossentropy": 1.7986091375350952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18294771760702133, + "step": 16394 + }, + { + "epoch": 0.32792, + "grad_norm": 1.9140625, + "grad_norm_var": 0.059004720052083334, + "learning_rate": 0.0001, + "loss": 4.037, + "loss/crossentropy": 2.0563793778419495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19180169701576233, + "step": 16396 + }, + { + "epoch": 0.32796, + "grad_norm": 2.015625, + "grad_norm_var": 0.05506591796875, + "learning_rate": 0.0001, + "loss": 4.3111, + "loss/crossentropy": 2.462133765220642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21563740074634552, + "step": 16398 + }, + { + "epoch": 0.328, + "grad_norm": 1.9375, + "grad_norm_var": 0.059427897135416664, + "learning_rate": 0.0001, + "loss": 3.9743, + "loss/crossentropy": 1.8900938630104065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1845797374844551, + "step": 16400 + }, + { + "epoch": 0.32804, + "grad_norm": 2.375, + "grad_norm_var": 0.0195068359375, + "learning_rate": 0.0001, + "loss": 4.1653, + "loss/crossentropy": 2.0627527832984924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20268447697162628, + "step": 16402 + }, + { + "epoch": 0.32808, + "grad_norm": 2.125, + "grad_norm_var": 0.030147043863932292, + "learning_rate": 0.0001, + "loss": 4.3042, + "loss/crossentropy": 2.058929443359375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19984640181064606, + "step": 16404 + }, + { + "epoch": 0.32812, + "grad_norm": 1.9296875, + "grad_norm_var": 0.030564117431640624, + "learning_rate": 0.0001, + "loss": 4.1162, + "loss/crossentropy": 1.816510558128357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1754368245601654, + "step": 16406 + }, + { + "epoch": 0.32816, + "grad_norm": 2.015625, + "grad_norm_var": 0.0302886962890625, + "learning_rate": 0.0001, + "loss": 4.263, + "loss/crossentropy": 1.9079806208610535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18917685747146606, + "step": 16408 + }, + { + "epoch": 0.3282, + "grad_norm": 1.9765625, + "grad_norm_var": 0.028612263997395835, + "learning_rate": 0.0001, + "loss": 3.9264, + "loss/crossentropy": 2.0913302898406982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2088220864534378, + "step": 16410 + }, + { + "epoch": 0.32824, + "grad_norm": 2.09375, + "grad_norm_var": 0.028547159830729165, + "learning_rate": 0.0001, + "loss": 3.9029, + "loss/crossentropy": 2.1609301567077637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2010297030210495, + "step": 16412 + }, + { + "epoch": 0.32828, + "grad_norm": 3.078125, + "grad_norm_var": 0.0982666015625, + "learning_rate": 0.0001, + "loss": 4.2563, + "loss/crossentropy": 1.8449677228927612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23620254546403885, + "step": 16414 + }, + { + "epoch": 0.32832, + "grad_norm": 2.078125, + "grad_norm_var": 0.09134699503580729, + "learning_rate": 0.0001, + "loss": 4.1053, + "loss/crossentropy": 2.059410274028778, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20259736478328705, + "step": 16416 + }, + { + "epoch": 0.32836, + "grad_norm": 1.8828125, + "grad_norm_var": 0.08761571248372396, + "learning_rate": 0.0001, + "loss": 3.8718, + "loss/crossentropy": 1.6095005869865417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17026344686746597, + "step": 16418 + }, + { + "epoch": 0.3284, + "grad_norm": 2.03125, + "grad_norm_var": 0.07932510375976562, + "learning_rate": 0.0001, + "loss": 4.2623, + "loss/crossentropy": 2.200170636177063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2259170413017273, + "step": 16420 + }, + { + "epoch": 0.32844, + "grad_norm": 1.921875, + "grad_norm_var": 0.08116226196289063, + "learning_rate": 0.0001, + "loss": 4.0724, + "loss/crossentropy": 1.970030963420868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19053932279348373, + "step": 16422 + }, + { + "epoch": 0.32848, + "grad_norm": 1.890625, + "grad_norm_var": 0.08263753255208334, + "learning_rate": 0.0001, + "loss": 4.106, + "loss/crossentropy": 2.2675901651382446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22277145832777023, + "step": 16424 + }, + { + "epoch": 0.32852, + "grad_norm": 1.78125, + "grad_norm_var": 0.08910725911458334, + "learning_rate": 0.0001, + "loss": 3.723, + "loss/crossentropy": 1.6446372866630554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15234287828207016, + "step": 16426 + }, + { + "epoch": 0.32856, + "grad_norm": 1.984375, + "grad_norm_var": 0.0875689188639323, + "learning_rate": 0.0001, + "loss": 4.3421, + "loss/crossentropy": 2.350424647331238, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2190476581454277, + "step": 16428 + }, + { + "epoch": 0.3286, + "grad_norm": 2.046875, + "grad_norm_var": 0.009279123942057292, + "learning_rate": 0.0001, + "loss": 3.9663, + "loss/crossentropy": 1.8809763193130493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19404269009828568, + "step": 16430 + }, + { + "epoch": 0.32864, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008257802327473958, + "learning_rate": 0.0001, + "loss": 4.1731, + "loss/crossentropy": 2.1159849166870117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21232837438583374, + "step": 16432 + }, + { + "epoch": 0.32868, + "grad_norm": 2.421875, + "grad_norm_var": 0.02133763631184896, + "learning_rate": 0.0001, + "loss": 4.2875, + "loss/crossentropy": 2.3252066373825073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24867044389247894, + "step": 16434 + }, + { + "epoch": 0.32872, + "grad_norm": 1.9140625, + "grad_norm_var": 0.020186360677083334, + "learning_rate": 0.0001, + "loss": 4.2462, + "loss/crossentropy": 2.0642696619033813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18409788608551025, + "step": 16436 + }, + { + "epoch": 0.32876, + "grad_norm": 2.203125, + "grad_norm_var": 0.022725423177083332, + "learning_rate": 0.0001, + "loss": 4.3832, + "loss/crossentropy": 2.359580874443054, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.218842551112175, + "step": 16438 + }, + { + "epoch": 0.3288, + "grad_norm": 1.8984375, + "grad_norm_var": 0.02264378865559896, + "learning_rate": 0.0001, + "loss": 3.7996, + "loss/crossentropy": 2.0959436893463135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1993689462542534, + "step": 16440 + }, + { + "epoch": 0.32884, + "grad_norm": 1.96875, + "grad_norm_var": 0.017679595947265626, + "learning_rate": 0.0001, + "loss": 4.0838, + "loss/crossentropy": 2.1507667303085327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20800111442804337, + "step": 16442 + }, + { + "epoch": 0.32888, + "grad_norm": 1.875, + "grad_norm_var": 0.020005035400390624, + "learning_rate": 0.0001, + "loss": 3.9428, + "loss/crossentropy": 1.8771533370018005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20476838946342468, + "step": 16444 + }, + { + "epoch": 0.32892, + "grad_norm": 2.046875, + "grad_norm_var": 0.020005035400390624, + "learning_rate": 0.0001, + "loss": 4.2689, + "loss/crossentropy": 2.365285038948059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22512932121753693, + "step": 16446 + }, + { + "epoch": 0.32896, + "grad_norm": 2.0625, + "grad_norm_var": 0.019701131184895835, + "learning_rate": 0.0001, + "loss": 4.2928, + "loss/crossentropy": 2.2584418058395386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21187593042850494, + "step": 16448 + }, + { + "epoch": 0.329, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010270182291666667, + "learning_rate": 0.0001, + "loss": 3.8186, + "loss/crossentropy": 1.9227403402328491, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1945207566022873, + "step": 16450 + }, + { + "epoch": 0.32904, + "grad_norm": 1.9296875, + "grad_norm_var": 0.011897786458333334, + "learning_rate": 0.0001, + "loss": 3.8512, + "loss/crossentropy": 2.062865734100342, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2064652517437935, + "step": 16452 + }, + { + "epoch": 0.32908, + "grad_norm": 1.9140625, + "grad_norm_var": 0.007505035400390625, + "learning_rate": 0.0001, + "loss": 4.0493, + "loss/crossentropy": 1.98399817943573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22567245364189148, + "step": 16454 + }, + { + "epoch": 0.32912, + "grad_norm": 2.0, + "grad_norm_var": 0.007428995768229167, + "learning_rate": 0.0001, + "loss": 4.2888, + "loss/crossentropy": 2.0655510425567627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20936425030231476, + "step": 16456 + }, + { + "epoch": 0.32916, + "grad_norm": 2.140625, + "grad_norm_var": 0.01739501953125, + "learning_rate": 0.0001, + "loss": 4.1486, + "loss/crossentropy": 1.8360095024108887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18788952380418777, + "step": 16458 + }, + { + "epoch": 0.3292, + "grad_norm": 1.9453125, + "grad_norm_var": 0.01672948201497396, + "learning_rate": 0.0001, + "loss": 4.2265, + "loss/crossentropy": 2.1088568568229675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20265556871891022, + "step": 16460 + }, + { + "epoch": 0.32924, + "grad_norm": 1.921875, + "grad_norm_var": 0.017601521809895833, + "learning_rate": 0.0001, + "loss": 4.1004, + "loss/crossentropy": 2.3746429681777954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21173583716154099, + "step": 16462 + }, + { + "epoch": 0.32928, + "grad_norm": 2.078125, + "grad_norm_var": 0.01946996053059896, + "learning_rate": 0.0001, + "loss": 3.9577, + "loss/crossentropy": 2.0129969716072083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18141476809978485, + "step": 16464 + }, + { + "epoch": 0.32932, + "grad_norm": 1.9921875, + "grad_norm_var": 0.019160970052083334, + "learning_rate": 0.0001, + "loss": 4.2122, + "loss/crossentropy": 2.385707139968872, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22456685453653336, + "step": 16466 + }, + { + "epoch": 0.32936, + "grad_norm": 2.09375, + "grad_norm_var": 0.015958658854166665, + "learning_rate": 0.0001, + "loss": 4.183, + "loss/crossentropy": 2.466804623603821, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23166514188051224, + "step": 16468 + }, + { + "epoch": 0.3294, + "grad_norm": 1.8515625, + "grad_norm_var": 0.018700154622395833, + "learning_rate": 0.0001, + "loss": 3.8552, + "loss/crossentropy": 1.9182489514350891, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1918247565627098, + "step": 16470 + }, + { + "epoch": 0.32944, + "grad_norm": 2.125, + "grad_norm_var": 0.021993001302083332, + "learning_rate": 0.0001, + "loss": 4.4364, + "loss/crossentropy": 2.27492892742157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2237030565738678, + "step": 16472 + }, + { + "epoch": 0.32948, + "grad_norm": 2.03125, + "grad_norm_var": 0.0144927978515625, + "learning_rate": 0.0001, + "loss": 4.0022, + "loss/crossentropy": 1.856327474117279, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17718566209077835, + "step": 16474 + }, + { + "epoch": 0.32952, + "grad_norm": 1.859375, + "grad_norm_var": 0.0145263671875, + "learning_rate": 0.0001, + "loss": 3.9994, + "loss/crossentropy": 2.014274477958679, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20536774396896362, + "step": 16476 + }, + { + "epoch": 0.32956, + "grad_norm": 2.03125, + "grad_norm_var": 0.016383616129557292, + "learning_rate": 0.0001, + "loss": 4.4606, + "loss/crossentropy": 2.098844289779663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18847080320119858, + "step": 16478 + }, + { + "epoch": 0.3296, + "grad_norm": 1.890625, + "grad_norm_var": 0.014647420247395833, + "learning_rate": 0.0001, + "loss": 4.1978, + "loss/crossentropy": 2.0402814149856567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1929381936788559, + "step": 16480 + }, + { + "epoch": 0.32964, + "grad_norm": 2.0625, + "grad_norm_var": 0.013315582275390625, + "learning_rate": 0.0001, + "loss": 4.2737, + "loss/crossentropy": 2.413077712059021, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2426547110080719, + "step": 16482 + }, + { + "epoch": 0.32968, + "grad_norm": 2.015625, + "grad_norm_var": 0.013401031494140625, + "learning_rate": 0.0001, + "loss": 3.9544, + "loss/crossentropy": 2.081916332244873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2190306931734085, + "step": 16484 + }, + { + "epoch": 0.32972, + "grad_norm": 1.9375, + "grad_norm_var": 0.010765584309895833, + "learning_rate": 0.0001, + "loss": 4.1093, + "loss/crossentropy": 2.1017117500305176, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19809769093990326, + "step": 16486 + }, + { + "epoch": 0.32976, + "grad_norm": 1.8515625, + "grad_norm_var": 0.008139801025390626, + "learning_rate": 0.0001, + "loss": 3.9585, + "loss/crossentropy": 2.289687156677246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21503399312496185, + "step": 16488 + }, + { + "epoch": 0.3298, + "grad_norm": 2.09375, + "grad_norm_var": 0.008642323811848958, + "learning_rate": 0.0001, + "loss": 4.1952, + "loss/crossentropy": 2.0613157749176025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20428171753883362, + "step": 16490 + }, + { + "epoch": 0.32984, + "grad_norm": 2.0, + "grad_norm_var": 0.0073964436848958336, + "learning_rate": 0.0001, + "loss": 4.3191, + "loss/crossentropy": 2.1484400033950806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20850211381912231, + "step": 16492 + }, + { + "epoch": 0.32988, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0072629292805989586, + "learning_rate": 0.0001, + "loss": 3.875, + "loss/crossentropy": 1.6853107213974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1736336275935173, + "step": 16494 + }, + { + "epoch": 0.32992, + "grad_norm": 1.96875, + "grad_norm_var": 0.007389068603515625, + "learning_rate": 0.0001, + "loss": 4.2047, + "loss/crossentropy": 2.323481321334839, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22742202132940292, + "step": 16496 + }, + { + "epoch": 0.32996, + "grad_norm": 2.078125, + "grad_norm_var": 0.00738525390625, + "learning_rate": 0.0001, + "loss": 4.124, + "loss/crossentropy": 2.1196956038475037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19792834669351578, + "step": 16498 + }, + { + "epoch": 0.33, + "grad_norm": 1.9921875, + "grad_norm_var": 0.007417805989583333, + "learning_rate": 0.0001, + "loss": 4.0013, + "loss/crossentropy": 1.947241187095642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21067001670598984, + "step": 16500 + }, + { + "epoch": 0.33004, + "grad_norm": 2.046875, + "grad_norm_var": 0.00758056640625, + "learning_rate": 0.0001, + "loss": 4.0631, + "loss/crossentropy": 2.256502628326416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22300700098276138, + "step": 16502 + }, + { + "epoch": 0.33008, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007574208577473958, + "learning_rate": 0.0001, + "loss": 3.7874, + "loss/crossentropy": 1.7947281002998352, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1860191375017166, + "step": 16504 + }, + { + "epoch": 0.33012, + "grad_norm": 1.921875, + "grad_norm_var": 0.007995351155598959, + "learning_rate": 0.0001, + "loss": 4.0189, + "loss/crossentropy": 1.9331985712051392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20170047879219055, + "step": 16506 + }, + { + "epoch": 0.33016, + "grad_norm": 1.96875, + "grad_norm_var": 0.011030832926432291, + "learning_rate": 0.0001, + "loss": 4.2148, + "loss/crossentropy": 2.2274327278137207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1977931559085846, + "step": 16508 + }, + { + "epoch": 0.3302, + "grad_norm": 2.015625, + "grad_norm_var": 0.007884724934895834, + "learning_rate": 0.0001, + "loss": 4.15, + "loss/crossentropy": 1.8204082250595093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18195096403360367, + "step": 16510 + }, + { + "epoch": 0.33024, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009633127848307292, + "learning_rate": 0.0001, + "loss": 4.2363, + "loss/crossentropy": 2.308950901031494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22067535668611526, + "step": 16512 + }, + { + "epoch": 0.33028, + "grad_norm": 2.3125, + "grad_norm_var": 0.014794921875, + "learning_rate": 0.0001, + "loss": 4.3895, + "loss/crossentropy": 2.1682082414627075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2021712213754654, + "step": 16514 + }, + { + "epoch": 0.33032, + "grad_norm": 1.90625, + "grad_norm_var": 0.016047159830729168, + "learning_rate": 0.0001, + "loss": 3.9213, + "loss/crossentropy": 1.9669402837753296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19773626327514648, + "step": 16516 + }, + { + "epoch": 0.33036, + "grad_norm": 2.046875, + "grad_norm_var": 0.016047159830729168, + "learning_rate": 0.0001, + "loss": 4.1723, + "loss/crossentropy": 2.0478034019470215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19190094619989395, + "step": 16518 + }, + { + "epoch": 0.3304, + "grad_norm": 1.984375, + "grad_norm_var": 0.014127349853515625, + "learning_rate": 0.0001, + "loss": 4.1399, + "loss/crossentropy": 1.8262990713119507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19281092286109924, + "step": 16520 + }, + { + "epoch": 0.33044, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0139801025390625, + "learning_rate": 0.0001, + "loss": 4.385, + "loss/crossentropy": 2.3943055868148804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2286393940448761, + "step": 16522 + }, + { + "epoch": 0.33048, + "grad_norm": 1.90625, + "grad_norm_var": 0.0124664306640625, + "learning_rate": 0.0001, + "loss": 4.1117, + "loss/crossentropy": 1.7113690972328186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20282406359910965, + "step": 16524 + }, + { + "epoch": 0.33052, + "grad_norm": 1.9296875, + "grad_norm_var": 0.012924957275390624, + "learning_rate": 0.0001, + "loss": 3.9182, + "loss/crossentropy": 1.9077526926994324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18814775347709656, + "step": 16526 + }, + { + "epoch": 0.33056, + "grad_norm": 2.0625, + "grad_norm_var": 0.0106597900390625, + "learning_rate": 0.0001, + "loss": 4.3404, + "loss/crossentropy": 1.9100408554077148, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1897171437740326, + "step": 16528 + }, + { + "epoch": 0.3306, + "grad_norm": 1.84375, + "grad_norm_var": 0.0066314697265625, + "learning_rate": 0.0001, + "loss": 4.1403, + "loss/crossentropy": 1.8145674467086792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18221604079008102, + "step": 16530 + }, + { + "epoch": 0.33064, + "grad_norm": 2.1875, + "grad_norm_var": 0.0085113525390625, + "learning_rate": 0.0001, + "loss": 4.0647, + "loss/crossentropy": 2.017480492591858, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19173581898212433, + "step": 16532 + }, + { + "epoch": 0.33068, + "grad_norm": 1.9921875, + "grad_norm_var": 0.008739980061848958, + "learning_rate": 0.0001, + "loss": 4.1918, + "loss/crossentropy": 1.8937708139419556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1974407583475113, + "step": 16534 + }, + { + "epoch": 0.33072, + "grad_norm": 2.203125, + "grad_norm_var": 0.012664540608723959, + "learning_rate": 0.0001, + "loss": 4.1354, + "loss/crossentropy": 1.824431598186493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18114721775054932, + "step": 16536 + }, + { + "epoch": 0.33076, + "grad_norm": 2.03125, + "grad_norm_var": 0.018464152018229166, + "learning_rate": 0.0001, + "loss": 4.034, + "loss/crossentropy": 1.6330693364143372, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17686167359352112, + "step": 16538 + }, + { + "epoch": 0.3308, + "grad_norm": 2.453125, + "grad_norm_var": 0.028595987955729166, + "learning_rate": 0.0001, + "loss": 4.2359, + "loss/crossentropy": 2.1948810815811157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21977168321609497, + "step": 16540 + }, + { + "epoch": 0.33084, + "grad_norm": 2.328125, + "grad_norm_var": 0.03220926920572917, + "learning_rate": 0.0001, + "loss": 4.1942, + "loss/crossentropy": 2.1222333908081055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19481626898050308, + "step": 16542 + }, + { + "epoch": 0.33088, + "grad_norm": 2.140625, + "grad_norm_var": 0.03241780598958333, + "learning_rate": 0.0001, + "loss": 3.9908, + "loss/crossentropy": 1.9703376293182373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20912524312734604, + "step": 16544 + }, + { + "epoch": 0.33092, + "grad_norm": 1.8515625, + "grad_norm_var": 0.03206965128580729, + "learning_rate": 0.0001, + "loss": 4.0774, + "loss/crossentropy": 2.2693361043930054, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2137979120016098, + "step": 16546 + }, + { + "epoch": 0.33096, + "grad_norm": 3.703125, + "grad_norm_var": 0.19548746744791667, + "learning_rate": 0.0001, + "loss": 4.2372, + "loss/crossentropy": 2.029613673686981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21167294681072235, + "step": 16548 + }, + { + "epoch": 0.331, + "grad_norm": 1.90625, + "grad_norm_var": 0.19552586873372396, + "learning_rate": 0.0001, + "loss": 3.7223, + "loss/crossentropy": 2.0102853775024414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2100009247660637, + "step": 16550 + }, + { + "epoch": 0.33104, + "grad_norm": 2.234375, + "grad_norm_var": 0.19806722005208333, + "learning_rate": 0.0001, + "loss": 4.6058, + "loss/crossentropy": 2.1570287942886353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21619540452957153, + "step": 16552 + }, + { + "epoch": 0.33108, + "grad_norm": 2.046875, + "grad_norm_var": 0.20053609212239584, + "learning_rate": 0.0001, + "loss": 4.0271, + "loss/crossentropy": 1.9157934188842773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.193254753947258, + "step": 16554 + }, + { + "epoch": 0.33112, + "grad_norm": 1.8515625, + "grad_norm_var": 0.2052642822265625, + "learning_rate": 0.0001, + "loss": 3.6593, + "loss/crossentropy": 1.7680367827415466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19283122569322586, + "step": 16556 + }, + { + "epoch": 0.33116, + "grad_norm": 1.8984375, + "grad_norm_var": 0.2058013916015625, + "learning_rate": 0.0001, + "loss": 3.8828, + "loss/crossentropy": 1.8805989623069763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20299813151359558, + "step": 16558 + }, + { + "epoch": 0.3312, + "grad_norm": 1.71875, + "grad_norm_var": 0.21938069661458334, + "learning_rate": 0.0001, + "loss": 3.7657, + "loss/crossentropy": 1.9145240187644958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18402665108442307, + "step": 16560 + }, + { + "epoch": 0.33124, + "grad_norm": 1.9453125, + "grad_norm_var": 0.2167144775390625, + "learning_rate": 0.0001, + "loss": 4.1984, + "loss/crossentropy": 1.7958417534828186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17038261890411377, + "step": 16562 + }, + { + "epoch": 0.33128, + "grad_norm": 1.8984375, + "grad_norm_var": 0.038919830322265626, + "learning_rate": 0.0001, + "loss": 3.8916, + "loss/crossentropy": 2.0782148838043213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21738532930612564, + "step": 16564 + }, + { + "epoch": 0.33132, + "grad_norm": 2.046875, + "grad_norm_var": 0.03910903930664063, + "learning_rate": 0.0001, + "loss": 4.3098, + "loss/crossentropy": 1.9767170548439026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2127690687775612, + "step": 16566 + }, + { + "epoch": 0.33136, + "grad_norm": 1.9296875, + "grad_norm_var": 0.01185302734375, + "learning_rate": 0.0001, + "loss": 3.9133, + "loss/crossentropy": 2.086383044719696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19811449944972992, + "step": 16568 + }, + { + "epoch": 0.3314, + "grad_norm": 1.9453125, + "grad_norm_var": 0.011991119384765625, + "learning_rate": 0.0001, + "loss": 4.0358, + "loss/crossentropy": 2.0471088886260986, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20480120182037354, + "step": 16570 + }, + { + "epoch": 0.33144, + "grad_norm": 2.109375, + "grad_norm_var": 0.012485504150390625, + "learning_rate": 0.0001, + "loss": 4.1097, + "loss/crossentropy": 2.2637689113616943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21112071722745895, + "step": 16572 + }, + { + "epoch": 0.33148, + "grad_norm": 2.015625, + "grad_norm_var": 0.012018839518229166, + "learning_rate": 0.0001, + "loss": 4.023, + "loss/crossentropy": 1.8035425543785095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18357180804014206, + "step": 16574 + }, + { + "epoch": 0.33152, + "grad_norm": 1.953125, + "grad_norm_var": 0.007189687093098958, + "learning_rate": 0.0001, + "loss": 3.7453, + "loss/crossentropy": 1.647614598274231, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17512448877096176, + "step": 16576 + }, + { + "epoch": 0.33156, + "grad_norm": 2.078125, + "grad_norm_var": 0.0063873291015625, + "learning_rate": 0.0001, + "loss": 4.0452, + "loss/crossentropy": 2.0647078156471252, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21568534523248672, + "step": 16578 + }, + { + "epoch": 0.3316, + "grad_norm": 1.9296875, + "grad_norm_var": 0.00615234375, + "learning_rate": 0.0001, + "loss": 3.8656, + "loss/crossentropy": 1.7758954763412476, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1891157627105713, + "step": 16580 + }, + { + "epoch": 0.33164, + "grad_norm": 2.125, + "grad_norm_var": 0.0060618082682291664, + "learning_rate": 0.0001, + "loss": 4.1022, + "loss/crossentropy": 2.0523850321769714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21198877692222595, + "step": 16582 + }, + { + "epoch": 0.33168, + "grad_norm": 2.078125, + "grad_norm_var": 0.006725819905598959, + "learning_rate": 0.0001, + "loss": 4.0584, + "loss/crossentropy": 1.99459570646286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20555292069911957, + "step": 16584 + }, + { + "epoch": 0.33172, + "grad_norm": 2.046875, + "grad_norm_var": 0.007771809895833333, + "learning_rate": 0.0001, + "loss": 4.4979, + "loss/crossentropy": 2.353522777557373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23193839937448502, + "step": 16586 + }, + { + "epoch": 0.33176, + "grad_norm": 2.078125, + "grad_norm_var": 0.020970662434895832, + "learning_rate": 0.0001, + "loss": 4.1303, + "loss/crossentropy": 2.0257768630981445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20071294903755188, + "step": 16588 + }, + { + "epoch": 0.3318, + "grad_norm": 2.046875, + "grad_norm_var": 0.020467122395833332, + "learning_rate": 0.0001, + "loss": 4.2135, + "loss/crossentropy": 2.2084985971450806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20876871049404144, + "step": 16590 + }, + { + "epoch": 0.33184, + "grad_norm": 2.15625, + "grad_norm_var": 0.019791412353515624, + "learning_rate": 0.0001, + "loss": 4.3275, + "loss/crossentropy": 1.9118791818618774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24547216296195984, + "step": 16592 + }, + { + "epoch": 0.33188, + "grad_norm": 1.90625, + "grad_norm_var": 0.022188313802083335, + "learning_rate": 0.0001, + "loss": 4.0069, + "loss/crossentropy": 2.184453248977661, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1988314613699913, + "step": 16594 + }, + { + "epoch": 0.33192, + "grad_norm": 1.71875, + "grad_norm_var": 0.02801488240559896, + "learning_rate": 0.0001, + "loss": 3.8711, + "loss/crossentropy": 2.2105953097343445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20877376198768616, + "step": 16596 + }, + { + "epoch": 0.33196, + "grad_norm": 2.078125, + "grad_norm_var": 0.028586578369140626, + "learning_rate": 0.0001, + "loss": 4.2969, + "loss/crossentropy": 2.028349459171295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20705370604991913, + "step": 16598 + }, + { + "epoch": 0.332, + "grad_norm": 2.015625, + "grad_norm_var": 0.02982177734375, + "learning_rate": 0.0001, + "loss": 3.8968, + "loss/crossentropy": 2.0778703689575195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19761346280574799, + "step": 16600 + }, + { + "epoch": 0.33204, + "grad_norm": 1.8828125, + "grad_norm_var": 0.029670206705729167, + "learning_rate": 0.0001, + "loss": 3.9786, + "loss/crossentropy": 2.046931743621826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21325716376304626, + "step": 16602 + }, + { + "epoch": 0.33208, + "grad_norm": 1.9296875, + "grad_norm_var": 0.011486562093098958, + "learning_rate": 0.0001, + "loss": 4.1525, + "loss/crossentropy": 1.7907955050468445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1822371482849121, + "step": 16604 + }, + { + "epoch": 0.33212, + "grad_norm": 2.09375, + "grad_norm_var": 0.012108357747395833, + "learning_rate": 0.0001, + "loss": 4.1586, + "loss/crossentropy": 1.7620025277137756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19196610152721405, + "step": 16606 + }, + { + "epoch": 0.33216, + "grad_norm": 2.015625, + "grad_norm_var": 0.010282135009765625, + "learning_rate": 0.0001, + "loss": 3.9175, + "loss/crossentropy": 1.6695470213890076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16451346129179, + "step": 16608 + }, + { + "epoch": 0.3322, + "grad_norm": 2.09375, + "grad_norm_var": 0.012717437744140626, + "learning_rate": 0.0001, + "loss": 4.0304, + "loss/crossentropy": 2.2249897718429565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21898073703050613, + "step": 16610 + }, + { + "epoch": 0.33224, + "grad_norm": 1.8984375, + "grad_norm_var": 0.009474436442057291, + "learning_rate": 0.0001, + "loss": 4.1589, + "loss/crossentropy": 2.0861976146698, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1993977203965187, + "step": 16612 + }, + { + "epoch": 0.33228, + "grad_norm": 1.859375, + "grad_norm_var": 0.009527333577473958, + "learning_rate": 0.0001, + "loss": 4.1516, + "loss/crossentropy": 2.195378541946411, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2103184312582016, + "step": 16614 + }, + { + "epoch": 0.33232, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009478505452473958, + "learning_rate": 0.0001, + "loss": 3.6947, + "loss/crossentropy": 1.9064915180206299, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17310689389705658, + "step": 16616 + }, + { + "epoch": 0.33236, + "grad_norm": 1.9921875, + "grad_norm_var": 0.009452311197916667, + "learning_rate": 0.0001, + "loss": 3.9698, + "loss/crossentropy": 1.7967488169670105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18099892884492874, + "step": 16618 + }, + { + "epoch": 0.3324, + "grad_norm": 2.140625, + "grad_norm_var": 0.012565104166666667, + "learning_rate": 0.0001, + "loss": 4.1319, + "loss/crossentropy": 2.0776742696762085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2005431056022644, + "step": 16620 + }, + { + "epoch": 0.33244, + "grad_norm": 1.90625, + "grad_norm_var": 0.0109375, + "learning_rate": 0.0001, + "loss": 4.1881, + "loss/crossentropy": 2.0590518712997437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20676207542419434, + "step": 16622 + }, + { + "epoch": 0.33248, + "grad_norm": 2.0, + "grad_norm_var": 0.011628977457682292, + "learning_rate": 0.0001, + "loss": 4.2067, + "loss/crossentropy": 1.8861089944839478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1860756129026413, + "step": 16624 + }, + { + "epoch": 0.33252, + "grad_norm": 1.9453125, + "grad_norm_var": 0.009308878580729167, + "learning_rate": 0.0001, + "loss": 3.9757, + "loss/crossentropy": 2.4207329750061035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22214040160179138, + "step": 16626 + }, + { + "epoch": 0.33256, + "grad_norm": 1.8671875, + "grad_norm_var": 0.009295399983723958, + "learning_rate": 0.0001, + "loss": 3.934, + "loss/crossentropy": 2.089089274406433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21007104218006134, + "step": 16628 + }, + { + "epoch": 0.3326, + "grad_norm": 1.984375, + "grad_norm_var": 0.011380767822265625, + "learning_rate": 0.0001, + "loss": 4.3451, + "loss/crossentropy": 2.357658624649048, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22044895589351654, + "step": 16630 + }, + { + "epoch": 0.33264, + "grad_norm": 1.875, + "grad_norm_var": 0.010223134358723959, + "learning_rate": 0.0001, + "loss": 3.9142, + "loss/crossentropy": 2.1036278009414673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2003040835261345, + "step": 16632 + }, + { + "epoch": 0.33268, + "grad_norm": 1.859375, + "grad_norm_var": 0.010749308268229167, + "learning_rate": 0.0001, + "loss": 4.0802, + "loss/crossentropy": 2.202688694000244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21549009531736374, + "step": 16634 + }, + { + "epoch": 0.33272, + "grad_norm": 2.078125, + "grad_norm_var": 0.008642323811848958, + "learning_rate": 0.0001, + "loss": 4.1642, + "loss/crossentropy": 2.261624753475189, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2209767997264862, + "step": 16636 + }, + { + "epoch": 0.33276, + "grad_norm": 2.234375, + "grad_norm_var": 0.013570149739583334, + "learning_rate": 0.0001, + "loss": 4.0556, + "loss/crossentropy": 1.7452040910720825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18558355420827866, + "step": 16638 + }, + { + "epoch": 0.3328, + "grad_norm": 1.9609375, + "grad_norm_var": 0.012797037760416666, + "learning_rate": 0.0001, + "loss": 3.9364, + "loss/crossentropy": 1.9910151362419128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2115819901227951, + "step": 16640 + }, + { + "epoch": 0.33284, + "grad_norm": 2.015625, + "grad_norm_var": 0.014583079020182292, + "learning_rate": 0.0001, + "loss": 3.9412, + "loss/crossentropy": 2.0230116844177246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2001063972711563, + "step": 16642 + }, + { + "epoch": 0.33288, + "grad_norm": 1.8359375, + "grad_norm_var": 0.015818023681640626, + "learning_rate": 0.0001, + "loss": 4.0476, + "loss/crossentropy": 1.9575697183609009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20243427157402039, + "step": 16644 + }, + { + "epoch": 0.33292, + "grad_norm": 1.96875, + "grad_norm_var": 0.012890370686848958, + "learning_rate": 0.0001, + "loss": 4.052, + "loss/crossentropy": 1.9650630354881287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.203716441988945, + "step": 16646 + }, + { + "epoch": 0.33296, + "grad_norm": 1.859375, + "grad_norm_var": 0.0132476806640625, + "learning_rate": 0.0001, + "loss": 4.1345, + "loss/crossentropy": 1.497282326221466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15893913060426712, + "step": 16648 + }, + { + "epoch": 0.333, + "grad_norm": 1.9765625, + "grad_norm_var": 0.014491526285807292, + "learning_rate": 0.0001, + "loss": 4.1273, + "loss/crossentropy": 1.8017843961715698, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18981081247329712, + "step": 16650 + }, + { + "epoch": 0.33304, + "grad_norm": 2.125, + "grad_norm_var": 0.014296213785807291, + "learning_rate": 0.0001, + "loss": 4.07, + "loss/crossentropy": 2.089366614818573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2076069712638855, + "step": 16652 + }, + { + "epoch": 0.33308, + "grad_norm": 1.8515625, + "grad_norm_var": 0.008695475260416667, + "learning_rate": 0.0001, + "loss": 3.9662, + "loss/crossentropy": 1.9445012211799622, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18241490423679352, + "step": 16654 + }, + { + "epoch": 0.33312, + "grad_norm": 1.90625, + "grad_norm_var": 0.008740234375, + "learning_rate": 0.0001, + "loss": 3.9922, + "loss/crossentropy": 1.844423532485962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18664997816085815, + "step": 16656 + }, + { + "epoch": 0.33316, + "grad_norm": 1.9765625, + "grad_norm_var": 0.007085927327473958, + "learning_rate": 0.0001, + "loss": 4.2221, + "loss/crossentropy": 2.1095730662345886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21053151786327362, + "step": 16658 + }, + { + "epoch": 0.3332, + "grad_norm": 2.03125, + "grad_norm_var": 0.007477823893229167, + "learning_rate": 0.0001, + "loss": 4.255, + "loss/crossentropy": 2.139304041862488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19131766259670258, + "step": 16660 + }, + { + "epoch": 0.33324, + "grad_norm": 2.0, + "grad_norm_var": 0.007450103759765625, + "learning_rate": 0.0001, + "loss": 3.9358, + "loss/crossentropy": 1.8872849345207214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19200573861598969, + "step": 16662 + }, + { + "epoch": 0.33328, + "grad_norm": 1.984375, + "grad_norm_var": 0.006695302327473959, + "learning_rate": 0.0001, + "loss": 4.0939, + "loss/crossentropy": 2.488931655883789, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24411997199058533, + "step": 16664 + }, + { + "epoch": 0.33332, + "grad_norm": 1.9609375, + "grad_norm_var": 0.006151326497395833, + "learning_rate": 0.0001, + "loss": 3.7294, + "loss/crossentropy": 1.8066997528076172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19391655176877975, + "step": 16666 + }, + { + "epoch": 0.33336, + "grad_norm": 2.0, + "grad_norm_var": 0.005370076497395833, + "learning_rate": 0.0001, + "loss": 4.1831, + "loss/crossentropy": 2.085490345954895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21987342834472656, + "step": 16668 + }, + { + "epoch": 0.3334, + "grad_norm": 1.9765625, + "grad_norm_var": 0.006459299723307292, + "learning_rate": 0.0001, + "loss": 3.7791, + "loss/crossentropy": 2.052145302295685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21673783659934998, + "step": 16670 + }, + { + "epoch": 0.33344, + "grad_norm": 1.9140625, + "grad_norm_var": 0.00640869140625, + "learning_rate": 0.0001, + "loss": 4.1648, + "loss/crossentropy": 2.0924419164657593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21523578464984894, + "step": 16672 + }, + { + "epoch": 0.33348, + "grad_norm": 1.96875, + "grad_norm_var": 0.006525675455729167, + "learning_rate": 0.0001, + "loss": 4.125, + "loss/crossentropy": 2.0340868830680847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1785074919462204, + "step": 16674 + }, + { + "epoch": 0.33352, + "grad_norm": 2.25, + "grad_norm_var": 0.010990397135416666, + "learning_rate": 0.0001, + "loss": 4.2404, + "loss/crossentropy": 2.4421908855438232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24263421446084976, + "step": 16676 + }, + { + "epoch": 0.33356, + "grad_norm": 1.953125, + "grad_norm_var": 0.0108551025390625, + "learning_rate": 0.0001, + "loss": 4.1879, + "loss/crossentropy": 1.9141955971717834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18666712939739227, + "step": 16678 + }, + { + "epoch": 0.3336, + "grad_norm": 2.015625, + "grad_norm_var": 0.011336008707682291, + "learning_rate": 0.0001, + "loss": 4.0378, + "loss/crossentropy": 2.0019638538360596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17722105979919434, + "step": 16680 + }, + { + "epoch": 0.33364, + "grad_norm": 1.984375, + "grad_norm_var": 0.009511057535807292, + "learning_rate": 0.0001, + "loss": 4.2047, + "loss/crossentropy": 2.0835896134376526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.251879021525383, + "step": 16682 + }, + { + "epoch": 0.33368, + "grad_norm": 1.9609375, + "grad_norm_var": 0.015192667643229166, + "learning_rate": 0.0001, + "loss": 4.2406, + "loss/crossentropy": 1.8940032720565796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19034714996814728, + "step": 16684 + }, + { + "epoch": 0.33372, + "grad_norm": 2.09375, + "grad_norm_var": 0.014817047119140624, + "learning_rate": 0.0001, + "loss": 4.0845, + "loss/crossentropy": 1.863099992275238, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19675587117671967, + "step": 16686 + }, + { + "epoch": 0.33376, + "grad_norm": 2.046875, + "grad_norm_var": 0.024836222330729168, + "learning_rate": 0.0001, + "loss": 4.2785, + "loss/crossentropy": 2.027459740638733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19650273770093918, + "step": 16688 + }, + { + "epoch": 0.3338, + "grad_norm": 1.890625, + "grad_norm_var": 0.026008097330729167, + "learning_rate": 0.0001, + "loss": 4.2892, + "loss/crossentropy": 2.2324944734573364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23016268759965897, + "step": 16690 + }, + { + "epoch": 0.33384, + "grad_norm": 2.03125, + "grad_norm_var": 0.02474950154622396, + "learning_rate": 0.0001, + "loss": 4.155, + "loss/crossentropy": 2.213741898536682, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21159538626670837, + "step": 16692 + }, + { + "epoch": 0.33388, + "grad_norm": 2.03125, + "grad_norm_var": 0.025340779622395834, + "learning_rate": 0.0001, + "loss": 4.4426, + "loss/crossentropy": 2.084823966026306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22060541808605194, + "step": 16694 + }, + { + "epoch": 0.33392, + "grad_norm": 1.9765625, + "grad_norm_var": 0.02417780558268229, + "learning_rate": 0.0001, + "loss": 4.1847, + "loss/crossentropy": 2.2417017221450806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2050381377339363, + "step": 16696 + }, + { + "epoch": 0.33396, + "grad_norm": 2.015625, + "grad_norm_var": 0.032134755452473955, + "learning_rate": 0.0001, + "loss": 4.3037, + "loss/crossentropy": 2.3696242570877075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22559890151023865, + "step": 16698 + }, + { + "epoch": 0.334, + "grad_norm": 1.8828125, + "grad_norm_var": 0.030295562744140626, + "learning_rate": 0.0001, + "loss": 4.158, + "loss/crossentropy": 2.0899609327316284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20470503717660904, + "step": 16700 + }, + { + "epoch": 0.33404, + "grad_norm": 1.90625, + "grad_norm_var": 0.029386138916015624, + "learning_rate": 0.0001, + "loss": 3.924, + "loss/crossentropy": 1.8663234114646912, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1815531924366951, + "step": 16702 + }, + { + "epoch": 0.33408, + "grad_norm": 3.0, + "grad_norm_var": 0.08069432576497396, + "learning_rate": 0.0001, + "loss": 4.1651, + "loss/crossentropy": 2.030432403087616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2063748836517334, + "step": 16704 + }, + { + "epoch": 0.33412, + "grad_norm": 1.90625, + "grad_norm_var": 0.08033218383789062, + "learning_rate": 0.0001, + "loss": 4.152, + "loss/crossentropy": 2.12885981798172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19536730647087097, + "step": 16706 + }, + { + "epoch": 0.33416, + "grad_norm": 2.09375, + "grad_norm_var": 0.07674153645833333, + "learning_rate": 0.0001, + "loss": 3.909, + "loss/crossentropy": 2.0870607495307922, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20478558540344238, + "step": 16708 + }, + { + "epoch": 0.3342, + "grad_norm": 2.03125, + "grad_norm_var": 0.07646382649739583, + "learning_rate": 0.0001, + "loss": 4.2058, + "loss/crossentropy": 2.215463638305664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2118721902370453, + "step": 16710 + }, + { + "epoch": 0.33424, + "grad_norm": 1.9609375, + "grad_norm_var": 0.08145243326822917, + "learning_rate": 0.0001, + "loss": 3.8417, + "loss/crossentropy": 2.0292049646377563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18440847843885422, + "step": 16712 + }, + { + "epoch": 0.33428, + "grad_norm": 1.828125, + "grad_norm_var": 0.08017756144205729, + "learning_rate": 0.0001, + "loss": 3.7722, + "loss/crossentropy": 1.9214341640472412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18510671705007553, + "step": 16714 + }, + { + "epoch": 0.33432, + "grad_norm": 2.046875, + "grad_norm_var": 0.07769266764322917, + "learning_rate": 0.0001, + "loss": 4.3905, + "loss/crossentropy": 2.1761534214019775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20527766644954681, + "step": 16716 + }, + { + "epoch": 0.33436, + "grad_norm": 1.9765625, + "grad_norm_var": 0.07639567057291667, + "learning_rate": 0.0001, + "loss": 3.9234, + "loss/crossentropy": 2.020824670791626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20723462104797363, + "step": 16718 + }, + { + "epoch": 0.3344, + "grad_norm": 1.921875, + "grad_norm_var": 0.0129547119140625, + "learning_rate": 0.0001, + "loss": 4.167, + "loss/crossentropy": 2.243465781211853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21866197139024734, + "step": 16720 + }, + { + "epoch": 0.33444, + "grad_norm": 1.859375, + "grad_norm_var": 0.013244374593098959, + "learning_rate": 0.0001, + "loss": 3.9963, + "loss/crossentropy": 1.9071390628814697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20368033647537231, + "step": 16722 + }, + { + "epoch": 0.33448, + "grad_norm": 1.7890625, + "grad_norm_var": 0.011966959635416666, + "learning_rate": 0.0001, + "loss": 3.8974, + "loss/crossentropy": 1.72525554895401, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18200145661830902, + "step": 16724 + }, + { + "epoch": 0.33452, + "grad_norm": 2.109375, + "grad_norm_var": 0.0154052734375, + "learning_rate": 0.0001, + "loss": 4.101, + "loss/crossentropy": 2.174055576324463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21181844174861908, + "step": 16726 + }, + { + "epoch": 0.33456, + "grad_norm": 1.8984375, + "grad_norm_var": 0.014251454671223959, + "learning_rate": 0.0001, + "loss": 3.8004, + "loss/crossentropy": 1.6175724864006042, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15338046848773956, + "step": 16728 + }, + { + "epoch": 0.3346, + "grad_norm": 1.953125, + "grad_norm_var": 0.012353515625, + "learning_rate": 0.0001, + "loss": 4.1101, + "loss/crossentropy": 2.0030194520950317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20353461802005768, + "step": 16730 + }, + { + "epoch": 0.33464, + "grad_norm": 2.03125, + "grad_norm_var": 0.012092081705729167, + "learning_rate": 0.0001, + "loss": 4.3876, + "loss/crossentropy": 2.272015690803528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22368235886096954, + "step": 16732 + }, + { + "epoch": 0.33468, + "grad_norm": 2.046875, + "grad_norm_var": 0.012189737955729167, + "learning_rate": 0.0001, + "loss": 4.3365, + "loss/crossentropy": 2.149975538253784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20634270459413528, + "step": 16734 + }, + { + "epoch": 0.33472, + "grad_norm": 1.921875, + "grad_norm_var": 0.009869130452473958, + "learning_rate": 0.0001, + "loss": 3.9201, + "loss/crossentropy": 2.095462441444397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20027077198028564, + "step": 16736 + }, + { + "epoch": 0.33476, + "grad_norm": 1.953125, + "grad_norm_var": 0.008819325764973959, + "learning_rate": 0.0001, + "loss": 4.0859, + "loss/crossentropy": 1.988598644733429, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20503661036491394, + "step": 16738 + }, + { + "epoch": 0.3348, + "grad_norm": 2.03125, + "grad_norm_var": 0.0066569010416666664, + "learning_rate": 0.0001, + "loss": 4.2618, + "loss/crossentropy": 2.0615572333335876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20363956689834595, + "step": 16740 + }, + { + "epoch": 0.33484, + "grad_norm": 2.0625, + "grad_norm_var": 0.0027903238932291668, + "learning_rate": 0.0001, + "loss": 3.9476, + "loss/crossentropy": 1.8172455430030823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20091129839420319, + "step": 16742 + }, + { + "epoch": 0.33488, + "grad_norm": 1.984375, + "grad_norm_var": 0.0043853759765625, + "learning_rate": 0.0001, + "loss": 3.9541, + "loss/crossentropy": 2.212782144546509, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19710610061883926, + "step": 16744 + }, + { + "epoch": 0.33492, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008674875895182291, + "learning_rate": 0.0001, + "loss": 4.2629, + "loss/crossentropy": 2.374183773994446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23687051236629486, + "step": 16746 + }, + { + "epoch": 0.33496, + "grad_norm": 2.0625, + "grad_norm_var": 0.008902740478515626, + "learning_rate": 0.0001, + "loss": 4.3642, + "loss/crossentropy": 1.9374622702598572, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19821622967720032, + "step": 16748 + }, + { + "epoch": 0.335, + "grad_norm": 1.984375, + "grad_norm_var": 0.008740234375, + "learning_rate": 0.0001, + "loss": 4.1289, + "loss/crossentropy": 1.6063715815544128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16296840459108353, + "step": 16750 + }, + { + "epoch": 0.33504, + "grad_norm": 1.859375, + "grad_norm_var": 0.0096832275390625, + "learning_rate": 0.0001, + "loss": 4.0315, + "loss/crossentropy": 2.1251469254493713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17997874319553375, + "step": 16752 + }, + { + "epoch": 0.33508, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0097564697265625, + "learning_rate": 0.0001, + "loss": 4.03, + "loss/crossentropy": 2.126043915748596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2283206582069397, + "step": 16754 + }, + { + "epoch": 0.33512, + "grad_norm": 2.046875, + "grad_norm_var": 0.009901682535807291, + "learning_rate": 0.0001, + "loss": 4.2969, + "loss/crossentropy": 2.3409340381622314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21817704290151596, + "step": 16756 + }, + { + "epoch": 0.33516, + "grad_norm": 2.0, + "grad_norm_var": 0.01024169921875, + "learning_rate": 0.0001, + "loss": 4.0083, + "loss/crossentropy": 1.9816790223121643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19743956625461578, + "step": 16758 + }, + { + "epoch": 0.3352, + "grad_norm": 2.21875, + "grad_norm_var": 0.012967681884765625, + "learning_rate": 0.0001, + "loss": 4.3056, + "loss/crossentropy": 2.246092438697815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22662658989429474, + "step": 16760 + }, + { + "epoch": 0.33524, + "grad_norm": 2.125, + "grad_norm_var": 0.010794830322265626, + "learning_rate": 0.0001, + "loss": 4.2842, + "loss/crossentropy": 2.1571802496910095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20506682991981506, + "step": 16762 + }, + { + "epoch": 0.33528, + "grad_norm": 2.015625, + "grad_norm_var": 0.010636138916015624, + "learning_rate": 0.0001, + "loss": 4.0372, + "loss/crossentropy": 1.966954231262207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18581678718328476, + "step": 16764 + }, + { + "epoch": 0.33532, + "grad_norm": 2.125, + "grad_norm_var": 0.012961578369140626, + "learning_rate": 0.0001, + "loss": 3.9978, + "loss/crossentropy": 1.9360153079032898, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20846693962812424, + "step": 16766 + }, + { + "epoch": 0.33536, + "grad_norm": 2.0625, + "grad_norm_var": 0.012644195556640625, + "learning_rate": 0.0001, + "loss": 4.0953, + "loss/crossentropy": 2.3978073596954346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20406542718410492, + "step": 16768 + }, + { + "epoch": 0.3354, + "grad_norm": 2.109375, + "grad_norm_var": 0.013639068603515625, + "learning_rate": 0.0001, + "loss": 4.0933, + "loss/crossentropy": 2.239229917526245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21536701172590256, + "step": 16770 + }, + { + "epoch": 0.33544, + "grad_norm": 1.9765625, + "grad_norm_var": 0.013600413004557292, + "learning_rate": 0.0001, + "loss": 4.2153, + "loss/crossentropy": 2.1333428621292114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2062046378850937, + "step": 16772 + }, + { + "epoch": 0.33548, + "grad_norm": 2.09375, + "grad_norm_var": 0.014362589518229166, + "learning_rate": 0.0001, + "loss": 4.0094, + "loss/crossentropy": 2.3503127098083496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20422626286745071, + "step": 16774 + }, + { + "epoch": 0.33552, + "grad_norm": 1.9140625, + "grad_norm_var": 0.009772745768229167, + "learning_rate": 0.0001, + "loss": 4.1466, + "loss/crossentropy": 1.9181209802627563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16844520717859268, + "step": 16776 + }, + { + "epoch": 0.33556, + "grad_norm": 2.015625, + "grad_norm_var": 0.009269205729166667, + "learning_rate": 0.0001, + "loss": 4.1975, + "loss/crossentropy": 2.022172212600708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19443488866090775, + "step": 16778 + }, + { + "epoch": 0.3356, + "grad_norm": 1.9765625, + "grad_norm_var": 0.010951487223307292, + "learning_rate": 0.0001, + "loss": 3.9544, + "loss/crossentropy": 2.0416210293769836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17731749266386032, + "step": 16780 + }, + { + "epoch": 0.33564, + "grad_norm": 2.0625, + "grad_norm_var": 0.009748331705729167, + "learning_rate": 0.0001, + "loss": 4.0757, + "loss/crossentropy": 2.1722596883773804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19995421171188354, + "step": 16782 + }, + { + "epoch": 0.33568, + "grad_norm": 1.953125, + "grad_norm_var": 0.009146881103515626, + "learning_rate": 0.0001, + "loss": 4.0942, + "loss/crossentropy": 2.4041545391082764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22784388065338135, + "step": 16784 + }, + { + "epoch": 0.33572, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007757314046223958, + "learning_rate": 0.0001, + "loss": 4.0692, + "loss/crossentropy": 2.189204216003418, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20079431682825089, + "step": 16786 + }, + { + "epoch": 0.33576, + "grad_norm": 1.890625, + "grad_norm_var": 0.007130686442057292, + "learning_rate": 0.0001, + "loss": 3.9771, + "loss/crossentropy": 2.2308130860328674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22352158278226852, + "step": 16788 + }, + { + "epoch": 0.3358, + "grad_norm": 1.9765625, + "grad_norm_var": 0.005248006184895833, + "learning_rate": 0.0001, + "loss": 4.0901, + "loss/crossentropy": 2.238118886947632, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21165399253368378, + "step": 16790 + }, + { + "epoch": 0.33584, + "grad_norm": 11.9375, + "grad_norm_var": 6.248060862223308, + "learning_rate": 0.0001, + "loss": 4.5633, + "loss/crossentropy": 2.452346444129944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2192159816622734, + "step": 16792 + }, + { + "epoch": 0.33588, + "grad_norm": 1.9765625, + "grad_norm_var": 6.234175364176433, + "learning_rate": 0.0001, + "loss": 4.2032, + "loss/crossentropy": 2.115506410598755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19228096306324005, + "step": 16794 + }, + { + "epoch": 0.33592, + "grad_norm": 2.046875, + "grad_norm_var": 6.207706705729167, + "learning_rate": 0.0001, + "loss": 4.313, + "loss/crossentropy": 2.1052395701408386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24314512312412262, + "step": 16796 + }, + { + "epoch": 0.33596, + "grad_norm": 1.9765625, + "grad_norm_var": 6.191576131184896, + "learning_rate": 0.0001, + "loss": 4.201, + "loss/crossentropy": 1.7643597722053528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17998731136322021, + "step": 16798 + }, + { + "epoch": 0.336, + "grad_norm": 1.890625, + "grad_norm_var": 6.213846588134766, + "learning_rate": 0.0001, + "loss": 3.886, + "loss/crossentropy": 2.0926302671432495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.199835367500782, + "step": 16800 + }, + { + "epoch": 0.33604, + "grad_norm": 2.109375, + "grad_norm_var": 6.19762954711914, + "learning_rate": 0.0001, + "loss": 4.0106, + "loss/crossentropy": 1.9939849972724915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19728610664606094, + "step": 16802 + }, + { + "epoch": 0.33608, + "grad_norm": 2.15625, + "grad_norm_var": 6.1746826171875, + "learning_rate": 0.0001, + "loss": 4.3381, + "loss/crossentropy": 2.154136300086975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2071739211678505, + "step": 16804 + }, + { + "epoch": 0.33612, + "grad_norm": 1.9296875, + "grad_norm_var": 6.173281860351563, + "learning_rate": 0.0001, + "loss": 4.06, + "loss/crossentropy": 1.6244451403617859, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1825866624712944, + "step": 16806 + }, + { + "epoch": 0.33616, + "grad_norm": 1.890625, + "grad_norm_var": 0.010945383707682292, + "learning_rate": 0.0001, + "loss": 3.6796, + "loss/crossentropy": 1.936405599117279, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20031629502773285, + "step": 16808 + }, + { + "epoch": 0.3362, + "grad_norm": 2.046875, + "grad_norm_var": 0.011568196614583333, + "learning_rate": 0.0001, + "loss": 4.276, + "loss/crossentropy": 2.117881119251251, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21941083669662476, + "step": 16810 + }, + { + "epoch": 0.33624, + "grad_norm": 1.921875, + "grad_norm_var": 0.0125152587890625, + "learning_rate": 0.0001, + "loss": 4.1104, + "loss/crossentropy": 2.0384327173233032, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2019578069448471, + "step": 16812 + }, + { + "epoch": 0.33628, + "grad_norm": 1.921875, + "grad_norm_var": 0.011893463134765626, + "learning_rate": 0.0001, + "loss": 4.1383, + "loss/crossentropy": 2.058988571166992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20959977805614471, + "step": 16814 + }, + { + "epoch": 0.33632, + "grad_norm": 1.9296875, + "grad_norm_var": 0.010680898030598959, + "learning_rate": 0.0001, + "loss": 4.0734, + "loss/crossentropy": 2.117241382598877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2039024829864502, + "step": 16816 + }, + { + "epoch": 0.33636, + "grad_norm": 2.0, + "grad_norm_var": 0.009968058268229166, + "learning_rate": 0.0001, + "loss": 4.157, + "loss/crossentropy": 1.8142318725585938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1877349317073822, + "step": 16818 + }, + { + "epoch": 0.3364, + "grad_norm": 2.296875, + "grad_norm_var": 0.014440663655598958, + "learning_rate": 0.0001, + "loss": 4.2402, + "loss/crossentropy": 2.1766676902770996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21788865327835083, + "step": 16820 + }, + { + "epoch": 0.33644, + "grad_norm": 2.078125, + "grad_norm_var": 0.0163726806640625, + "learning_rate": 0.0001, + "loss": 4.2953, + "loss/crossentropy": 2.2596739530563354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23641519993543625, + "step": 16822 + }, + { + "epoch": 0.33648, + "grad_norm": 1.984375, + "grad_norm_var": 0.012572987874348959, + "learning_rate": 0.0001, + "loss": 4.1689, + "loss/crossentropy": 2.1782987117767334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20562626421451569, + "step": 16824 + }, + { + "epoch": 0.33652, + "grad_norm": 2.28125, + "grad_norm_var": 0.01633275349934896, + "learning_rate": 0.0001, + "loss": 4.0693, + "loss/crossentropy": 1.9579410552978516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20332375168800354, + "step": 16826 + }, + { + "epoch": 0.33656, + "grad_norm": 1.75, + "grad_norm_var": 0.020977528889973958, + "learning_rate": 0.0001, + "loss": 4.0193, + "loss/crossentropy": 1.917382538318634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.188996322453022, + "step": 16828 + }, + { + "epoch": 0.3366, + "grad_norm": 2.125, + "grad_norm_var": 0.02045466105143229, + "learning_rate": 0.0001, + "loss": 3.9885, + "loss/crossentropy": 1.8045424222946167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18587132543325424, + "step": 16830 + }, + { + "epoch": 0.33664, + "grad_norm": 2.046875, + "grad_norm_var": 0.07043355305989583, + "learning_rate": 0.0001, + "loss": 4.0171, + "loss/crossentropy": 2.113844871520996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20383527874946594, + "step": 16832 + }, + { + "epoch": 0.33668, + "grad_norm": 2.0625, + "grad_norm_var": 0.06795654296875, + "learning_rate": 0.0001, + "loss": 4.1255, + "loss/crossentropy": 1.9635592699050903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20447230339050293, + "step": 16834 + }, + { + "epoch": 0.33672, + "grad_norm": 2.046875, + "grad_norm_var": 0.06646219889322917, + "learning_rate": 0.0001, + "loss": 4.1336, + "loss/crossentropy": 1.8918602466583252, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21786046028137207, + "step": 16836 + }, + { + "epoch": 0.33676, + "grad_norm": 1.9453125, + "grad_norm_var": 0.06734390258789062, + "learning_rate": 0.0001, + "loss": 4.3372, + "loss/crossentropy": 2.347909450531006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23047995567321777, + "step": 16838 + }, + { + "epoch": 0.3368, + "grad_norm": 1.90625, + "grad_norm_var": 0.06854248046875, + "learning_rate": 0.0001, + "loss": 4.0242, + "loss/crossentropy": 1.834633469581604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1915532797574997, + "step": 16840 + }, + { + "epoch": 0.33684, + "grad_norm": 1.84375, + "grad_norm_var": 0.06948954264322917, + "learning_rate": 0.0001, + "loss": 3.6871, + "loss/crossentropy": 1.9172112345695496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18901804834604263, + "step": 16842 + }, + { + "epoch": 0.33688, + "grad_norm": 1.953125, + "grad_norm_var": 0.06441141764322916, + "learning_rate": 0.0001, + "loss": 4.3911, + "loss/crossentropy": 2.37927508354187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2328021451830864, + "step": 16844 + }, + { + "epoch": 0.33692, + "grad_norm": 1.921875, + "grad_norm_var": 0.06558024088541667, + "learning_rate": 0.0001, + "loss": 3.8573, + "loss/crossentropy": 1.7887099385261536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17989587783813477, + "step": 16846 + }, + { + "epoch": 0.33696, + "grad_norm": 1.96875, + "grad_norm_var": 0.0051513671875, + "learning_rate": 0.0001, + "loss": 4.0826, + "loss/crossentropy": 2.008473217487335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20677915215492249, + "step": 16848 + }, + { + "epoch": 0.337, + "grad_norm": 1.90625, + "grad_norm_var": 0.005362955729166666, + "learning_rate": 0.0001, + "loss": 4.1751, + "loss/crossentropy": 2.101797103881836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19833290576934814, + "step": 16850 + }, + { + "epoch": 0.33704, + "grad_norm": 2.0, + "grad_norm_var": 0.005779774983723959, + "learning_rate": 0.0001, + "loss": 4.0087, + "loss/crossentropy": 1.9306662678718567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1893247440457344, + "step": 16852 + }, + { + "epoch": 0.33708, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005293782552083333, + "learning_rate": 0.0001, + "loss": 3.9986, + "loss/crossentropy": 2.1301704049110413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1975892335176468, + "step": 16854 + }, + { + "epoch": 0.33712, + "grad_norm": 2.09375, + "grad_norm_var": 0.007173411051432292, + "learning_rate": 0.0001, + "loss": 3.9583, + "loss/crossentropy": 2.020972192287445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19376973807811737, + "step": 16856 + }, + { + "epoch": 0.33716, + "grad_norm": 1.9921875, + "grad_norm_var": 0.006159464518229167, + "learning_rate": 0.0001, + "loss": 3.9632, + "loss/crossentropy": 1.8190429210662842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1629476398229599, + "step": 16858 + }, + { + "epoch": 0.3372, + "grad_norm": 2.09375, + "grad_norm_var": 0.007183583577473959, + "learning_rate": 0.0001, + "loss": 4.2147, + "loss/crossentropy": 2.360092878341675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22874057292938232, + "step": 16860 + }, + { + "epoch": 0.33724, + "grad_norm": 2.171875, + "grad_norm_var": 0.008853912353515625, + "learning_rate": 0.0001, + "loss": 4.232, + "loss/crossentropy": 2.3477792739868164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2215813398361206, + "step": 16862 + }, + { + "epoch": 0.33728, + "grad_norm": 1.8828125, + "grad_norm_var": 0.009919230143229167, + "learning_rate": 0.0001, + "loss": 4.2551, + "loss/crossentropy": 2.474452257156372, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2185184359550476, + "step": 16864 + }, + { + "epoch": 0.33732, + "grad_norm": 2.015625, + "grad_norm_var": 0.010396067301432292, + "learning_rate": 0.0001, + "loss": 3.7782, + "loss/crossentropy": 1.8387269973754883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18137617409229279, + "step": 16866 + }, + { + "epoch": 0.33736, + "grad_norm": 1.953125, + "grad_norm_var": 0.009924062093098958, + "learning_rate": 0.0001, + "loss": 3.9414, + "loss/crossentropy": 1.8537965416908264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19480426609516144, + "step": 16868 + }, + { + "epoch": 0.3374, + "grad_norm": 1.96875, + "grad_norm_var": 0.010057576497395833, + "learning_rate": 0.0001, + "loss": 3.9808, + "loss/crossentropy": 1.9274648427963257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19545693695545197, + "step": 16870 + }, + { + "epoch": 0.33744, + "grad_norm": 2.140625, + "grad_norm_var": 0.010273996988932292, + "learning_rate": 0.0001, + "loss": 4.1097, + "loss/crossentropy": 1.9180024862289429, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2180539146065712, + "step": 16872 + }, + { + "epoch": 0.33748, + "grad_norm": 2.078125, + "grad_norm_var": 0.011016591389973959, + "learning_rate": 0.0001, + "loss": 4.4008, + "loss/crossentropy": 2.185365915298462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2225860357284546, + "step": 16874 + }, + { + "epoch": 0.33752, + "grad_norm": 1.859375, + "grad_norm_var": 0.011523183186848958, + "learning_rate": 0.0001, + "loss": 3.9882, + "loss/crossentropy": 1.9749634861946106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19897626340389252, + "step": 16876 + }, + { + "epoch": 0.33756, + "grad_norm": 2.109375, + "grad_norm_var": 0.010284169514973959, + "learning_rate": 0.0001, + "loss": 4.237, + "loss/crossentropy": 2.1505807638168335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20921844244003296, + "step": 16878 + }, + { + "epoch": 0.3376, + "grad_norm": 1.859375, + "grad_norm_var": 0.01002197265625, + "learning_rate": 0.0001, + "loss": 4.0692, + "loss/crossentropy": 1.7159577012062073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19649318605661392, + "step": 16880 + }, + { + "epoch": 0.33764, + "grad_norm": 1.84375, + "grad_norm_var": 0.010251617431640625, + "learning_rate": 0.0001, + "loss": 4.1615, + "loss/crossentropy": 2.187040388584137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20251552015542984, + "step": 16882 + }, + { + "epoch": 0.33768, + "grad_norm": 2.234375, + "grad_norm_var": 0.014609527587890626, + "learning_rate": 0.0001, + "loss": 4.0587, + "loss/crossentropy": 1.9247627258300781, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17710037529468536, + "step": 16884 + }, + { + "epoch": 0.33772, + "grad_norm": 1.9375, + "grad_norm_var": 0.0141845703125, + "learning_rate": 0.0001, + "loss": 4.001, + "loss/crossentropy": 1.8499276041984558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18950388580560684, + "step": 16886 + }, + { + "epoch": 0.33776, + "grad_norm": 1.8828125, + "grad_norm_var": 0.0138824462890625, + "learning_rate": 0.0001, + "loss": 4.0998, + "loss/crossentropy": 2.0127341747283936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18379998207092285, + "step": 16888 + }, + { + "epoch": 0.3378, + "grad_norm": 1.9375, + "grad_norm_var": 0.013602447509765626, + "learning_rate": 0.0001, + "loss": 3.9096, + "loss/crossentropy": 2.2561115026474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20785757154226303, + "step": 16890 + }, + { + "epoch": 0.33784, + "grad_norm": 1.9375, + "grad_norm_var": 0.0131103515625, + "learning_rate": 0.0001, + "loss": 3.9638, + "loss/crossentropy": 1.9391398429870605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19481037557125092, + "step": 16892 + }, + { + "epoch": 0.33788, + "grad_norm": 1.8984375, + "grad_norm_var": 0.014827219645182292, + "learning_rate": 0.0001, + "loss": 4.2368, + "loss/crossentropy": 2.3129481077194214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2065483033657074, + "step": 16894 + }, + { + "epoch": 0.33792, + "grad_norm": 2.296875, + "grad_norm_var": 0.01984430948893229, + "learning_rate": 0.0001, + "loss": 4.0774, + "loss/crossentropy": 1.8482372760772705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19291523844003677, + "step": 16896 + }, + { + "epoch": 0.33796, + "grad_norm": 2.03125, + "grad_norm_var": 0.020031483968098958, + "learning_rate": 0.0001, + "loss": 4.3876, + "loss/crossentropy": 1.973912537097931, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22137422114610672, + "step": 16898 + }, + { + "epoch": 0.338, + "grad_norm": 1.890625, + "grad_norm_var": 0.01654841105143229, + "learning_rate": 0.0001, + "loss": 3.9599, + "loss/crossentropy": 1.7987132668495178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18385548144578934, + "step": 16900 + }, + { + "epoch": 0.33804, + "grad_norm": 1.890625, + "grad_norm_var": 0.01727879842122396, + "learning_rate": 0.0001, + "loss": 4.3673, + "loss/crossentropy": 2.199060797691345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2016109600663185, + "step": 16902 + }, + { + "epoch": 0.33808, + "grad_norm": 3.046875, + "grad_norm_var": 0.0830718994140625, + "learning_rate": 0.0001, + "loss": 4.27, + "loss/crossentropy": 2.179081439971924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21342043578624725, + "step": 16904 + }, + { + "epoch": 0.33812, + "grad_norm": 1.9375, + "grad_norm_var": 0.0810455322265625, + "learning_rate": 0.0001, + "loss": 3.9861, + "loss/crossentropy": 1.953968107700348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20462830364704132, + "step": 16906 + }, + { + "epoch": 0.33816, + "grad_norm": 2.0, + "grad_norm_var": 0.08123270670572917, + "learning_rate": 0.0001, + "loss": 4.3432, + "loss/crossentropy": 2.3202494382858276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21390480548143387, + "step": 16908 + }, + { + "epoch": 0.3382, + "grad_norm": 2.15625, + "grad_norm_var": 0.07872899373372395, + "learning_rate": 0.0001, + "loss": 4.3085, + "loss/crossentropy": 2.085767686367035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20946675539016724, + "step": 16910 + }, + { + "epoch": 0.33824, + "grad_norm": 2.0625, + "grad_norm_var": 0.07372945149739583, + "learning_rate": 0.0001, + "loss": 4.217, + "loss/crossentropy": 1.789705514907837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2247294932603836, + "step": 16912 + }, + { + "epoch": 0.33828, + "grad_norm": 1.9140625, + "grad_norm_var": 0.07440770467122396, + "learning_rate": 0.0001, + "loss": 4.0409, + "loss/crossentropy": 1.9354140758514404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17958863079547882, + "step": 16914 + }, + { + "epoch": 0.33832, + "grad_norm": 2.015625, + "grad_norm_var": 0.07529296875, + "learning_rate": 0.0001, + "loss": 3.6949, + "loss/crossentropy": 1.878045916557312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1812344640493393, + "step": 16916 + }, + { + "epoch": 0.33836, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0769488016764323, + "learning_rate": 0.0001, + "loss": 3.9392, + "loss/crossentropy": 1.7474132776260376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19444099068641663, + "step": 16918 + }, + { + "epoch": 0.3384, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007185618082682292, + "learning_rate": 0.0001, + "loss": 4.1524, + "loss/crossentropy": 2.0462751984596252, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20077970623970032, + "step": 16920 + }, + { + "epoch": 0.33844, + "grad_norm": 2.03125, + "grad_norm_var": 0.006912994384765625, + "learning_rate": 0.0001, + "loss": 4.205, + "loss/crossentropy": 2.1104917526245117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21799270808696747, + "step": 16922 + }, + { + "epoch": 0.33848, + "grad_norm": 2.0625, + "grad_norm_var": 0.007083892822265625, + "learning_rate": 0.0001, + "loss": 4.1915, + "loss/crossentropy": 2.11525696516037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20585542917251587, + "step": 16924 + }, + { + "epoch": 0.33852, + "grad_norm": 1.96875, + "grad_norm_var": 0.005222320556640625, + "learning_rate": 0.0001, + "loss": 4.3629, + "loss/crossentropy": 2.3451485633850098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22672247886657715, + "step": 16926 + }, + { + "epoch": 0.33856, + "grad_norm": 2.03125, + "grad_norm_var": 0.004906209309895834, + "learning_rate": 0.0001, + "loss": 4.0493, + "loss/crossentropy": 1.8872862458229065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1973365843296051, + "step": 16928 + }, + { + "epoch": 0.3386, + "grad_norm": 2.1875, + "grad_norm_var": 0.00777587890625, + "learning_rate": 0.0001, + "loss": 4.1082, + "loss/crossentropy": 1.9465213418006897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2297648787498474, + "step": 16930 + }, + { + "epoch": 0.33864, + "grad_norm": 1.96875, + "grad_norm_var": 0.0084625244140625, + "learning_rate": 0.0001, + "loss": 3.9397, + "loss/crossentropy": 1.7463516592979431, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.193784698843956, + "step": 16932 + }, + { + "epoch": 0.33868, + "grad_norm": 1.984375, + "grad_norm_var": 0.006738026936848958, + "learning_rate": 0.0001, + "loss": 4.1561, + "loss/crossentropy": 2.2214877605438232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2064037024974823, + "step": 16934 + }, + { + "epoch": 0.33872, + "grad_norm": 1.9765625, + "grad_norm_var": 0.007600657145182292, + "learning_rate": 0.0001, + "loss": 4.0355, + "loss/crossentropy": 1.896401822566986, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21860874444246292, + "step": 16936 + }, + { + "epoch": 0.33876, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007209269205729166, + "learning_rate": 0.0001, + "loss": 4.0701, + "loss/crossentropy": 2.508669376373291, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23076221346855164, + "step": 16938 + }, + { + "epoch": 0.3388, + "grad_norm": 2.09375, + "grad_norm_var": 0.009266916910807292, + "learning_rate": 0.0001, + "loss": 3.8421, + "loss/crossentropy": 1.9412622451782227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1833793818950653, + "step": 16940 + }, + { + "epoch": 0.33884, + "grad_norm": 2.484375, + "grad_norm_var": 0.025406646728515624, + "learning_rate": 0.0001, + "loss": 4.362, + "loss/crossentropy": 2.4757901430130005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23629899322986603, + "step": 16942 + }, + { + "epoch": 0.33888, + "grad_norm": 2.046875, + "grad_norm_var": 0.02580744425455729, + "learning_rate": 0.0001, + "loss": 4.1204, + "loss/crossentropy": 1.788071870803833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1894787922501564, + "step": 16944 + }, + { + "epoch": 0.33892, + "grad_norm": 1.9296875, + "grad_norm_var": 0.023436482747395834, + "learning_rate": 0.0001, + "loss": 4.0626, + "loss/crossentropy": 2.1902449131011963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21310994774103165, + "step": 16946 + }, + { + "epoch": 0.33896, + "grad_norm": 1.96875, + "grad_norm_var": 0.021394856770833335, + "learning_rate": 0.0001, + "loss": 3.9828, + "loss/crossentropy": 2.0444132685661316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1755325198173523, + "step": 16948 + }, + { + "epoch": 0.339, + "grad_norm": 2.09375, + "grad_norm_var": 0.02200902303059896, + "learning_rate": 0.0001, + "loss": 4.1609, + "loss/crossentropy": 1.9242961406707764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18970132619142532, + "step": 16950 + }, + { + "epoch": 0.33904, + "grad_norm": 1.90625, + "grad_norm_var": 0.0216552734375, + "learning_rate": 0.0001, + "loss": 3.9819, + "loss/crossentropy": 1.7046860456466675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19807563722133636, + "step": 16952 + }, + { + "epoch": 0.33908, + "grad_norm": 2.0625, + "grad_norm_var": 0.02184015909830729, + "learning_rate": 0.0001, + "loss": 3.8975, + "loss/crossentropy": 1.8960286974906921, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17571083456277847, + "step": 16954 + }, + { + "epoch": 0.33912, + "grad_norm": 1.9296875, + "grad_norm_var": 0.019481404622395834, + "learning_rate": 0.0001, + "loss": 3.922, + "loss/crossentropy": 1.9351304769515991, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1918834000825882, + "step": 16956 + }, + { + "epoch": 0.33916, + "grad_norm": 2.453125, + "grad_norm_var": 0.017728424072265624, + "learning_rate": 0.0001, + "loss": 4.167, + "loss/crossentropy": 1.974304735660553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20828718692064285, + "step": 16958 + }, + { + "epoch": 0.3392, + "grad_norm": 1.921875, + "grad_norm_var": 0.017867024739583334, + "learning_rate": 0.0001, + "loss": 3.8506, + "loss/crossentropy": 2.3836190700531006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2074722871184349, + "step": 16960 + }, + { + "epoch": 0.33924, + "grad_norm": 1.875, + "grad_norm_var": 0.018123372395833334, + "learning_rate": 0.0001, + "loss": 4.0827, + "loss/crossentropy": 2.1663198471069336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20097027719020844, + "step": 16962 + }, + { + "epoch": 0.33928, + "grad_norm": 2.03125, + "grad_norm_var": 0.01810277303059896, + "learning_rate": 0.0001, + "loss": 4.2466, + "loss/crossentropy": 2.275226354598999, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22942040860652924, + "step": 16964 + }, + { + "epoch": 0.33932, + "grad_norm": 1.9375, + "grad_norm_var": 0.017796834309895832, + "learning_rate": 0.0001, + "loss": 3.9802, + "loss/crossentropy": 1.7741501331329346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1822734773159027, + "step": 16966 + }, + { + "epoch": 0.33936, + "grad_norm": 1.984375, + "grad_norm_var": 0.017215983072916666, + "learning_rate": 0.0001, + "loss": 3.9016, + "loss/crossentropy": 1.791014850139618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2008860930800438, + "step": 16968 + }, + { + "epoch": 0.3394, + "grad_norm": 2.046875, + "grad_norm_var": 0.016888173421223958, + "learning_rate": 0.0001, + "loss": 4.1941, + "loss/crossentropy": 1.9239555597305298, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.192842036485672, + "step": 16970 + }, + { + "epoch": 0.33944, + "grad_norm": 2.09375, + "grad_norm_var": 0.016788482666015625, + "learning_rate": 0.0001, + "loss": 4.108, + "loss/crossentropy": 2.1543636322021484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20888526737689972, + "step": 16972 + }, + { + "epoch": 0.33948, + "grad_norm": 1.84375, + "grad_norm_var": 0.005012003580729166, + "learning_rate": 0.0001, + "loss": 3.975, + "loss/crossentropy": 2.2322418093681335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19024743884801865, + "step": 16974 + }, + { + "epoch": 0.33952, + "grad_norm": 2.109375, + "grad_norm_var": 0.012422434488932292, + "learning_rate": 0.0001, + "loss": 4.3208, + "loss/crossentropy": 1.9934163093566895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26193471997976303, + "step": 16976 + }, + { + "epoch": 0.33956, + "grad_norm": 2.03125, + "grad_norm_var": 0.013244374593098959, + "learning_rate": 0.0001, + "loss": 4.2753, + "loss/crossentropy": 2.340088725090027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2110593616962433, + "step": 16978 + }, + { + "epoch": 0.3396, + "grad_norm": 1.890625, + "grad_norm_var": 0.03021214803059896, + "learning_rate": 0.0001, + "loss": 4.2047, + "loss/crossentropy": 2.057206869125366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2071269005537033, + "step": 16980 + }, + { + "epoch": 0.33964, + "grad_norm": 1.953125, + "grad_norm_var": 0.030210113525390624, + "learning_rate": 0.0001, + "loss": 4.0262, + "loss/crossentropy": 2.002032458782196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19903026521205902, + "step": 16982 + }, + { + "epoch": 0.33968, + "grad_norm": 2.0, + "grad_norm_var": 0.030804189046223958, + "learning_rate": 0.0001, + "loss": 4.246, + "loss/crossentropy": 2.01567679643631, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20198361575603485, + "step": 16984 + }, + { + "epoch": 0.33972, + "grad_norm": 1.8515625, + "grad_norm_var": 0.032814280192057295, + "learning_rate": 0.0001, + "loss": 4.0606, + "loss/crossentropy": 1.661778211593628, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16601823270320892, + "step": 16986 + }, + { + "epoch": 0.33976, + "grad_norm": 2.015625, + "grad_norm_var": 0.03467203776041667, + "learning_rate": 0.0001, + "loss": 3.9411, + "loss/crossentropy": 2.284485101699829, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20108629018068314, + "step": 16988 + }, + { + "epoch": 0.3398, + "grad_norm": 2.03125, + "grad_norm_var": 0.03200861612955729, + "learning_rate": 0.0001, + "loss": 4.1066, + "loss/crossentropy": 2.1860578656196594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22195414453744888, + "step": 16990 + }, + { + "epoch": 0.33984, + "grad_norm": 1.984375, + "grad_norm_var": 0.02719904581705729, + "learning_rate": 0.0001, + "loss": 4.0212, + "loss/crossentropy": 2.032013416290283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19930274784564972, + "step": 16992 + }, + { + "epoch": 0.33988, + "grad_norm": 2.0625, + "grad_norm_var": 0.1898577372233073, + "learning_rate": 0.0001, + "loss": 4.2847, + "loss/crossentropy": 2.227185010910034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19806954264640808, + "step": 16994 + }, + { + "epoch": 0.33992, + "grad_norm": 1.9609375, + "grad_norm_var": 0.17535807291666666, + "learning_rate": 0.0001, + "loss": 4.1526, + "loss/crossentropy": 2.222020983695984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2354518324136734, + "step": 16996 + }, + { + "epoch": 0.33996, + "grad_norm": 2.109375, + "grad_norm_var": 0.1755767822265625, + "learning_rate": 0.0001, + "loss": 4.1766, + "loss/crossentropy": 2.336432456970215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19555255770683289, + "step": 16998 + }, + { + "epoch": 0.34, + "grad_norm": 1.96875, + "grad_norm_var": 0.1748443603515625, + "learning_rate": 0.0001, + "loss": 4.3312, + "loss/crossentropy": 2.17374986410141, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21119412034749985, + "step": 17000 + }, + { + "epoch": 0.34004, + "grad_norm": 2.203125, + "grad_norm_var": 0.17195536295572916, + "learning_rate": 0.0001, + "loss": 4.1445, + "loss/crossentropy": 2.135382056236267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2263253927230835, + "step": 17002 + }, + { + "epoch": 0.34008, + "grad_norm": 2.09375, + "grad_norm_var": 0.20149917602539064, + "learning_rate": 0.0001, + "loss": 3.811, + "loss/crossentropy": 1.9646037220954895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18890808522701263, + "step": 17004 + }, + { + "epoch": 0.34012, + "grad_norm": 1.921875, + "grad_norm_var": 0.20098368326822916, + "learning_rate": 0.0001, + "loss": 4.0889, + "loss/crossentropy": 1.8009583353996277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1882813572883606, + "step": 17006 + }, + { + "epoch": 0.34016, + "grad_norm": 3.109375, + "grad_norm_var": 0.24433492024739584, + "learning_rate": 0.0001, + "loss": 4.4921, + "loss/crossentropy": 2.0362982153892517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.208044171333313, + "step": 17008 + }, + { + "epoch": 0.3402, + "grad_norm": 1.9765625, + "grad_norm_var": 0.11477432250976563, + "learning_rate": 0.0001, + "loss": 4.1781, + "loss/crossentropy": 2.2055057287216187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21141493320465088, + "step": 17010 + }, + { + "epoch": 0.34024, + "grad_norm": 2.0, + "grad_norm_var": 0.11963475545247396, + "learning_rate": 0.0001, + "loss": 3.772, + "loss/crossentropy": 1.386088252067566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16025834530591965, + "step": 17012 + }, + { + "epoch": 0.34028, + "grad_norm": 2.1875, + "grad_norm_var": 0.11599299112955729, + "learning_rate": 0.0001, + "loss": 4.2389, + "loss/crossentropy": 2.131182312965393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20460083335638046, + "step": 17014 + }, + { + "epoch": 0.34032, + "grad_norm": 2.015625, + "grad_norm_var": 0.11592992146809895, + "learning_rate": 0.0001, + "loss": 4.2681, + "loss/crossentropy": 2.2682281732559204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22915159910917282, + "step": 17016 + }, + { + "epoch": 0.34036, + "grad_norm": 1.96875, + "grad_norm_var": 0.12011693318684896, + "learning_rate": 0.0001, + "loss": 4.0521, + "loss/crossentropy": 2.0855059027671814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19398894160985947, + "step": 17018 + }, + { + "epoch": 0.3404, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0842437744140625, + "learning_rate": 0.0001, + "loss": 4.0421, + "loss/crossentropy": 1.895095944404602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2172701731324196, + "step": 17020 + }, + { + "epoch": 0.34044, + "grad_norm": 2.0, + "grad_norm_var": 0.08455403645833333, + "learning_rate": 0.0001, + "loss": 4.0814, + "loss/crossentropy": 2.10223788022995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2144509255886078, + "step": 17022 + }, + { + "epoch": 0.34048, + "grad_norm": 2.03125, + "grad_norm_var": 0.014241282145182292, + "learning_rate": 0.0001, + "loss": 4.1211, + "loss/crossentropy": 2.1131081581115723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21176592260599136, + "step": 17024 + }, + { + "epoch": 0.34052, + "grad_norm": 2.0, + "grad_norm_var": 0.009991200764973958, + "learning_rate": 0.0001, + "loss": 4.1581, + "loss/crossentropy": 2.1125452518463135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1931193768978119, + "step": 17026 + }, + { + "epoch": 0.34056, + "grad_norm": 1.8046875, + "grad_norm_var": 0.017775217692057293, + "learning_rate": 0.0001, + "loss": 4.2624, + "loss/crossentropy": 2.213385283946991, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2576863020658493, + "step": 17028 + }, + { + "epoch": 0.3406, + "grad_norm": 2.171875, + "grad_norm_var": 0.017773183186848958, + "learning_rate": 0.0001, + "loss": 4.408, + "loss/crossentropy": 2.0793206095695496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20338336378335953, + "step": 17030 + }, + { + "epoch": 0.34064, + "grad_norm": 2.078125, + "grad_norm_var": 0.01807225545247396, + "learning_rate": 0.0001, + "loss": 4.3102, + "loss/crossentropy": 1.8870239853858948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18544750660657883, + "step": 17032 + }, + { + "epoch": 0.34068, + "grad_norm": 1.9609375, + "grad_norm_var": 0.017254384358723958, + "learning_rate": 0.0001, + "loss": 4.1538, + "loss/crossentropy": 1.9893989562988281, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19679997861385345, + "step": 17034 + }, + { + "epoch": 0.34072, + "grad_norm": 1.9609375, + "grad_norm_var": 0.01845270792643229, + "learning_rate": 0.0001, + "loss": 3.9447, + "loss/crossentropy": 2.0401668548583984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19242482632398605, + "step": 17036 + }, + { + "epoch": 0.34076, + "grad_norm": 2.109375, + "grad_norm_var": 0.015472157796223959, + "learning_rate": 0.0001, + "loss": 4.0509, + "loss/crossentropy": 2.0786415934562683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21781006455421448, + "step": 17038 + }, + { + "epoch": 0.3408, + "grad_norm": 1.9296875, + "grad_norm_var": 0.015533192952473959, + "learning_rate": 0.0001, + "loss": 3.9472, + "loss/crossentropy": 2.143572211265564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20608216524124146, + "step": 17040 + }, + { + "epoch": 0.34084, + "grad_norm": 2.0625, + "grad_norm_var": 0.0154693603515625, + "learning_rate": 0.0001, + "loss": 4.3305, + "loss/crossentropy": 2.0578572750091553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22006108611822128, + "step": 17042 + }, + { + "epoch": 0.34088, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0061920166015625, + "learning_rate": 0.0001, + "loss": 4.1295, + "loss/crossentropy": 2.1016032099723816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2059948667883873, + "step": 17044 + }, + { + "epoch": 0.34092, + "grad_norm": 1.9609375, + "grad_norm_var": 0.00438232421875, + "learning_rate": 0.0001, + "loss": 3.9293, + "loss/crossentropy": 1.814449965953827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1946963220834732, + "step": 17046 + }, + { + "epoch": 0.34096, + "grad_norm": 1.9453125, + "grad_norm_var": 0.004233551025390625, + "learning_rate": 0.0001, + "loss": 4.0696, + "loss/crossentropy": 2.2421000599861145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2114911824464798, + "step": 17048 + }, + { + "epoch": 0.341, + "grad_norm": 2.125, + "grad_norm_var": 0.0070231119791666664, + "learning_rate": 0.0001, + "loss": 3.984, + "loss/crossentropy": 2.1200218200683594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.212583489716053, + "step": 17050 + }, + { + "epoch": 0.34104, + "grad_norm": 1.921875, + "grad_norm_var": 0.007441965738932291, + "learning_rate": 0.0001, + "loss": 3.8656, + "loss/crossentropy": 1.9486305713653564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18631484359502792, + "step": 17052 + }, + { + "epoch": 0.34108, + "grad_norm": 1.9921875, + "grad_norm_var": 0.007721964518229167, + "learning_rate": 0.0001, + "loss": 4.3888, + "loss/crossentropy": 2.4838292598724365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23271672427654266, + "step": 17054 + }, + { + "epoch": 0.34112, + "grad_norm": 1.96875, + "grad_norm_var": 0.008137766520182292, + "learning_rate": 0.0001, + "loss": 3.9099, + "loss/crossentropy": 1.8606489896774292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1955883800983429, + "step": 17056 + }, + { + "epoch": 0.34116, + "grad_norm": 2.15625, + "grad_norm_var": 0.013199615478515624, + "learning_rate": 0.0001, + "loss": 4.2847, + "loss/crossentropy": 2.1702204942703247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21619878709316254, + "step": 17058 + }, + { + "epoch": 0.3412, + "grad_norm": 2.015625, + "grad_norm_var": 0.013158162434895834, + "learning_rate": 0.0001, + "loss": 4.1987, + "loss/crossentropy": 2.1011393070220947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2120063453912735, + "step": 17060 + }, + { + "epoch": 0.34124, + "grad_norm": 1.953125, + "grad_norm_var": 0.013142903645833334, + "learning_rate": 0.0001, + "loss": 4.1704, + "loss/crossentropy": 2.267427682876587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20903929322957993, + "step": 17062 + }, + { + "epoch": 0.34128, + "grad_norm": 1.984375, + "grad_norm_var": 0.01620457967122396, + "learning_rate": 0.0001, + "loss": 4.3312, + "loss/crossentropy": 2.125304937362671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18540818989276886, + "step": 17064 + }, + { + "epoch": 0.34132, + "grad_norm": 1.90625, + "grad_norm_var": 0.013925933837890625, + "learning_rate": 0.0001, + "loss": 3.9552, + "loss/crossentropy": 1.8741024136543274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19243815541267395, + "step": 17066 + }, + { + "epoch": 0.34136, + "grad_norm": 2.46875, + "grad_norm_var": 0.023451487223307293, + "learning_rate": 0.0001, + "loss": 4.0252, + "loss/crossentropy": 2.12781822681427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23974300920963287, + "step": 17068 + }, + { + "epoch": 0.3414, + "grad_norm": 2.15625, + "grad_norm_var": 0.02448298136393229, + "learning_rate": 0.0001, + "loss": 4.2728, + "loss/crossentropy": 2.0786141753196716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22311320155858994, + "step": 17070 + }, + { + "epoch": 0.34144, + "grad_norm": 1.9140625, + "grad_norm_var": 0.02408421834309896, + "learning_rate": 0.0001, + "loss": 4.1686, + "loss/crossentropy": 2.1569892168045044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20391946285963058, + "step": 17072 + }, + { + "epoch": 0.34148, + "grad_norm": 1.90625, + "grad_norm_var": 0.024607086181640626, + "learning_rate": 0.0001, + "loss": 3.8035, + "loss/crossentropy": 1.8860353231430054, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1937229335308075, + "step": 17074 + }, + { + "epoch": 0.34152, + "grad_norm": 2.0, + "grad_norm_var": 0.02448094685872396, + "learning_rate": 0.0001, + "loss": 4.0993, + "loss/crossentropy": 2.25645911693573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2088249772787094, + "step": 17076 + }, + { + "epoch": 0.34156, + "grad_norm": 2.328125, + "grad_norm_var": 0.02972997029622396, + "learning_rate": 0.0001, + "loss": 4.3486, + "loss/crossentropy": 2.379481792449951, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2341819554567337, + "step": 17078 + }, + { + "epoch": 0.3416, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0273345947265625, + "learning_rate": 0.0001, + "loss": 4.0841, + "loss/crossentropy": 2.1564733386039734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2162168025970459, + "step": 17080 + }, + { + "epoch": 0.34164, + "grad_norm": 3.015625, + "grad_norm_var": 0.0860260009765625, + "learning_rate": 0.0001, + "loss": 4.2526, + "loss/crossentropy": 1.8906886577606201, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18614360690116882, + "step": 17082 + }, + { + "epoch": 0.34168, + "grad_norm": 2.03125, + "grad_norm_var": 0.07732747395833334, + "learning_rate": 0.0001, + "loss": 4.0135, + "loss/crossentropy": 2.093637704849243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19867172092199326, + "step": 17084 + }, + { + "epoch": 0.34172, + "grad_norm": 2.03125, + "grad_norm_var": 0.0760210673014323, + "learning_rate": 0.0001, + "loss": 4.2364, + "loss/crossentropy": 2.2579731941223145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2153376340866089, + "step": 17086 + }, + { + "epoch": 0.34176, + "grad_norm": 2.15625, + "grad_norm_var": 0.07312393188476562, + "learning_rate": 0.0001, + "loss": 4.3275, + "loss/crossentropy": 2.250504732131958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2759099751710892, + "step": 17088 + }, + { + "epoch": 0.3418, + "grad_norm": 1.921875, + "grad_norm_var": 0.07190653483072916, + "learning_rate": 0.0001, + "loss": 3.8977, + "loss/crossentropy": 1.6062138676643372, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15309542417526245, + "step": 17090 + }, + { + "epoch": 0.34184, + "grad_norm": 1.8828125, + "grad_norm_var": 0.07587865193684896, + "learning_rate": 0.0001, + "loss": 4.2199, + "loss/crossentropy": 2.227494239807129, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20563329756259918, + "step": 17092 + }, + { + "epoch": 0.34188, + "grad_norm": 1.921875, + "grad_norm_var": 0.07314631144205729, + "learning_rate": 0.0001, + "loss": 4.1095, + "loss/crossentropy": 2.1047890186309814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19775691628456116, + "step": 17094 + }, + { + "epoch": 0.34192, + "grad_norm": 2.015625, + "grad_norm_var": 0.07301813761393229, + "learning_rate": 0.0001, + "loss": 4.0508, + "loss/crossentropy": 2.044450581073761, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19867002964019775, + "step": 17096 + }, + { + "epoch": 0.34196, + "grad_norm": 1.953125, + "grad_norm_var": 0.009511057535807292, + "learning_rate": 0.0001, + "loss": 4.2319, + "loss/crossentropy": 1.9139947891235352, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20143548399209976, + "step": 17098 + }, + { + "epoch": 0.342, + "grad_norm": 1.9375, + "grad_norm_var": 0.008162180582682291, + "learning_rate": 0.0001, + "loss": 3.7159, + "loss/crossentropy": 1.6752634048461914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17691873013973236, + "step": 17100 + }, + { + "epoch": 0.34204, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0156402587890625, + "learning_rate": 0.0001, + "loss": 4.2159, + "loss/crossentropy": 2.277552843093872, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2049017697572708, + "step": 17102 + }, + { + "epoch": 0.34208, + "grad_norm": 1.9765625, + "grad_norm_var": 0.015608469645182291, + "learning_rate": 0.0001, + "loss": 4.0663, + "loss/crossentropy": 2.458711266517639, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21726765483617783, + "step": 17104 + }, + { + "epoch": 0.34212, + "grad_norm": 2.171875, + "grad_norm_var": 0.01627197265625, + "learning_rate": 0.0001, + "loss": 4.0384, + "loss/crossentropy": 1.9302194714546204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19446631520986557, + "step": 17106 + }, + { + "epoch": 0.34216, + "grad_norm": 1.890625, + "grad_norm_var": 0.01622314453125, + "learning_rate": 0.0001, + "loss": 3.8935, + "loss/crossentropy": 2.021213114261627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19394385814666748, + "step": 17108 + }, + { + "epoch": 0.3422, + "grad_norm": 2.0, + "grad_norm_var": 0.01689453125, + "learning_rate": 0.0001, + "loss": 4.4094, + "loss/crossentropy": 2.4856090545654297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2156004160642624, + "step": 17110 + }, + { + "epoch": 0.34224, + "grad_norm": 2.09375, + "grad_norm_var": 0.016869862874348957, + "learning_rate": 0.0001, + "loss": 4.4135, + "loss/crossentropy": 2.410070061683655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21203476190567017, + "step": 17112 + }, + { + "epoch": 0.34228, + "grad_norm": 1.9765625, + "grad_norm_var": 0.019136555989583335, + "learning_rate": 0.0001, + "loss": 4.0484, + "loss/crossentropy": 2.026827871799469, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20053605735301971, + "step": 17114 + }, + { + "epoch": 0.34232, + "grad_norm": 2.09375, + "grad_norm_var": 0.016283162434895835, + "learning_rate": 0.0001, + "loss": 4.2319, + "loss/crossentropy": 2.1833966970443726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2193789780139923, + "step": 17116 + }, + { + "epoch": 0.34236, + "grad_norm": 2.109375, + "grad_norm_var": 0.010798899332682292, + "learning_rate": 0.0001, + "loss": 4.0921, + "loss/crossentropy": 2.137472152709961, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21218669414520264, + "step": 17118 + }, + { + "epoch": 0.3424, + "grad_norm": 1.96875, + "grad_norm_var": 0.013741048177083333, + "learning_rate": 0.0001, + "loss": 4.2064, + "loss/crossentropy": 2.0607420206069946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2138473093509674, + "step": 17120 + }, + { + "epoch": 0.34244, + "grad_norm": 1.984375, + "grad_norm_var": 0.0123687744140625, + "learning_rate": 0.0001, + "loss": 4.1787, + "loss/crossentropy": 1.9109066724777222, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20130306482315063, + "step": 17122 + }, + { + "epoch": 0.34248, + "grad_norm": 1.9453125, + "grad_norm_var": 0.010944620768229166, + "learning_rate": 0.0001, + "loss": 3.7531, + "loss/crossentropy": 2.1049917936325073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20878130197525024, + "step": 17124 + }, + { + "epoch": 0.34252, + "grad_norm": 2.03125, + "grad_norm_var": 0.010008748372395833, + "learning_rate": 0.0001, + "loss": 4.0951, + "loss/crossentropy": 1.9087567925453186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18959704041481018, + "step": 17126 + }, + { + "epoch": 0.34256, + "grad_norm": 2.125, + "grad_norm_var": 0.010033162434895833, + "learning_rate": 0.0001, + "loss": 4.3854, + "loss/crossentropy": 2.491095781326294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21739569306373596, + "step": 17128 + }, + { + "epoch": 0.3426, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0081207275390625, + "learning_rate": 0.0001, + "loss": 4.0227, + "loss/crossentropy": 2.045976400375366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18868304044008255, + "step": 17130 + }, + { + "epoch": 0.34264, + "grad_norm": 2.109375, + "grad_norm_var": 0.009318033854166666, + "learning_rate": 0.0001, + "loss": 4.1221, + "loss/crossentropy": 2.021378517150879, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1955355852842331, + "step": 17132 + }, + { + "epoch": 0.34268, + "grad_norm": 1.9140625, + "grad_norm_var": 0.010247548421223959, + "learning_rate": 0.0001, + "loss": 4.0244, + "loss/crossentropy": 2.052547872066498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18654850870370865, + "step": 17134 + }, + { + "epoch": 0.34272, + "grad_norm": 2.09375, + "grad_norm_var": 0.008886464436848958, + "learning_rate": 0.0001, + "loss": 4.2536, + "loss/crossentropy": 2.1709023118019104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21335267275571823, + "step": 17136 + }, + { + "epoch": 0.34276, + "grad_norm": 1.921875, + "grad_norm_var": 0.010477701822916666, + "learning_rate": 0.0001, + "loss": 4.1682, + "loss/crossentropy": 2.310550093650818, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18719495832920074, + "step": 17138 + }, + { + "epoch": 0.3428, + "grad_norm": 2.046875, + "grad_norm_var": 0.011262003580729167, + "learning_rate": 0.0001, + "loss": 4.0145, + "loss/crossentropy": 1.9357584714889526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1914527416229248, + "step": 17140 + }, + { + "epoch": 0.34284, + "grad_norm": 1.9140625, + "grad_norm_var": 0.012684885660807292, + "learning_rate": 0.0001, + "loss": 3.9673, + "loss/crossentropy": 1.7561541199684143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1711764633655548, + "step": 17142 + }, + { + "epoch": 0.34288, + "grad_norm": 2.515625, + "grad_norm_var": 0.7546994527180989, + "learning_rate": 0.0001, + "loss": 4.562, + "loss/crossentropy": 2.4081480503082275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2405518889427185, + "step": 17144 + }, + { + "epoch": 0.34292, + "grad_norm": 2.34375, + "grad_norm_var": 0.7456451416015625, + "learning_rate": 0.0001, + "loss": 4.4023, + "loss/crossentropy": 2.44531512260437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23487353324890137, + "step": 17146 + }, + { + "epoch": 0.34296, + "grad_norm": 1.9296875, + "grad_norm_var": 0.7445696512858073, + "learning_rate": 0.0001, + "loss": 4.0845, + "loss/crossentropy": 1.9712305068969727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20045534521341324, + "step": 17148 + }, + { + "epoch": 0.343, + "grad_norm": 1.890625, + "grad_norm_var": 0.7468706766764323, + "learning_rate": 0.0001, + "loss": 3.8479, + "loss/crossentropy": 1.9718485474586487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18550659716129303, + "step": 17150 + }, + { + "epoch": 0.34304, + "grad_norm": 1.9453125, + "grad_norm_var": 0.7527414957682291, + "learning_rate": 0.0001, + "loss": 4.2708, + "loss/crossentropy": 2.1901930570602417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2082839384675026, + "step": 17152 + }, + { + "epoch": 0.34308, + "grad_norm": 1.953125, + "grad_norm_var": 0.7463905334472656, + "learning_rate": 0.0001, + "loss": 4.2104, + "loss/crossentropy": 2.098864793777466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19527551531791687, + "step": 17154 + }, + { + "epoch": 0.34312, + "grad_norm": 2.0, + "grad_norm_var": 0.7467437744140625, + "learning_rate": 0.0001, + "loss": 3.9873, + "loss/crossentropy": 1.7448402643203735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1862412989139557, + "step": 17156 + }, + { + "epoch": 0.34316, + "grad_norm": 2.109375, + "grad_norm_var": 0.7429603576660156, + "learning_rate": 0.0001, + "loss": 4.1016, + "loss/crossentropy": 1.9973859190940857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19645331799983978, + "step": 17158 + }, + { + "epoch": 0.3432, + "grad_norm": 1.8515625, + "grad_norm_var": 0.016874186197916665, + "learning_rate": 0.0001, + "loss": 4.2022, + "loss/crossentropy": 2.2570079565048218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20290197432041168, + "step": 17160 + }, + { + "epoch": 0.34324, + "grad_norm": 2.15625, + "grad_norm_var": 0.008642323811848958, + "learning_rate": 0.0001, + "loss": 4.2304, + "loss/crossentropy": 1.7943695187568665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1924503892660141, + "step": 17162 + }, + { + "epoch": 0.34328, + "grad_norm": 2.0, + "grad_norm_var": 0.008225250244140624, + "learning_rate": 0.0001, + "loss": 4.2142, + "loss/crossentropy": 2.1875799894332886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1968172788619995, + "step": 17164 + }, + { + "epoch": 0.34332, + "grad_norm": 2.0625, + "grad_norm_var": 0.007477823893229167, + "learning_rate": 0.0001, + "loss": 4.1844, + "loss/crossentropy": 2.242374360561371, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21981361508369446, + "step": 17166 + }, + { + "epoch": 0.34336, + "grad_norm": 2.046875, + "grad_norm_var": 0.006754302978515625, + "learning_rate": 0.0001, + "loss": 4.0406, + "loss/crossentropy": 1.8542492985725403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19412916153669357, + "step": 17168 + }, + { + "epoch": 0.3434, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0067860921223958336, + "learning_rate": 0.0001, + "loss": 3.8804, + "loss/crossentropy": 1.9377904534339905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20015903562307358, + "step": 17170 + }, + { + "epoch": 0.34344, + "grad_norm": 1.9765625, + "grad_norm_var": 0.006319173177083333, + "learning_rate": 0.0001, + "loss": 4.1691, + "loss/crossentropy": 2.2250781655311584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1943892240524292, + "step": 17172 + }, + { + "epoch": 0.34348, + "grad_norm": 1.984375, + "grad_norm_var": 0.0054094950358072914, + "learning_rate": 0.0001, + "loss": 4.0274, + "loss/crossentropy": 2.0463303923606873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1931215077638626, + "step": 17174 + }, + { + "epoch": 0.34352, + "grad_norm": 2.078125, + "grad_norm_var": 0.004984537760416667, + "learning_rate": 0.0001, + "loss": 3.826, + "loss/crossentropy": 2.0576277375221252, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24007725715637207, + "step": 17176 + }, + { + "epoch": 0.34356, + "grad_norm": 1.859375, + "grad_norm_var": 0.003999582926432292, + "learning_rate": 0.0001, + "loss": 3.9865, + "loss/crossentropy": 1.8423307538032532, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17171106487512589, + "step": 17178 + }, + { + "epoch": 0.3436, + "grad_norm": 1.984375, + "grad_norm_var": 0.0053375244140625, + "learning_rate": 0.0001, + "loss": 4.3089, + "loss/crossentropy": 2.05659818649292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20058965682983398, + "step": 17180 + }, + { + "epoch": 0.34364, + "grad_norm": 2.046875, + "grad_norm_var": 0.0055620829264322914, + "learning_rate": 0.0001, + "loss": 3.8965, + "loss/crossentropy": 1.926946997642517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1770440638065338, + "step": 17182 + }, + { + "epoch": 0.34368, + "grad_norm": 2.015625, + "grad_norm_var": 0.005098215738932292, + "learning_rate": 0.0001, + "loss": 3.9781, + "loss/crossentropy": 1.8181686401367188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19718249142169952, + "step": 17184 + }, + { + "epoch": 0.34372, + "grad_norm": 1.8203125, + "grad_norm_var": 0.006624094645182292, + "learning_rate": 0.0001, + "loss": 4.0732, + "loss/crossentropy": 2.021128237247467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1968582272529602, + "step": 17186 + }, + { + "epoch": 0.34376, + "grad_norm": 2.109375, + "grad_norm_var": 0.008365631103515625, + "learning_rate": 0.0001, + "loss": 4.2861, + "loss/crossentropy": 2.04776668548584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22718969732522964, + "step": 17188 + }, + { + "epoch": 0.3438, + "grad_norm": 1.890625, + "grad_norm_var": 0.008957672119140624, + "learning_rate": 0.0001, + "loss": 4.2196, + "loss/crossentropy": 2.2225213050842285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20369096100330353, + "step": 17190 + }, + { + "epoch": 0.34384, + "grad_norm": 1.9609375, + "grad_norm_var": 0.008089192708333333, + "learning_rate": 0.0001, + "loss": 4.1137, + "loss/crossentropy": 1.8544179201126099, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20437808334827423, + "step": 17192 + }, + { + "epoch": 0.34388, + "grad_norm": 1.9921875, + "grad_norm_var": 0.006730143229166667, + "learning_rate": 0.0001, + "loss": 4.387, + "loss/crossentropy": 2.1270517110824585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1997273936867714, + "step": 17194 + }, + { + "epoch": 0.34392, + "grad_norm": 1.984375, + "grad_norm_var": 0.0060791015625, + "learning_rate": 0.0001, + "loss": 3.8062, + "loss/crossentropy": 1.8699182271957397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19382557272911072, + "step": 17196 + }, + { + "epoch": 0.34396, + "grad_norm": 2.046875, + "grad_norm_var": 0.006941731770833333, + "learning_rate": 0.0001, + "loss": 3.9864, + "loss/crossentropy": 1.8734498023986816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18053626269102097, + "step": 17198 + }, + { + "epoch": 0.344, + "grad_norm": 1.890625, + "grad_norm_var": 0.008097330729166666, + "learning_rate": 0.0001, + "loss": 3.9234, + "loss/crossentropy": 1.9386133551597595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1851777583360672, + "step": 17200 + }, + { + "epoch": 0.34404, + "grad_norm": 2.015625, + "grad_norm_var": 0.0069163004557291664, + "learning_rate": 0.0001, + "loss": 3.9619, + "loss/crossentropy": 2.1074278354644775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22176695615053177, + "step": 17202 + }, + { + "epoch": 0.34408, + "grad_norm": 1.9296875, + "grad_norm_var": 0.005895741780598958, + "learning_rate": 0.0001, + "loss": 4.1231, + "loss/crossentropy": 1.826387107372284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19368141889572144, + "step": 17204 + }, + { + "epoch": 0.34412, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005680084228515625, + "learning_rate": 0.0001, + "loss": 4.1367, + "loss/crossentropy": 1.9109328389167786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2023227959871292, + "step": 17206 + }, + { + "epoch": 0.34416, + "grad_norm": 2.09375, + "grad_norm_var": 0.006517537434895833, + "learning_rate": 0.0001, + "loss": 4.1495, + "loss/crossentropy": 1.8752732872962952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17867545038461685, + "step": 17208 + }, + { + "epoch": 0.3442, + "grad_norm": 1.828125, + "grad_norm_var": 0.00972900390625, + "learning_rate": 0.0001, + "loss": 3.9998, + "loss/crossentropy": 1.95436429977417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19947312772274017, + "step": 17210 + }, + { + "epoch": 0.34424, + "grad_norm": 1.984375, + "grad_norm_var": 0.00947265625, + "learning_rate": 0.0001, + "loss": 4.0074, + "loss/crossentropy": 1.9237809777259827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2055458500981331, + "step": 17212 + }, + { + "epoch": 0.34428, + "grad_norm": 1.9375, + "grad_norm_var": 0.007995351155598959, + "learning_rate": 0.0001, + "loss": 4.1351, + "loss/crossentropy": 2.4497138261795044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23067011684179306, + "step": 17214 + }, + { + "epoch": 0.34432, + "grad_norm": 1.859375, + "grad_norm_var": 0.007791900634765625, + "learning_rate": 0.0001, + "loss": 3.9144, + "loss/crossentropy": 2.33384370803833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22732949256896973, + "step": 17216 + }, + { + "epoch": 0.34436, + "grad_norm": 2.0, + "grad_norm_var": 0.0075439453125, + "learning_rate": 0.0001, + "loss": 3.9414, + "loss/crossentropy": 2.022156059741974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2063901051878929, + "step": 17218 + }, + { + "epoch": 0.3444, + "grad_norm": 1.90625, + "grad_norm_var": 0.007614898681640625, + "learning_rate": 0.0001, + "loss": 4.0642, + "loss/crossentropy": 2.183963179588318, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21098726987838745, + "step": 17220 + }, + { + "epoch": 0.34444, + "grad_norm": 6.875, + "grad_norm_var": 1.501227823893229, + "learning_rate": 0.0001, + "loss": 4.3139, + "loss/crossentropy": 2.256633758544922, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21449647843837738, + "step": 17222 + }, + { + "epoch": 0.34448, + "grad_norm": 14.3125, + "grad_norm_var": 10.478612263997396, + "learning_rate": 0.0001, + "loss": 3.838, + "loss/crossentropy": 1.8303287625312805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18412896245718002, + "step": 17224 + }, + { + "epoch": 0.34452, + "grad_norm": 2.0625, + "grad_norm_var": 10.405972290039063, + "learning_rate": 0.0001, + "loss": 4.1852, + "loss/crossentropy": 2.3248833417892456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23459070920944214, + "step": 17226 + }, + { + "epoch": 0.34456, + "grad_norm": 2.109375, + "grad_norm_var": 10.418485514322917, + "learning_rate": 0.0001, + "loss": 4.062, + "loss/crossentropy": 2.018009066581726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21266046166419983, + "step": 17228 + }, + { + "epoch": 0.3446, + "grad_norm": 1.984375, + "grad_norm_var": 10.434609985351562, + "learning_rate": 0.0001, + "loss": 4.186, + "loss/crossentropy": 2.337665557861328, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.225304514169693, + "step": 17230 + }, + { + "epoch": 0.34464, + "grad_norm": 1.8046875, + "grad_norm_var": 10.453043365478516, + "learning_rate": 0.0001, + "loss": 3.86, + "loss/crossentropy": 1.8581790924072266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18770471215248108, + "step": 17232 + }, + { + "epoch": 0.34468, + "grad_norm": 1.8515625, + "grad_norm_var": 10.462247721354167, + "learning_rate": 0.0001, + "loss": 3.9213, + "loss/crossentropy": 1.9208187460899353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19106490910053253, + "step": 17234 + }, + { + "epoch": 0.34472, + "grad_norm": 1.8515625, + "grad_norm_var": 10.454630279541016, + "learning_rate": 0.0001, + "loss": 3.8284, + "loss/crossentropy": 1.8844050765037537, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17407642304897308, + "step": 17236 + }, + { + "epoch": 0.34476, + "grad_norm": 1.9296875, + "grad_norm_var": 9.497085571289062, + "learning_rate": 0.0001, + "loss": 3.7423, + "loss/crossentropy": 2.0443355441093445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.198170468211174, + "step": 17238 + }, + { + "epoch": 0.3448, + "grad_norm": 2.046875, + "grad_norm_var": 0.027342732747395834, + "learning_rate": 0.0001, + "loss": 4.0965, + "loss/crossentropy": 1.8563520908355713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1958291083574295, + "step": 17240 + }, + { + "epoch": 0.34484, + "grad_norm": 2.015625, + "grad_norm_var": 0.0127349853515625, + "learning_rate": 0.0001, + "loss": 4.3028, + "loss/crossentropy": 2.1252214908599854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22386425733566284, + "step": 17242 + }, + { + "epoch": 0.34488, + "grad_norm": 1.9375, + "grad_norm_var": 0.011848958333333333, + "learning_rate": 0.0001, + "loss": 3.926, + "loss/crossentropy": 2.1379681825637817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19781935214996338, + "step": 17244 + }, + { + "epoch": 0.34492, + "grad_norm": 1.828125, + "grad_norm_var": 0.013529205322265625, + "learning_rate": 0.0001, + "loss": 3.7403, + "loss/crossentropy": 1.9774981141090393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18523120880126953, + "step": 17246 + }, + { + "epoch": 0.34496, + "grad_norm": 2.015625, + "grad_norm_var": 0.0120025634765625, + "learning_rate": 0.0001, + "loss": 4.2668, + "loss/crossentropy": 2.3234479427337646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21939057111740112, + "step": 17248 + }, + { + "epoch": 0.345, + "grad_norm": 2.109375, + "grad_norm_var": 0.011706288655598958, + "learning_rate": 0.0001, + "loss": 4.11, + "loss/crossentropy": 2.0521583557128906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20204473286867142, + "step": 17250 + }, + { + "epoch": 0.34504, + "grad_norm": 1.90625, + "grad_norm_var": 0.03452860514322917, + "learning_rate": 0.0001, + "loss": 4.2184, + "loss/crossentropy": 2.486477255821228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21673013269901276, + "step": 17252 + }, + { + "epoch": 0.34508, + "grad_norm": 1.8515625, + "grad_norm_var": 0.035456339518229164, + "learning_rate": 0.0001, + "loss": 4.0215, + "loss/crossentropy": 2.285652995109558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21322200447320938, + "step": 17254 + }, + { + "epoch": 0.34512, + "grad_norm": 2.046875, + "grad_norm_var": 0.03436686197916667, + "learning_rate": 0.0001, + "loss": 4.1855, + "loss/crossentropy": 2.0946747064590454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19791530817747116, + "step": 17256 + }, + { + "epoch": 0.34516, + "grad_norm": 2.09375, + "grad_norm_var": 0.03250732421875, + "learning_rate": 0.0001, + "loss": 4.1129, + "loss/crossentropy": 1.7509565353393555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16754039376974106, + "step": 17258 + }, + { + "epoch": 0.3452, + "grad_norm": 2.03125, + "grad_norm_var": 0.0325347900390625, + "learning_rate": 0.0001, + "loss": 4.3411, + "loss/crossentropy": 2.3828574419021606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22555553913116455, + "step": 17260 + }, + { + "epoch": 0.34524, + "grad_norm": 2.140625, + "grad_norm_var": 0.029060872395833333, + "learning_rate": 0.0001, + "loss": 4.215, + "loss/crossentropy": 1.997941255569458, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1815095767378807, + "step": 17262 + }, + { + "epoch": 0.34528, + "grad_norm": 2.25, + "grad_norm_var": 0.0296630859375, + "learning_rate": 0.0001, + "loss": 4.3814, + "loss/crossentropy": 2.3036177158355713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2305641770362854, + "step": 17264 + }, + { + "epoch": 0.34532, + "grad_norm": 1.8984375, + "grad_norm_var": 0.03178075154622396, + "learning_rate": 0.0001, + "loss": 4.1668, + "loss/crossentropy": 2.202543616294861, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19594155251979828, + "step": 17266 + }, + { + "epoch": 0.34536, + "grad_norm": 1.875, + "grad_norm_var": 0.011199696858723959, + "learning_rate": 0.0001, + "loss": 4.1792, + "loss/crossentropy": 2.1162038445472717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1945410966873169, + "step": 17268 + }, + { + "epoch": 0.3454, + "grad_norm": 1.90625, + "grad_norm_var": 0.009883626302083334, + "learning_rate": 0.0001, + "loss": 4.1109, + "loss/crossentropy": 2.273344039916992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24327096343040466, + "step": 17270 + }, + { + "epoch": 0.34544, + "grad_norm": 1.9765625, + "grad_norm_var": 0.011351521809895833, + "learning_rate": 0.0001, + "loss": 3.8254, + "loss/crossentropy": 1.851362407207489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18494703620672226, + "step": 17272 + }, + { + "epoch": 0.34548, + "grad_norm": 2.046875, + "grad_norm_var": 0.01114501953125, + "learning_rate": 0.0001, + "loss": 4.0846, + "loss/crossentropy": 1.9861729145050049, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20077189058065414, + "step": 17274 + }, + { + "epoch": 0.34552, + "grad_norm": 1.9765625, + "grad_norm_var": 0.011716461181640625, + "learning_rate": 0.0001, + "loss": 4.0692, + "loss/crossentropy": 2.0234099626541138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20506415516138077, + "step": 17276 + }, + { + "epoch": 0.34556, + "grad_norm": 1.90625, + "grad_norm_var": 0.012326812744140625, + "learning_rate": 0.0001, + "loss": 3.9843, + "loss/crossentropy": 2.1666045784950256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18108145147562027, + "step": 17278 + }, + { + "epoch": 0.3456, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0060536702473958336, + "learning_rate": 0.0001, + "loss": 4.0877, + "loss/crossentropy": 2.066905975341797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2112976461648941, + "step": 17280 + }, + { + "epoch": 0.34564, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0059722900390625, + "learning_rate": 0.0001, + "loss": 4.224, + "loss/crossentropy": 2.1789294481277466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20467212051153183, + "step": 17282 + }, + { + "epoch": 0.34568, + "grad_norm": 1.953125, + "grad_norm_var": 0.005736287434895833, + "learning_rate": 0.0001, + "loss": 3.9716, + "loss/crossentropy": 1.8310245275497437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1974945366382599, + "step": 17284 + }, + { + "epoch": 0.34572, + "grad_norm": 1.9609375, + "grad_norm_var": 0.006563059488932292, + "learning_rate": 0.0001, + "loss": 4.1832, + "loss/crossentropy": 2.033496856689453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1989584118127823, + "step": 17286 + }, + { + "epoch": 0.34576, + "grad_norm": 2.0625, + "grad_norm_var": 0.0091705322265625, + "learning_rate": 0.0001, + "loss": 3.8748, + "loss/crossentropy": 1.8883211016654968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1889789029955864, + "step": 17288 + }, + { + "epoch": 0.3458, + "grad_norm": 2.015625, + "grad_norm_var": 0.008495076497395834, + "learning_rate": 0.0001, + "loss": 4.1637, + "loss/crossentropy": 2.4520593881607056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2183564007282257, + "step": 17290 + }, + { + "epoch": 0.34584, + "grad_norm": 2.046875, + "grad_norm_var": 0.008272298177083333, + "learning_rate": 0.0001, + "loss": 3.9375, + "loss/crossentropy": 2.030746340751648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19382008165121078, + "step": 17292 + }, + { + "epoch": 0.34588, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007295735677083333, + "learning_rate": 0.0001, + "loss": 4.2182, + "loss/crossentropy": 1.9897491931915283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21130450069904327, + "step": 17294 + }, + { + "epoch": 0.34592, + "grad_norm": 1.9296875, + "grad_norm_var": 0.006883748372395833, + "learning_rate": 0.0001, + "loss": 3.982, + "loss/crossentropy": 1.8000388145446777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20359987765550613, + "step": 17296 + }, + { + "epoch": 0.34596, + "grad_norm": 1.984375, + "grad_norm_var": 0.0065610249837239586, + "learning_rate": 0.0001, + "loss": 4.0341, + "loss/crossentropy": 2.0408164262771606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18042601644992828, + "step": 17298 + }, + { + "epoch": 0.346, + "grad_norm": 2.015625, + "grad_norm_var": 0.006742350260416667, + "learning_rate": 0.0001, + "loss": 4.0849, + "loss/crossentropy": 2.28415846824646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20799801498651505, + "step": 17300 + }, + { + "epoch": 0.34604, + "grad_norm": 2.25, + "grad_norm_var": 0.010925038655598959, + "learning_rate": 0.0001, + "loss": 4.3258, + "loss/crossentropy": 1.9420422315597534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20019559562206268, + "step": 17302 + }, + { + "epoch": 0.34608, + "grad_norm": 2.234375, + "grad_norm_var": 0.0102691650390625, + "learning_rate": 0.0001, + "loss": 4.1892, + "loss/crossentropy": 2.100687026977539, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21129299700260162, + "step": 17304 + }, + { + "epoch": 0.34612, + "grad_norm": 1.921875, + "grad_norm_var": 0.010970052083333333, + "learning_rate": 0.0001, + "loss": 4.0209, + "loss/crossentropy": 1.8657938241958618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2081414759159088, + "step": 17306 + }, + { + "epoch": 0.34616, + "grad_norm": 1.8359375, + "grad_norm_var": 0.012853749593098958, + "learning_rate": 0.0001, + "loss": 4.025, + "loss/crossentropy": 2.0629169940948486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21117018163204193, + "step": 17308 + }, + { + "epoch": 0.3462, + "grad_norm": 1.96875, + "grad_norm_var": 0.0126129150390625, + "learning_rate": 0.0001, + "loss": 4.159, + "loss/crossentropy": 2.2877084016799927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1959577053785324, + "step": 17310 + }, + { + "epoch": 0.34624, + "grad_norm": 2.015625, + "grad_norm_var": 0.012280019124348958, + "learning_rate": 0.0001, + "loss": 4.1135, + "loss/crossentropy": 2.2501026391983032, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2108026072382927, + "step": 17312 + }, + { + "epoch": 0.34628, + "grad_norm": 2.0, + "grad_norm_var": 0.012011464436848958, + "learning_rate": 0.0001, + "loss": 4.1498, + "loss/crossentropy": 1.9485750794410706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2126113697886467, + "step": 17314 + }, + { + "epoch": 0.34632, + "grad_norm": 1.984375, + "grad_norm_var": 0.011500803629557292, + "learning_rate": 0.0001, + "loss": 4.13, + "loss/crossentropy": 2.193112373352051, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19680321961641312, + "step": 17316 + }, + { + "epoch": 0.34636, + "grad_norm": 1.796875, + "grad_norm_var": 0.010084788004557291, + "learning_rate": 0.0001, + "loss": 4.1181, + "loss/crossentropy": 2.30054771900177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2211720421910286, + "step": 17318 + }, + { + "epoch": 0.3464, + "grad_norm": 2.078125, + "grad_norm_var": 0.026869455973307293, + "learning_rate": 0.0001, + "loss": 4.2737, + "loss/crossentropy": 2.0091487169265747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19040243327617645, + "step": 17320 + }, + { + "epoch": 0.34644, + "grad_norm": 2.0625, + "grad_norm_var": 0.036628977457682295, + "learning_rate": 0.0001, + "loss": 4.2257, + "loss/crossentropy": 2.1841423511505127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2112671434879303, + "step": 17322 + }, + { + "epoch": 0.34648, + "grad_norm": 2.171875, + "grad_norm_var": 0.03555501302083333, + "learning_rate": 0.0001, + "loss": 3.8837, + "loss/crossentropy": 2.092822253704071, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2037336230278015, + "step": 17324 + }, + { + "epoch": 0.34652, + "grad_norm": 2.09375, + "grad_norm_var": 0.0335601806640625, + "learning_rate": 0.0001, + "loss": 4.0322, + "loss/crossentropy": 2.280580163002014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20753052085638046, + "step": 17326 + }, + { + "epoch": 0.34656, + "grad_norm": 1.7890625, + "grad_norm_var": 0.03884048461914062, + "learning_rate": 0.0001, + "loss": 4.1206, + "loss/crossentropy": 1.60361909866333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15820877254009247, + "step": 17328 + }, + { + "epoch": 0.3466, + "grad_norm": 1.84375, + "grad_norm_var": 0.04915949503580729, + "learning_rate": 0.0001, + "loss": 3.6214, + "loss/crossentropy": 1.608814001083374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16808264702558517, + "step": 17330 + }, + { + "epoch": 0.34664, + "grad_norm": 2.609375, + "grad_norm_var": 0.06932373046875, + "learning_rate": 0.0001, + "loss": 4.4983, + "loss/crossentropy": 2.3868669271469116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23717772960662842, + "step": 17332 + }, + { + "epoch": 0.34668, + "grad_norm": 1.8984375, + "grad_norm_var": 0.06720759073893229, + "learning_rate": 0.0001, + "loss": 4.0862, + "loss/crossentropy": 2.1908692717552185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21705647557973862, + "step": 17334 + }, + { + "epoch": 0.34672, + "grad_norm": 2.140625, + "grad_norm_var": 0.05272191365559896, + "learning_rate": 0.0001, + "loss": 4.2436, + "loss/crossentropy": 2.459980010986328, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24167515337467194, + "step": 17336 + }, + { + "epoch": 0.34676, + "grad_norm": 1.984375, + "grad_norm_var": 0.04353612263997396, + "learning_rate": 0.0001, + "loss": 4.219, + "loss/crossentropy": 2.221674919128418, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2097555324435234, + "step": 17338 + }, + { + "epoch": 0.3468, + "grad_norm": 1.78125, + "grad_norm_var": 0.04521458943684896, + "learning_rate": 0.0001, + "loss": 3.7565, + "loss/crossentropy": 2.2840858697891235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2250964418053627, + "step": 17340 + }, + { + "epoch": 0.34684, + "grad_norm": 1.9453125, + "grad_norm_var": 0.04426854451497396, + "learning_rate": 0.0001, + "loss": 3.6803, + "loss/crossentropy": 2.0715824365615845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2149435505270958, + "step": 17342 + }, + { + "epoch": 0.34688, + "grad_norm": 2.03125, + "grad_norm_var": 0.0420806884765625, + "learning_rate": 0.0001, + "loss": 4.1017, + "loss/crossentropy": 2.2389899492263794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22377658635377884, + "step": 17344 + }, + { + "epoch": 0.34692, + "grad_norm": 2.046875, + "grad_norm_var": 0.035278065999348955, + "learning_rate": 0.0001, + "loss": 3.9827, + "loss/crossentropy": 2.0308732986450195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2074751853942871, + "step": 17346 + }, + { + "epoch": 0.34696, + "grad_norm": 1.890625, + "grad_norm_var": 0.010864003499348959, + "learning_rate": 0.0001, + "loss": 3.7901, + "loss/crossentropy": 1.8123770356178284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19080865383148193, + "step": 17348 + }, + { + "epoch": 0.347, + "grad_norm": 1.734375, + "grad_norm_var": 0.010367838541666667, + "learning_rate": 0.0001, + "loss": 3.8772, + "loss/crossentropy": 1.884379267692566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1799243837594986, + "step": 17350 + }, + { + "epoch": 0.34704, + "grad_norm": 1.8671875, + "grad_norm_var": 0.012442779541015626, + "learning_rate": 0.0001, + "loss": 4.4317, + "loss/crossentropy": 2.237663149833679, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20197343081235886, + "step": 17352 + }, + { + "epoch": 0.34708, + "grad_norm": 1.828125, + "grad_norm_var": 0.013337961832682292, + "learning_rate": 0.0001, + "loss": 3.7444, + "loss/crossentropy": 1.9563414454460144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18722663074731827, + "step": 17354 + }, + { + "epoch": 0.34712, + "grad_norm": 2.3125, + "grad_norm_var": 0.020817057291666666, + "learning_rate": 0.0001, + "loss": 4.2799, + "loss/crossentropy": 2.1188591718673706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21183937788009644, + "step": 17356 + }, + { + "epoch": 0.34716, + "grad_norm": 2.046875, + "grad_norm_var": 0.0219146728515625, + "learning_rate": 0.0001, + "loss": 4.4551, + "loss/crossentropy": 2.276149272918701, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2232571393251419, + "step": 17358 + }, + { + "epoch": 0.3472, + "grad_norm": 2.0, + "grad_norm_var": 0.02123998006184896, + "learning_rate": 0.0001, + "loss": 4.4329, + "loss/crossentropy": 2.360138177871704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22683367878198624, + "step": 17360 + }, + { + "epoch": 0.34724, + "grad_norm": 1.96875, + "grad_norm_var": 0.020576985677083333, + "learning_rate": 0.0001, + "loss": 4.2762, + "loss/crossentropy": 1.99272620677948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18479204177856445, + "step": 17362 + }, + { + "epoch": 0.34728, + "grad_norm": 1.9921875, + "grad_norm_var": 0.019760894775390624, + "learning_rate": 0.0001, + "loss": 4.2303, + "loss/crossentropy": 2.053771197795868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20591863989830017, + "step": 17364 + }, + { + "epoch": 0.34732, + "grad_norm": 1.9296875, + "grad_norm_var": 0.016428375244140626, + "learning_rate": 0.0001, + "loss": 3.9703, + "loss/crossentropy": 2.1311055421829224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2049081027507782, + "step": 17366 + }, + { + "epoch": 0.34736, + "grad_norm": 1.984375, + "grad_norm_var": 0.01246337890625, + "learning_rate": 0.0001, + "loss": 4.1102, + "loss/crossentropy": 2.0712148547172546, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19809392094612122, + "step": 17368 + }, + { + "epoch": 0.3474, + "grad_norm": 1.953125, + "grad_norm_var": 0.009468587239583333, + "learning_rate": 0.0001, + "loss": 4.0732, + "loss/crossentropy": 2.144728183746338, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21964795887470245, + "step": 17370 + }, + { + "epoch": 0.34744, + "grad_norm": 2.03125, + "grad_norm_var": 0.003856404622395833, + "learning_rate": 0.0001, + "loss": 3.9145, + "loss/crossentropy": 2.024275004863739, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19414371997117996, + "step": 17372 + }, + { + "epoch": 0.34748, + "grad_norm": 1.84375, + "grad_norm_var": 0.00400390625, + "learning_rate": 0.0001, + "loss": 4.0925, + "loss/crossentropy": 2.460733413696289, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2185375839471817, + "step": 17374 + }, + { + "epoch": 0.34752, + "grad_norm": 2.28125, + "grad_norm_var": 0.010423787434895833, + "learning_rate": 0.0001, + "loss": 4.308, + "loss/crossentropy": 2.3502203226089478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2140796035528183, + "step": 17376 + }, + { + "epoch": 0.34756, + "grad_norm": 1.78125, + "grad_norm_var": 0.012547810872395834, + "learning_rate": 0.0001, + "loss": 4.0238, + "loss/crossentropy": 2.2896264791488647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2149006575345993, + "step": 17378 + }, + { + "epoch": 0.3476, + "grad_norm": 1.8515625, + "grad_norm_var": 0.013118489583333334, + "learning_rate": 0.0001, + "loss": 3.9757, + "loss/crossentropy": 2.101209044456482, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20842310786247253, + "step": 17380 + }, + { + "epoch": 0.34764, + "grad_norm": 2.0, + "grad_norm_var": 0.0189361572265625, + "learning_rate": 0.0001, + "loss": 4.3287, + "loss/crossentropy": 1.8995551466941833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1851196512579918, + "step": 17382 + }, + { + "epoch": 0.34768, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01887995402018229, + "learning_rate": 0.0001, + "loss": 3.9219, + "loss/crossentropy": 1.850643277168274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16962846368551254, + "step": 17384 + }, + { + "epoch": 0.34772, + "grad_norm": 2.078125, + "grad_norm_var": 0.019528961181640624, + "learning_rate": 0.0001, + "loss": 4.2478, + "loss/crossentropy": 2.1131063103675842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19152897596359253, + "step": 17386 + }, + { + "epoch": 0.34776, + "grad_norm": 1.9921875, + "grad_norm_var": 0.020140584309895834, + "learning_rate": 0.0001, + "loss": 4.1932, + "loss/crossentropy": 2.172673463821411, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20645174384117126, + "step": 17388 + }, + { + "epoch": 0.3478, + "grad_norm": 2.09375, + "grad_norm_var": 0.019147745768229165, + "learning_rate": 0.0001, + "loss": 4.269, + "loss/crossentropy": 2.431947708129883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20513125509023666, + "step": 17390 + }, + { + "epoch": 0.34784, + "grad_norm": 1.9453125, + "grad_norm_var": 0.014461008707682292, + "learning_rate": 0.0001, + "loss": 3.9314, + "loss/crossentropy": 2.062328279018402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1929447129368782, + "step": 17392 + }, + { + "epoch": 0.34788, + "grad_norm": 2.015625, + "grad_norm_var": 0.010609690348307292, + "learning_rate": 0.0001, + "loss": 4.2277, + "loss/crossentropy": 2.189133882522583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20945163816213608, + "step": 17394 + }, + { + "epoch": 0.34792, + "grad_norm": 1.9296875, + "grad_norm_var": 0.010846964518229167, + "learning_rate": 0.0001, + "loss": 3.9426, + "loss/crossentropy": 2.070523977279663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20401575416326523, + "step": 17396 + }, + { + "epoch": 0.34796, + "grad_norm": 1.9375, + "grad_norm_var": 0.0075266520182291664, + "learning_rate": 0.0001, + "loss": 4.0668, + "loss/crossentropy": 2.3387876749038696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19960296899080276, + "step": 17398 + }, + { + "epoch": 0.348, + "grad_norm": 2.078125, + "grad_norm_var": 0.00955810546875, + "learning_rate": 0.0001, + "loss": 4.0225, + "loss/crossentropy": 1.9857355952262878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19195467978715897, + "step": 17400 + }, + { + "epoch": 0.34804, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009041086832682291, + "learning_rate": 0.0001, + "loss": 4.0715, + "loss/crossentropy": 2.2523770332336426, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22015457600355148, + "step": 17402 + }, + { + "epoch": 0.34808, + "grad_norm": 1.875, + "grad_norm_var": 0.0074045817057291664, + "learning_rate": 0.0001, + "loss": 3.9181, + "loss/crossentropy": 1.7088012099266052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1644933521747589, + "step": 17404 + }, + { + "epoch": 0.34812, + "grad_norm": 1.890625, + "grad_norm_var": 0.006772613525390625, + "learning_rate": 0.0001, + "loss": 3.8759, + "loss/crossentropy": 2.022001802921295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19237526506185532, + "step": 17406 + }, + { + "epoch": 0.34816, + "grad_norm": 2.125, + "grad_norm_var": 0.015669504801432293, + "learning_rate": 0.0001, + "loss": 4.2196, + "loss/crossentropy": 2.240332841873169, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2214636355638504, + "step": 17408 + }, + { + "epoch": 0.3482, + "grad_norm": 1.9765625, + "grad_norm_var": 0.015099843343098959, + "learning_rate": 0.0001, + "loss": 4.0211, + "loss/crossentropy": 2.1392452716827393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21769221127033234, + "step": 17410 + }, + { + "epoch": 0.34824, + "grad_norm": 1.890625, + "grad_norm_var": 0.016078440348307292, + "learning_rate": 0.0001, + "loss": 4.0243, + "loss/crossentropy": 1.9182460308074951, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.202327162027359, + "step": 17412 + }, + { + "epoch": 0.34828, + "grad_norm": 2.0625, + "grad_norm_var": 0.0153961181640625, + "learning_rate": 0.0001, + "loss": 4.0654, + "loss/crossentropy": 2.036426305770874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19997400790452957, + "step": 17414 + }, + { + "epoch": 0.34832, + "grad_norm": 1.9296875, + "grad_norm_var": 0.014353179931640625, + "learning_rate": 0.0001, + "loss": 3.9086, + "loss/crossentropy": 2.0429354906082153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20519915968179703, + "step": 17416 + }, + { + "epoch": 0.34836, + "grad_norm": 2.03125, + "grad_norm_var": 0.013923136393229167, + "learning_rate": 0.0001, + "loss": 4.2742, + "loss/crossentropy": 2.3943710327148438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2106815129518509, + "step": 17418 + }, + { + "epoch": 0.3484, + "grad_norm": 2.046875, + "grad_norm_var": 0.0130615234375, + "learning_rate": 0.0001, + "loss": 4.1596, + "loss/crossentropy": 2.060115098953247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20672088861465454, + "step": 17420 + }, + { + "epoch": 0.34844, + "grad_norm": 1.8984375, + "grad_norm_var": 0.012471516927083334, + "learning_rate": 0.0001, + "loss": 4.2223, + "loss/crossentropy": 2.2820589542388916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23306988179683685, + "step": 17422 + }, + { + "epoch": 0.34848, + "grad_norm": 1.984375, + "grad_norm_var": 0.008788045247395833, + "learning_rate": 0.0001, + "loss": 4.1665, + "loss/crossentropy": 2.13969624042511, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20909114927053452, + "step": 17424 + }, + { + "epoch": 0.34852, + "grad_norm": 1.953125, + "grad_norm_var": 0.011214192708333333, + "learning_rate": 0.0001, + "loss": 3.9572, + "loss/crossentropy": 2.0822505950927734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20429539680480957, + "step": 17426 + }, + { + "epoch": 0.34856, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009759267171223959, + "learning_rate": 0.0001, + "loss": 3.9435, + "loss/crossentropy": 2.512156844139099, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21776874363422394, + "step": 17428 + }, + { + "epoch": 0.3486, + "grad_norm": 2.359375, + "grad_norm_var": 0.018700917561848957, + "learning_rate": 0.0001, + "loss": 4.1145, + "loss/crossentropy": 2.0379759669303894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20926732569932938, + "step": 17430 + }, + { + "epoch": 0.34864, + "grad_norm": 2.046875, + "grad_norm_var": 0.017295074462890626, + "learning_rate": 0.0001, + "loss": 4.1998, + "loss/crossentropy": 2.443945050239563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22597475349903107, + "step": 17432 + }, + { + "epoch": 0.34868, + "grad_norm": 1.8203125, + "grad_norm_var": 0.019576009114583334, + "learning_rate": 0.0001, + "loss": 3.8442, + "loss/crossentropy": 1.8295652866363525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18034228682518005, + "step": 17434 + }, + { + "epoch": 0.34872, + "grad_norm": 1.8671875, + "grad_norm_var": 0.020428212483723958, + "learning_rate": 0.0001, + "loss": 4.0614, + "loss/crossentropy": 1.9487475156784058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18921684473752975, + "step": 17436 + }, + { + "epoch": 0.34876, + "grad_norm": 2.078125, + "grad_norm_var": 0.018721262613932293, + "learning_rate": 0.0001, + "loss": 4.1153, + "loss/crossentropy": 2.099911689758301, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2079886943101883, + "step": 17438 + }, + { + "epoch": 0.3488, + "grad_norm": 2.015625, + "grad_norm_var": 0.016739654541015624, + "learning_rate": 0.0001, + "loss": 4.1762, + "loss/crossentropy": 2.52177894115448, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22080882638692856, + "step": 17440 + }, + { + "epoch": 0.34884, + "grad_norm": 2.125, + "grad_norm_var": 0.014900461832682291, + "learning_rate": 0.0001, + "loss": 4.1076, + "loss/crossentropy": 2.222637891769409, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22301562875509262, + "step": 17442 + }, + { + "epoch": 0.34888, + "grad_norm": 2.09375, + "grad_norm_var": 0.014725494384765624, + "learning_rate": 0.0001, + "loss": 4.3965, + "loss/crossentropy": 2.2042930126190186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2051277905702591, + "step": 17444 + }, + { + "epoch": 0.34892, + "grad_norm": 2.015625, + "grad_norm_var": 0.0065093994140625, + "learning_rate": 0.0001, + "loss": 4.2452, + "loss/crossentropy": 2.0599029064178467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20651433616876602, + "step": 17446 + }, + { + "epoch": 0.34896, + "grad_norm": 2.0, + "grad_norm_var": 0.007027180989583334, + "learning_rate": 0.0001, + "loss": 3.9174, + "loss/crossentropy": 1.9319151639938354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18495241552591324, + "step": 17448 + }, + { + "epoch": 0.349, + "grad_norm": 1.8671875, + "grad_norm_var": 0.006197102864583333, + "learning_rate": 0.0001, + "loss": 4.0844, + "loss/crossentropy": 2.341706871986389, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20252344757318497, + "step": 17450 + }, + { + "epoch": 0.34904, + "grad_norm": 1.953125, + "grad_norm_var": 0.004833984375, + "learning_rate": 0.0001, + "loss": 4.0301, + "loss/crossentropy": 2.0161609053611755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2176668643951416, + "step": 17452 + }, + { + "epoch": 0.34908, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0062334696451822914, + "learning_rate": 0.0001, + "loss": 3.6046, + "loss/crossentropy": 1.7961083054542542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16384318470954895, + "step": 17454 + }, + { + "epoch": 0.34912, + "grad_norm": 2.265625, + "grad_norm_var": 0.01791966756184896, + "learning_rate": 0.0001, + "loss": 4.6054, + "loss/crossentropy": 2.108873188495636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21137713640928268, + "step": 17456 + }, + { + "epoch": 0.34916, + "grad_norm": 1.921875, + "grad_norm_var": 0.017830403645833333, + "learning_rate": 0.0001, + "loss": 3.9543, + "loss/crossentropy": 2.158664584159851, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19957562536001205, + "step": 17458 + }, + { + "epoch": 0.3492, + "grad_norm": 1.84375, + "grad_norm_var": 0.018660227457682293, + "learning_rate": 0.0001, + "loss": 3.9454, + "loss/crossentropy": 1.7799381017684937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.193112812936306, + "step": 17460 + }, + { + "epoch": 0.34924, + "grad_norm": 2.0625, + "grad_norm_var": 0.019128163655598957, + "learning_rate": 0.0001, + "loss": 4.4072, + "loss/crossentropy": 2.372501015663147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2180323675274849, + "step": 17462 + }, + { + "epoch": 0.34928, + "grad_norm": 1.96875, + "grad_norm_var": 0.0185699462890625, + "learning_rate": 0.0001, + "loss": 4.0343, + "loss/crossentropy": 1.9338968396186829, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19117196649312973, + "step": 17464 + }, + { + "epoch": 0.34932, + "grad_norm": 1.8125, + "grad_norm_var": 0.020774078369140626, + "learning_rate": 0.0001, + "loss": 3.6429, + "loss/crossentropy": 1.4602742195129395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1677519753575325, + "step": 17466 + }, + { + "epoch": 0.34936, + "grad_norm": 2.0, + "grad_norm_var": 0.02080078125, + "learning_rate": 0.0001, + "loss": 4.2498, + "loss/crossentropy": 2.1464006304740906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21201838552951813, + "step": 17468 + }, + { + "epoch": 0.3494, + "grad_norm": 2.109375, + "grad_norm_var": 0.018529256184895832, + "learning_rate": 0.0001, + "loss": 4.1736, + "loss/crossentropy": 2.071315884590149, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19322700798511505, + "step": 17470 + }, + { + "epoch": 0.34944, + "grad_norm": 1.984375, + "grad_norm_var": 0.010309855143229166, + "learning_rate": 0.0001, + "loss": 4.2653, + "loss/crossentropy": 1.9166500568389893, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23960981518030167, + "step": 17472 + }, + { + "epoch": 0.34948, + "grad_norm": 2.0625, + "grad_norm_var": 0.011457316080729167, + "learning_rate": 0.0001, + "loss": 3.9433, + "loss/crossentropy": 1.8666390180587769, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1875123456120491, + "step": 17474 + }, + { + "epoch": 0.34952, + "grad_norm": 2.015625, + "grad_norm_var": 0.009736887613932292, + "learning_rate": 0.0001, + "loss": 4.0984, + "loss/crossentropy": 1.9197289943695068, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21075090020895004, + "step": 17476 + }, + { + "epoch": 0.34956, + "grad_norm": 1.90625, + "grad_norm_var": 0.011018625895182292, + "learning_rate": 0.0001, + "loss": 4.0971, + "loss/crossentropy": 1.904780387878418, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20412997901439667, + "step": 17478 + }, + { + "epoch": 0.3496, + "grad_norm": 1.859375, + "grad_norm_var": 0.012325032552083334, + "learning_rate": 0.0001, + "loss": 3.735, + "loss/crossentropy": 1.6775096654891968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1669272631406784, + "step": 17480 + }, + { + "epoch": 0.34964, + "grad_norm": 2.015625, + "grad_norm_var": 0.0089111328125, + "learning_rate": 0.0001, + "loss": 4.1039, + "loss/crossentropy": 2.050463318824768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19382070004940033, + "step": 17482 + }, + { + "epoch": 0.34968, + "grad_norm": 2.015625, + "grad_norm_var": 0.009224192301432291, + "learning_rate": 0.0001, + "loss": 3.8739, + "loss/crossentropy": 1.8124673962593079, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20674846321344376, + "step": 17484 + }, + { + "epoch": 0.34972, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008969879150390625, + "learning_rate": 0.0001, + "loss": 3.6797, + "loss/crossentropy": 1.6487451791763306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17874807119369507, + "step": 17486 + }, + { + "epoch": 0.34976, + "grad_norm": 2.171875, + "grad_norm_var": 0.009430948893229167, + "learning_rate": 0.0001, + "loss": 4.4025, + "loss/crossentropy": 2.101687431335449, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24856076389551163, + "step": 17488 + }, + { + "epoch": 0.3498, + "grad_norm": 2.015625, + "grad_norm_var": 0.009205881754557292, + "learning_rate": 0.0001, + "loss": 4.0236, + "loss/crossentropy": 1.9233632683753967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21046236902475357, + "step": 17490 + }, + { + "epoch": 0.34984, + "grad_norm": 2.09375, + "grad_norm_var": 0.010589345296223959, + "learning_rate": 0.0001, + "loss": 4.0376, + "loss/crossentropy": 2.2882679104804993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23568424582481384, + "step": 17492 + }, + { + "epoch": 0.34988, + "grad_norm": 2.0625, + "grad_norm_var": 0.022342681884765625, + "learning_rate": 0.0001, + "loss": 4.4205, + "loss/crossentropy": 2.205874502658844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21892941743135452, + "step": 17494 + }, + { + "epoch": 0.34992, + "grad_norm": 2.234375, + "grad_norm_var": 0.0232177734375, + "learning_rate": 0.0001, + "loss": 4.3628, + "loss/crossentropy": 2.278464913368225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22409500926733017, + "step": 17496 + }, + { + "epoch": 0.34996, + "grad_norm": 1.859375, + "grad_norm_var": 0.026082356770833332, + "learning_rate": 0.0001, + "loss": 4.0736, + "loss/crossentropy": 2.5003273487091064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22478827834129333, + "step": 17498 + }, + { + "epoch": 0.35, + "grad_norm": 2.0625, + "grad_norm_var": 0.04237848917643229, + "learning_rate": 0.0001, + "loss": 4.3658, + "loss/crossentropy": 2.034249722957611, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19176610559225082, + "step": 17500 + }, + { + "epoch": 0.35004, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0401031494140625, + "learning_rate": 0.0001, + "loss": 4.1586, + "loss/crossentropy": 2.271065592765808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21523310244083405, + "step": 17502 + }, + { + "epoch": 0.35008, + "grad_norm": 1.984375, + "grad_norm_var": 0.03870824178059896, + "learning_rate": 0.0001, + "loss": 4.1198, + "loss/crossentropy": 2.182866334915161, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21311558783054352, + "step": 17504 + }, + { + "epoch": 0.35012, + "grad_norm": 2.125, + "grad_norm_var": 0.03878173828125, + "learning_rate": 0.0001, + "loss": 4.1471, + "loss/crossentropy": 1.7134016752243042, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16884544491767883, + "step": 17506 + }, + { + "epoch": 0.35016, + "grad_norm": 2.125, + "grad_norm_var": 0.03687718709309896, + "learning_rate": 0.0001, + "loss": 4.2464, + "loss/crossentropy": 2.198198080062866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1916792169213295, + "step": 17508 + }, + { + "epoch": 0.3502, + "grad_norm": 2.0625, + "grad_norm_var": 0.0272125244140625, + "learning_rate": 0.0001, + "loss": 4.2651, + "loss/crossentropy": 1.9370547533035278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23538394272327423, + "step": 17510 + }, + { + "epoch": 0.35024, + "grad_norm": 2.078125, + "grad_norm_var": 0.02469456990559896, + "learning_rate": 0.0001, + "loss": 4.0664, + "loss/crossentropy": 2.1743874549865723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22514399886131287, + "step": 17512 + }, + { + "epoch": 0.35028, + "grad_norm": 2.03125, + "grad_norm_var": 0.02174657185872396, + "learning_rate": 0.0001, + "loss": 4.2767, + "loss/crossentropy": 1.8546866178512573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1772822067141533, + "step": 17514 + }, + { + "epoch": 0.35032, + "grad_norm": 2.0, + "grad_norm_var": 0.004154205322265625, + "learning_rate": 0.0001, + "loss": 4.2603, + "loss/crossentropy": 1.8439211249351501, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19603434205055237, + "step": 17516 + }, + { + "epoch": 0.35036, + "grad_norm": 1.8125, + "grad_norm_var": 0.006843058268229166, + "learning_rate": 0.0001, + "loss": 3.9506, + "loss/crossentropy": 2.2717851400375366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21598497033119202, + "step": 17518 + }, + { + "epoch": 0.3504, + "grad_norm": 1.921875, + "grad_norm_var": 0.006843058268229166, + "learning_rate": 0.0001, + "loss": 4.1807, + "loss/crossentropy": 2.2645692825317383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2229163572192192, + "step": 17520 + }, + { + "epoch": 0.35044, + "grad_norm": 1.828125, + "grad_norm_var": 0.0071604410807291664, + "learning_rate": 0.0001, + "loss": 4.09, + "loss/crossentropy": 2.0507450103759766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19268949329853058, + "step": 17522 + }, + { + "epoch": 0.35048, + "grad_norm": 2.0, + "grad_norm_var": 0.005521647135416667, + "learning_rate": 0.0001, + "loss": 4.1468, + "loss/crossentropy": 2.3831188678741455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21931231766939163, + "step": 17524 + }, + { + "epoch": 0.35052, + "grad_norm": 1.9765625, + "grad_norm_var": 0.006281534830729167, + "learning_rate": 0.0001, + "loss": 4.116, + "loss/crossentropy": 1.8574120998382568, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17972451448440552, + "step": 17526 + }, + { + "epoch": 0.35056, + "grad_norm": 1.875, + "grad_norm_var": 0.006013997395833333, + "learning_rate": 0.0001, + "loss": 3.9063, + "loss/crossentropy": 2.141213893890381, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21798025816679, + "step": 17528 + }, + { + "epoch": 0.3506, + "grad_norm": 1.8359375, + "grad_norm_var": 0.006520334879557292, + "learning_rate": 0.0001, + "loss": 3.9454, + "loss/crossentropy": 2.2224762439727783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2031020149588585, + "step": 17530 + }, + { + "epoch": 0.35064, + "grad_norm": 1.9140625, + "grad_norm_var": 0.006476847330729166, + "learning_rate": 0.0001, + "loss": 3.765, + "loss/crossentropy": 2.163232743740082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22002054750919342, + "step": 17532 + }, + { + "epoch": 0.35068, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0055653889973958336, + "learning_rate": 0.0001, + "loss": 4.1397, + "loss/crossentropy": 2.1463611721992493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20084905624389648, + "step": 17534 + }, + { + "epoch": 0.35072, + "grad_norm": 2.015625, + "grad_norm_var": 0.006566365559895833, + "learning_rate": 0.0001, + "loss": 3.9751, + "loss/crossentropy": 2.1863406896591187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2150622010231018, + "step": 17536 + }, + { + "epoch": 0.35076, + "grad_norm": 1.9609375, + "grad_norm_var": 0.005744425455729166, + "learning_rate": 0.0001, + "loss": 4.0958, + "loss/crossentropy": 2.2162610292434692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20196689665317535, + "step": 17538 + }, + { + "epoch": 0.3508, + "grad_norm": 10.6875, + "grad_norm_var": 4.776464589436849, + "learning_rate": 0.0001, + "loss": 4.1004, + "loss/crossentropy": 1.8908233642578125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23238417506217957, + "step": 17540 + }, + { + "epoch": 0.35084, + "grad_norm": 2.125, + "grad_norm_var": 4.763142903645833, + "learning_rate": 0.0001, + "loss": 4.416, + "loss/crossentropy": 2.3578076362609863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21626722812652588, + "step": 17542 + }, + { + "epoch": 0.35088, + "grad_norm": 2.234375, + "grad_norm_var": 4.736722819010416, + "learning_rate": 0.0001, + "loss": 4.3243, + "loss/crossentropy": 2.4079915285110474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2292388305068016, + "step": 17544 + }, + { + "epoch": 0.35092, + "grad_norm": 2.015625, + "grad_norm_var": 4.717746734619141, + "learning_rate": 0.0001, + "loss": 4.2647, + "loss/crossentropy": 2.1021666526794434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20725515484809875, + "step": 17546 + }, + { + "epoch": 0.35096, + "grad_norm": 1.96875, + "grad_norm_var": 4.707061513264974, + "learning_rate": 0.0001, + "loss": 4.0905, + "loss/crossentropy": 2.039289176464081, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20614883303642273, + "step": 17548 + }, + { + "epoch": 0.351, + "grad_norm": 2.203125, + "grad_norm_var": 4.68468017578125, + "learning_rate": 0.0001, + "loss": 4.269, + "loss/crossentropy": 1.9731826782226562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2164870798587799, + "step": 17550 + }, + { + "epoch": 0.35104, + "grad_norm": 1.875, + "grad_norm_var": 4.69991455078125, + "learning_rate": 0.0001, + "loss": 4.0785, + "loss/crossentropy": 2.0166266560554504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1977868676185608, + "step": 17552 + }, + { + "epoch": 0.35108, + "grad_norm": 2.0, + "grad_norm_var": 4.692333730061849, + "learning_rate": 0.0001, + "loss": 4.2823, + "loss/crossentropy": 2.284690737724304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2204381301999092, + "step": 17554 + }, + { + "epoch": 0.35112, + "grad_norm": 1.8828125, + "grad_norm_var": 0.012977854410807291, + "learning_rate": 0.0001, + "loss": 4.2629, + "loss/crossentropy": 2.293992757797241, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21718238294124603, + "step": 17556 + }, + { + "epoch": 0.35116, + "grad_norm": 2.140625, + "grad_norm_var": 0.013133748372395834, + "learning_rate": 0.0001, + "loss": 4.0687, + "loss/crossentropy": 2.252563714981079, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23132775723934174, + "step": 17558 + }, + { + "epoch": 0.3512, + "grad_norm": 1.921875, + "grad_norm_var": 0.0097564697265625, + "learning_rate": 0.0001, + "loss": 3.977, + "loss/crossentropy": 1.8454533219337463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20084110647439957, + "step": 17560 + }, + { + "epoch": 0.35124, + "grad_norm": 2.0625, + "grad_norm_var": 0.009616851806640625, + "learning_rate": 0.0001, + "loss": 4.1526, + "loss/crossentropy": 2.3160746097564697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21168464422225952, + "step": 17562 + }, + { + "epoch": 0.35128, + "grad_norm": 2.03125, + "grad_norm_var": 0.00953369140625, + "learning_rate": 0.0001, + "loss": 4.031, + "loss/crossentropy": 2.0470253229141235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20728407055139542, + "step": 17564 + }, + { + "epoch": 0.35132, + "grad_norm": 2.0, + "grad_norm_var": 0.0064389546712239586, + "learning_rate": 0.0001, + "loss": 4.404, + "loss/crossentropy": 2.098679304122925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2133883535861969, + "step": 17566 + }, + { + "epoch": 0.35136, + "grad_norm": 1.9765625, + "grad_norm_var": 0.005615234375, + "learning_rate": 0.0001, + "loss": 4.0509, + "loss/crossentropy": 1.975411057472229, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19393378496170044, + "step": 17568 + }, + { + "epoch": 0.3514, + "grad_norm": 2.0625, + "grad_norm_var": 0.005606842041015625, + "learning_rate": 0.0001, + "loss": 4.2861, + "loss/crossentropy": 2.594779372215271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2499995082616806, + "step": 17570 + }, + { + "epoch": 0.35144, + "grad_norm": 1.875, + "grad_norm_var": 0.005915323893229167, + "learning_rate": 0.0001, + "loss": 4.1045, + "loss/crossentropy": 2.075734496116638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.207140251994133, + "step": 17572 + }, + { + "epoch": 0.35148, + "grad_norm": 1.765625, + "grad_norm_var": 0.0062978108723958336, + "learning_rate": 0.0001, + "loss": 3.869, + "loss/crossentropy": 2.124872624874115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20058108121156693, + "step": 17574 + }, + { + "epoch": 0.35152, + "grad_norm": 2.0625, + "grad_norm_var": 0.012056477864583333, + "learning_rate": 0.0001, + "loss": 4.1552, + "loss/crossentropy": 2.0275574922561646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2105395644903183, + "step": 17576 + }, + { + "epoch": 0.35156, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011913045247395834, + "learning_rate": 0.0001, + "loss": 4.248, + "loss/crossentropy": 2.0694713592529297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21342340111732483, + "step": 17578 + }, + { + "epoch": 0.3516, + "grad_norm": 2.0625, + "grad_norm_var": 0.012239583333333333, + "learning_rate": 0.0001, + "loss": 3.9729, + "loss/crossentropy": 1.8013625741004944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1766640990972519, + "step": 17580 + }, + { + "epoch": 0.35164, + "grad_norm": 2.015625, + "grad_norm_var": 0.0123443603515625, + "learning_rate": 0.0001, + "loss": 4.0313, + "loss/crossentropy": 2.1990463733673096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19608185440301895, + "step": 17582 + }, + { + "epoch": 0.35168, + "grad_norm": 1.8671875, + "grad_norm_var": 0.013361612955729166, + "learning_rate": 0.0001, + "loss": 3.9041, + "loss/crossentropy": 2.0714540481567383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20064443349838257, + "step": 17584 + }, + { + "epoch": 0.35172, + "grad_norm": 1.9453125, + "grad_norm_var": 0.015242258707682291, + "learning_rate": 0.0001, + "loss": 3.7431, + "loss/crossentropy": 1.7437097430229187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17001167684793472, + "step": 17586 + }, + { + "epoch": 0.35176, + "grad_norm": 2.265625, + "grad_norm_var": 0.020531972249348957, + "learning_rate": 0.0001, + "loss": 4.1987, + "loss/crossentropy": 2.072261691093445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21019883453845978, + "step": 17588 + }, + { + "epoch": 0.3518, + "grad_norm": 2.28125, + "grad_norm_var": 0.021922810872395834, + "learning_rate": 0.0001, + "loss": 4.0826, + "loss/crossentropy": 2.1804715394973755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21589763462543488, + "step": 17590 + }, + { + "epoch": 0.35184, + "grad_norm": 2.0, + "grad_norm_var": 0.01734619140625, + "learning_rate": 0.0001, + "loss": 4.0964, + "loss/crossentropy": 2.0044930577278137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20204917341470718, + "step": 17592 + }, + { + "epoch": 0.35188, + "grad_norm": 2.015625, + "grad_norm_var": 0.017362467447916665, + "learning_rate": 0.0001, + "loss": 4.1394, + "loss/crossentropy": 1.8961025476455688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1865018978714943, + "step": 17594 + }, + { + "epoch": 0.35192, + "grad_norm": 1.9765625, + "grad_norm_var": 0.016971588134765625, + "learning_rate": 0.0001, + "loss": 3.8503, + "loss/crossentropy": 1.7005563378334045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18951866030693054, + "step": 17596 + }, + { + "epoch": 0.35196, + "grad_norm": 1.953125, + "grad_norm_var": 0.017002105712890625, + "learning_rate": 0.0001, + "loss": 4.0044, + "loss/crossentropy": 2.002982437610626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2076285257935524, + "step": 17598 + }, + { + "epoch": 0.352, + "grad_norm": 1.90625, + "grad_norm_var": 0.016950480143229165, + "learning_rate": 0.0001, + "loss": 4.0595, + "loss/crossentropy": 2.0283620357513428, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20175684988498688, + "step": 17600 + }, + { + "epoch": 0.35204, + "grad_norm": 1.9375, + "grad_norm_var": 0.014289347330729167, + "learning_rate": 0.0001, + "loss": 3.9495, + "loss/crossentropy": 1.855184018611908, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19368328154087067, + "step": 17602 + }, + { + "epoch": 0.35208, + "grad_norm": 1.96875, + "grad_norm_var": 0.009089152018229166, + "learning_rate": 0.0001, + "loss": 4.2364, + "loss/crossentropy": 2.2288308143615723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2121831700205803, + "step": 17604 + }, + { + "epoch": 0.35212, + "grad_norm": 1.953125, + "grad_norm_var": 0.002512359619140625, + "learning_rate": 0.0001, + "loss": 4.1398, + "loss/crossentropy": 1.9114368557929993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17937570065259933, + "step": 17606 + }, + { + "epoch": 0.35216, + "grad_norm": 1.921875, + "grad_norm_var": 0.0026751200358072916, + "learning_rate": 0.0001, + "loss": 3.9921, + "loss/crossentropy": 2.238133430480957, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23264692723751068, + "step": 17608 + }, + { + "epoch": 0.3522, + "grad_norm": 1.9375, + "grad_norm_var": 0.0024169921875, + "learning_rate": 0.0001, + "loss": 3.85, + "loss/crossentropy": 1.7219743728637695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17108283936977386, + "step": 17610 + }, + { + "epoch": 0.35224, + "grad_norm": 1.890625, + "grad_norm_var": 0.005037434895833333, + "learning_rate": 0.0001, + "loss": 3.8406, + "loss/crossentropy": 1.9613978862762451, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2042389214038849, + "step": 17612 + }, + { + "epoch": 0.35228, + "grad_norm": 1.859375, + "grad_norm_var": 0.005132802327473958, + "learning_rate": 0.0001, + "loss": 3.8567, + "loss/crossentropy": 1.946107029914856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19604258984327316, + "step": 17614 + }, + { + "epoch": 0.35232, + "grad_norm": 1.921875, + "grad_norm_var": 0.005077870686848959, + "learning_rate": 0.0001, + "loss": 4.1404, + "loss/crossentropy": 2.255744218826294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2184707298874855, + "step": 17616 + }, + { + "epoch": 0.35236, + "grad_norm": 2.0, + "grad_norm_var": 0.0058062235514322914, + "learning_rate": 0.0001, + "loss": 4.1852, + "loss/crossentropy": 2.339258551597595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21353042870759964, + "step": 17618 + }, + { + "epoch": 0.3524, + "grad_norm": 1.984375, + "grad_norm_var": 0.0054595947265625, + "learning_rate": 0.0001, + "loss": 3.9494, + "loss/crossentropy": 1.8912869691848755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19134657084941864, + "step": 17620 + }, + { + "epoch": 0.35244, + "grad_norm": 2.03125, + "grad_norm_var": 0.0058258056640625, + "learning_rate": 0.0001, + "loss": 4.2607, + "loss/crossentropy": 2.186914384365082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20765449851751328, + "step": 17622 + }, + { + "epoch": 0.35248, + "grad_norm": 1.8828125, + "grad_norm_var": 0.005773671468098958, + "learning_rate": 0.0001, + "loss": 3.8567, + "loss/crossentropy": 1.6339558959007263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18255474418401718, + "step": 17624 + }, + { + "epoch": 0.35252, + "grad_norm": 1.9296875, + "grad_norm_var": 0.005796051025390625, + "learning_rate": 0.0001, + "loss": 3.9593, + "loss/crossentropy": 2.030660629272461, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1943136677145958, + "step": 17626 + }, + { + "epoch": 0.35256, + "grad_norm": 2.046875, + "grad_norm_var": 0.004173787434895834, + "learning_rate": 0.0001, + "loss": 3.9738, + "loss/crossentropy": 1.8040945529937744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18390020728111267, + "step": 17628 + }, + { + "epoch": 0.3526, + "grad_norm": 2.15625, + "grad_norm_var": 0.0055539449055989586, + "learning_rate": 0.0001, + "loss": 4.4613, + "loss/crossentropy": 1.9993014335632324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20794235169887543, + "step": 17630 + }, + { + "epoch": 0.35264, + "grad_norm": 1.9609375, + "grad_norm_var": 0.004881795247395833, + "learning_rate": 0.0001, + "loss": 3.8539, + "loss/crossentropy": 2.0751482248306274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20385687798261642, + "step": 17632 + }, + { + "epoch": 0.35268, + "grad_norm": 2.078125, + "grad_norm_var": 0.0052073160807291664, + "learning_rate": 0.0001, + "loss": 4.0059, + "loss/crossentropy": 1.8923512697219849, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19879557192325592, + "step": 17634 + }, + { + "epoch": 0.35272, + "grad_norm": 1.984375, + "grad_norm_var": 0.004935709635416666, + "learning_rate": 0.0001, + "loss": 4.2176, + "loss/crossentropy": 2.068901300430298, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21545641124248505, + "step": 17636 + }, + { + "epoch": 0.35276, + "grad_norm": 2.046875, + "grad_norm_var": 0.005968983968098958, + "learning_rate": 0.0001, + "loss": 3.8627, + "loss/crossentropy": 1.8329379558563232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1875568851828575, + "step": 17638 + }, + { + "epoch": 0.3528, + "grad_norm": 1.90625, + "grad_norm_var": 0.007811482747395833, + "learning_rate": 0.0001, + "loss": 4.0878, + "loss/crossentropy": 2.2368232011795044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22392207384109497, + "step": 17640 + }, + { + "epoch": 0.35284, + "grad_norm": 2.3125, + "grad_norm_var": 0.014972941080729166, + "learning_rate": 0.0001, + "loss": 4.0397, + "loss/crossentropy": 2.2503843307495117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21132144331932068, + "step": 17642 + }, + { + "epoch": 0.35288, + "grad_norm": 1.9140625, + "grad_norm_var": 0.01582819620768229, + "learning_rate": 0.0001, + "loss": 4.0459, + "loss/crossentropy": 2.1919764280319214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.197752483189106, + "step": 17644 + }, + { + "epoch": 0.35292, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0148345947265625, + "learning_rate": 0.0001, + "loss": 3.9122, + "loss/crossentropy": 1.9867302775382996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2115078568458557, + "step": 17646 + }, + { + "epoch": 0.35296, + "grad_norm": 1.90625, + "grad_norm_var": 0.017899576822916666, + "learning_rate": 0.0001, + "loss": 3.8369, + "loss/crossentropy": 1.9015426635742188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17188573628664017, + "step": 17648 + }, + { + "epoch": 0.353, + "grad_norm": 1.875, + "grad_norm_var": 0.019006093343098957, + "learning_rate": 0.0001, + "loss": 3.6023, + "loss/crossentropy": 1.8092533946037292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1764475554227829, + "step": 17650 + }, + { + "epoch": 0.35304, + "grad_norm": 1.8828125, + "grad_norm_var": 0.01936620076497396, + "learning_rate": 0.0001, + "loss": 4.0542, + "loss/crossentropy": 2.0736488103866577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20046666264533997, + "step": 17652 + }, + { + "epoch": 0.35308, + "grad_norm": 2.078125, + "grad_norm_var": 0.019364166259765624, + "learning_rate": 0.0001, + "loss": 3.9716, + "loss/crossentropy": 2.1787428855895996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23303276300430298, + "step": 17654 + }, + { + "epoch": 0.35312, + "grad_norm": 2.0, + "grad_norm_var": 0.017020416259765626, + "learning_rate": 0.0001, + "loss": 4.4889, + "loss/crossentropy": 2.419381618499756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23756036162376404, + "step": 17656 + }, + { + "epoch": 0.35316, + "grad_norm": 2.09375, + "grad_norm_var": 0.012111155192057292, + "learning_rate": 0.0001, + "loss": 4.4202, + "loss/crossentropy": 2.4017220735549927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2373708263039589, + "step": 17658 + }, + { + "epoch": 0.3532, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011919911702473958, + "learning_rate": 0.0001, + "loss": 3.8805, + "loss/crossentropy": 1.7796767354011536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19661501049995422, + "step": 17660 + }, + { + "epoch": 0.35324, + "grad_norm": 2.015625, + "grad_norm_var": 0.013158162434895834, + "learning_rate": 0.0001, + "loss": 4.2215, + "loss/crossentropy": 2.1815608739852905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20645780861377716, + "step": 17662 + }, + { + "epoch": 0.35328, + "grad_norm": 2.03125, + "grad_norm_var": 0.010178375244140624, + "learning_rate": 0.0001, + "loss": 4.2148, + "loss/crossentropy": 2.056411921977997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21581003069877625, + "step": 17664 + }, + { + "epoch": 0.35332, + "grad_norm": 1.8984375, + "grad_norm_var": 0.007824452718098958, + "learning_rate": 0.0001, + "loss": 3.8676, + "loss/crossentropy": 1.8901747465133667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19498124718666077, + "step": 17666 + }, + { + "epoch": 0.35336, + "grad_norm": 2.0, + "grad_norm_var": 0.006493123372395834, + "learning_rate": 0.0001, + "loss": 4.0884, + "loss/crossentropy": 2.0241716504096985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19405542314052582, + "step": 17668 + }, + { + "epoch": 0.3534, + "grad_norm": 1.96875, + "grad_norm_var": 0.008241526285807292, + "learning_rate": 0.0001, + "loss": 4.1452, + "loss/crossentropy": 2.016151189804077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1856973022222519, + "step": 17670 + }, + { + "epoch": 0.35344, + "grad_norm": 2.078125, + "grad_norm_var": 0.009096018473307292, + "learning_rate": 0.0001, + "loss": 4.3895, + "loss/crossentropy": 2.5066399574279785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23949292302131653, + "step": 17672 + }, + { + "epoch": 0.35348, + "grad_norm": 2.015625, + "grad_norm_var": 0.007236480712890625, + "learning_rate": 0.0001, + "loss": 4.1432, + "loss/crossentropy": 2.1384177207946777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21078043431043625, + "step": 17674 + }, + { + "epoch": 0.35352, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0070709228515625, + "learning_rate": 0.0001, + "loss": 4.1732, + "loss/crossentropy": 2.1211976408958435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2064737230539322, + "step": 17676 + }, + { + "epoch": 0.35356, + "grad_norm": 2.078125, + "grad_norm_var": 0.0072418212890625, + "learning_rate": 0.0001, + "loss": 4.3103, + "loss/crossentropy": 2.018574059009552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20446718484163284, + "step": 17678 + }, + { + "epoch": 0.3536, + "grad_norm": 2.015625, + "grad_norm_var": 0.008186848958333333, + "learning_rate": 0.0001, + "loss": 3.9862, + "loss/crossentropy": 2.10149747133255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1974436193704605, + "step": 17680 + }, + { + "epoch": 0.35364, + "grad_norm": 2.0625, + "grad_norm_var": 0.009698232014973959, + "learning_rate": 0.0001, + "loss": 4.3133, + "loss/crossentropy": 2.288322687149048, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22535154223442078, + "step": 17682 + }, + { + "epoch": 0.35368, + "grad_norm": 1.921875, + "grad_norm_var": 0.010400136311848959, + "learning_rate": 0.0001, + "loss": 4.3396, + "loss/crossentropy": 2.002479314804077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20964795351028442, + "step": 17684 + }, + { + "epoch": 0.35372, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0099029541015625, + "learning_rate": 0.0001, + "loss": 4.0116, + "loss/crossentropy": 2.2548930644989014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19946102052927017, + "step": 17686 + }, + { + "epoch": 0.35376, + "grad_norm": 2.296875, + "grad_norm_var": 0.015006510416666667, + "learning_rate": 0.0001, + "loss": 4.1777, + "loss/crossentropy": 2.3636194467544556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21626830101013184, + "step": 17688 + }, + { + "epoch": 0.3538, + "grad_norm": 2.15625, + "grad_norm_var": 0.016471099853515626, + "learning_rate": 0.0001, + "loss": 4.1137, + "loss/crossentropy": 2.2401771545410156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.202116459608078, + "step": 17690 + }, + { + "epoch": 0.35384, + "grad_norm": 1.9453125, + "grad_norm_var": 0.01602783203125, + "learning_rate": 0.0001, + "loss": 4.2248, + "loss/crossentropy": 2.192560911178589, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20161370187997818, + "step": 17692 + }, + { + "epoch": 0.35388, + "grad_norm": 2.078125, + "grad_norm_var": 0.01495361328125, + "learning_rate": 0.0001, + "loss": 4.175, + "loss/crossentropy": 2.31100332736969, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23697054386138916, + "step": 17694 + }, + { + "epoch": 0.35392, + "grad_norm": 2.171875, + "grad_norm_var": 0.014139811197916666, + "learning_rate": 0.0001, + "loss": 4.0894, + "loss/crossentropy": 2.018012821674347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1954345926642418, + "step": 17696 + }, + { + "epoch": 0.35396, + "grad_norm": 2.203125, + "grad_norm_var": 0.0168853759765625, + "learning_rate": 0.0001, + "loss": 4.3128, + "loss/crossentropy": 2.0922133326530457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2219238504767418, + "step": 17698 + }, + { + "epoch": 0.354, + "grad_norm": 2.03125, + "grad_norm_var": 0.01591796875, + "learning_rate": 0.0001, + "loss": 4.3803, + "loss/crossentropy": 2.727385640144348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21042727679014206, + "step": 17700 + }, + { + "epoch": 0.35404, + "grad_norm": 1.8984375, + "grad_norm_var": 0.014631144205729167, + "learning_rate": 0.0001, + "loss": 3.9535, + "loss/crossentropy": 1.8388070464134216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18624315410852432, + "step": 17702 + }, + { + "epoch": 0.35408, + "grad_norm": 2.03125, + "grad_norm_var": 0.008719889322916667, + "learning_rate": 0.0001, + "loss": 3.9327, + "loss/crossentropy": 1.879045307636261, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1827290803194046, + "step": 17704 + }, + { + "epoch": 0.35412, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0075266520182291664, + "learning_rate": 0.0001, + "loss": 4.2715, + "loss/crossentropy": 2.2604658603668213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22230257838964462, + "step": 17706 + }, + { + "epoch": 0.35416, + "grad_norm": 1.7890625, + "grad_norm_var": 0.01329345703125, + "learning_rate": 0.0001, + "loss": 3.7828, + "loss/crossentropy": 2.1200287342071533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20524511486291885, + "step": 17708 + }, + { + "epoch": 0.3542, + "grad_norm": 1.96875, + "grad_norm_var": 0.014135487874348958, + "learning_rate": 0.0001, + "loss": 3.9624, + "loss/crossentropy": 1.933307945728302, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19991742819547653, + "step": 17710 + }, + { + "epoch": 0.35424, + "grad_norm": 2.015625, + "grad_norm_var": 0.013231404622395833, + "learning_rate": 0.0001, + "loss": 4.0673, + "loss/crossentropy": 1.9442221522331238, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1861218512058258, + "step": 17712 + }, + { + "epoch": 0.35428, + "grad_norm": 1.90625, + "grad_norm_var": 0.008885701497395834, + "learning_rate": 0.0001, + "loss": 3.6592, + "loss/crossentropy": 1.7169482111930847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16892607510089874, + "step": 17714 + }, + { + "epoch": 0.35432, + "grad_norm": 1.9921875, + "grad_norm_var": 0.006891886393229167, + "learning_rate": 0.0001, + "loss": 4.3472, + "loss/crossentropy": 2.105340003967285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19026386737823486, + "step": 17716 + }, + { + "epoch": 0.35436, + "grad_norm": 1.90625, + "grad_norm_var": 0.0069976806640625, + "learning_rate": 0.0001, + "loss": 4.1666, + "loss/crossentropy": 2.0366504192352295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19709386676549911, + "step": 17718 + }, + { + "epoch": 0.3544, + "grad_norm": 2.078125, + "grad_norm_var": 0.007393391927083334, + "learning_rate": 0.0001, + "loss": 3.9829, + "loss/crossentropy": 1.9578897356987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19842654466629028, + "step": 17720 + }, + { + "epoch": 0.35444, + "grad_norm": 1.890625, + "grad_norm_var": 0.006888834635416666, + "learning_rate": 0.0001, + "loss": 3.8608, + "loss/crossentropy": 2.139336943626404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19530436396598816, + "step": 17722 + }, + { + "epoch": 0.35448, + "grad_norm": 2.140625, + "grad_norm_var": 0.009669748942057292, + "learning_rate": 0.0001, + "loss": 4.3803, + "loss/crossentropy": 2.4077011346817017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2209417074918747, + "step": 17724 + }, + { + "epoch": 0.35452, + "grad_norm": 1.953125, + "grad_norm_var": 0.009373982747395834, + "learning_rate": 0.0001, + "loss": 4.2412, + "loss/crossentropy": 2.1217586994171143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2052367851138115, + "step": 17726 + }, + { + "epoch": 0.35456, + "grad_norm": 1.90625, + "grad_norm_var": 0.009751129150390624, + "learning_rate": 0.0001, + "loss": 4.1872, + "loss/crossentropy": 2.078941583633423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2033359408378601, + "step": 17728 + }, + { + "epoch": 0.3546, + "grad_norm": 2.09375, + "grad_norm_var": 0.0072265625, + "learning_rate": 0.0001, + "loss": 4.3514, + "loss/crossentropy": 2.1763141751289368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22765995562076569, + "step": 17730 + }, + { + "epoch": 0.35464, + "grad_norm": 2.09375, + "grad_norm_var": 0.00784912109375, + "learning_rate": 0.0001, + "loss": 4.1094, + "loss/crossentropy": 1.83991938829422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18603162467479706, + "step": 17732 + }, + { + "epoch": 0.35468, + "grad_norm": 2.015625, + "grad_norm_var": 0.009132639567057291, + "learning_rate": 0.0001, + "loss": 3.93, + "loss/crossentropy": 2.021396040916443, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19647164642810822, + "step": 17734 + }, + { + "epoch": 0.35472, + "grad_norm": 2.0, + "grad_norm_var": 0.009297434488932292, + "learning_rate": 0.0001, + "loss": 4.2137, + "loss/crossentropy": 2.2339202165603638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20721730589866638, + "step": 17736 + }, + { + "epoch": 0.35476, + "grad_norm": 1.9921875, + "grad_norm_var": 0.008017730712890626, + "learning_rate": 0.0001, + "loss": 4.1859, + "loss/crossentropy": 2.2280211448669434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21610353142023087, + "step": 17738 + }, + { + "epoch": 0.3548, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0082672119140625, + "learning_rate": 0.0001, + "loss": 3.8836, + "loss/crossentropy": 1.8112387657165527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18565281480550766, + "step": 17740 + }, + { + "epoch": 0.35484, + "grad_norm": 2.140625, + "grad_norm_var": 0.009098307291666666, + "learning_rate": 0.0001, + "loss": 4.5429, + "loss/crossentropy": 2.1125651597976685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1949760466814041, + "step": 17742 + }, + { + "epoch": 0.35488, + "grad_norm": 1.96875, + "grad_norm_var": 0.009163411458333333, + "learning_rate": 0.0001, + "loss": 3.9613, + "loss/crossentropy": 2.1020091772079468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2126646414399147, + "step": 17744 + }, + { + "epoch": 0.35492, + "grad_norm": 2.046875, + "grad_norm_var": 0.010553995768229166, + "learning_rate": 0.0001, + "loss": 4.0522, + "loss/crossentropy": 1.8875654339790344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1913822665810585, + "step": 17746 + }, + { + "epoch": 0.35496, + "grad_norm": 2.140625, + "grad_norm_var": 0.011921946207682292, + "learning_rate": 0.0001, + "loss": 4.2408, + "loss/crossentropy": 2.1043163537979126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18798817694187164, + "step": 17748 + }, + { + "epoch": 0.355, + "grad_norm": 1.9921875, + "grad_norm_var": 0.012373606363932291, + "learning_rate": 0.0001, + "loss": 4.0744, + "loss/crossentropy": 2.0751022696495056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19110247492790222, + "step": 17750 + }, + { + "epoch": 0.35504, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011815388997395834, + "learning_rate": 0.0001, + "loss": 4.2077, + "loss/crossentropy": 2.0335286259651184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18653328716754913, + "step": 17752 + }, + { + "epoch": 0.35508, + "grad_norm": 2.140625, + "grad_norm_var": 0.012230428059895833, + "learning_rate": 0.0001, + "loss": 4.2833, + "loss/crossentropy": 2.091266691684723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19435560703277588, + "step": 17754 + }, + { + "epoch": 0.35512, + "grad_norm": 2.265625, + "grad_norm_var": 0.015702056884765624, + "learning_rate": 0.0001, + "loss": 4.4468, + "loss/crossentropy": 2.2889565229415894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21760451793670654, + "step": 17756 + }, + { + "epoch": 0.35516, + "grad_norm": 2.140625, + "grad_norm_var": 0.0162017822265625, + "learning_rate": 0.0001, + "loss": 4.1216, + "loss/crossentropy": 1.818089485168457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18759652972221375, + "step": 17758 + }, + { + "epoch": 0.3552, + "grad_norm": 1.9453125, + "grad_norm_var": 0.015531158447265625, + "learning_rate": 0.0001, + "loss": 4.0322, + "loss/crossentropy": 1.8579466938972473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20318903028964996, + "step": 17760 + }, + { + "epoch": 0.35524, + "grad_norm": 2.140625, + "grad_norm_var": 0.0152008056640625, + "learning_rate": 0.0001, + "loss": 4.058, + "loss/crossentropy": 2.0463815927505493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20810134708881378, + "step": 17762 + }, + { + "epoch": 0.35528, + "grad_norm": 1.859375, + "grad_norm_var": 0.0146392822265625, + "learning_rate": 0.0001, + "loss": 3.9702, + "loss/crossentropy": 1.8263658285140991, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1781037524342537, + "step": 17764 + }, + { + "epoch": 0.35532, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0123199462890625, + "learning_rate": 0.0001, + "loss": 4.0973, + "loss/crossentropy": 2.0662325620651245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2161395400762558, + "step": 17766 + }, + { + "epoch": 0.35536, + "grad_norm": 1.921875, + "grad_norm_var": 0.013598378499348958, + "learning_rate": 0.0001, + "loss": 4.1028, + "loss/crossentropy": 1.9901453256607056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20523327589035034, + "step": 17768 + }, + { + "epoch": 0.3554, + "grad_norm": 2.03125, + "grad_norm_var": 0.011766560872395833, + "learning_rate": 0.0001, + "loss": 4.1567, + "loss/crossentropy": 1.9877265095710754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20279201865196228, + "step": 17770 + }, + { + "epoch": 0.35544, + "grad_norm": 2.09375, + "grad_norm_var": 0.013695271809895833, + "learning_rate": 0.0001, + "loss": 4.5072, + "loss/crossentropy": 2.1376627683639526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20117300748825073, + "step": 17772 + }, + { + "epoch": 0.35548, + "grad_norm": 1.953125, + "grad_norm_var": 0.015327962239583333, + "learning_rate": 0.0001, + "loss": 3.8024, + "loss/crossentropy": 2.0174089074134827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2042396143078804, + "step": 17774 + }, + { + "epoch": 0.35552, + "grad_norm": 1.90625, + "grad_norm_var": 0.019090779622395835, + "learning_rate": 0.0001, + "loss": 3.7872, + "loss/crossentropy": 2.1650888919830322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2062518149614334, + "step": 17776 + }, + { + "epoch": 0.35556, + "grad_norm": 1.9140625, + "grad_norm_var": 0.017836252848307293, + "learning_rate": 0.0001, + "loss": 4.275, + "loss/crossentropy": 2.078373670578003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.205182746052742, + "step": 17778 + }, + { + "epoch": 0.3556, + "grad_norm": 1.9453125, + "grad_norm_var": 0.017693837483723957, + "learning_rate": 0.0001, + "loss": 3.7109, + "loss/crossentropy": 1.7496679425239563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17602672427892685, + "step": 17780 + }, + { + "epoch": 0.35564, + "grad_norm": 1.8984375, + "grad_norm_var": 0.01779963175455729, + "learning_rate": 0.0001, + "loss": 3.8767, + "loss/crossentropy": 2.3132810592651367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21808438748121262, + "step": 17782 + }, + { + "epoch": 0.35568, + "grad_norm": 2.046875, + "grad_norm_var": 0.017210896809895834, + "learning_rate": 0.0001, + "loss": 3.8465, + "loss/crossentropy": 1.9900661706924438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20264087617397308, + "step": 17784 + }, + { + "epoch": 0.35572, + "grad_norm": 2.03125, + "grad_norm_var": 0.01665013631184896, + "learning_rate": 0.0001, + "loss": 4.2735, + "loss/crossentropy": 2.095883369445801, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19925827533006668, + "step": 17786 + }, + { + "epoch": 0.35576, + "grad_norm": 1.8671875, + "grad_norm_var": 0.008771769205729167, + "learning_rate": 0.0001, + "loss": 3.9558, + "loss/crossentropy": 1.9932443499565125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1864292174577713, + "step": 17788 + }, + { + "epoch": 0.3558, + "grad_norm": 2.0625, + "grad_norm_var": 0.008455149332682292, + "learning_rate": 0.0001, + "loss": 4.0811, + "loss/crossentropy": 1.8468709588050842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18770359456539154, + "step": 17790 + }, + { + "epoch": 0.35584, + "grad_norm": 1.921875, + "grad_norm_var": 0.006502278645833333, + "learning_rate": 0.0001, + "loss": 4.1749, + "loss/crossentropy": 2.293928623199463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20280499011278152, + "step": 17792 + }, + { + "epoch": 0.35588, + "grad_norm": 2.0, + "grad_norm_var": 0.00618896484375, + "learning_rate": 0.0001, + "loss": 4.0155, + "loss/crossentropy": 1.8333890438079834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20070593804121017, + "step": 17794 + }, + { + "epoch": 0.35592, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0061767578125, + "learning_rate": 0.0001, + "loss": 3.9903, + "loss/crossentropy": 1.7173805236816406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17676317691802979, + "step": 17796 + }, + { + "epoch": 0.35596, + "grad_norm": 1.953125, + "grad_norm_var": 0.0058977762858072914, + "learning_rate": 0.0001, + "loss": 3.8196, + "loss/crossentropy": 1.8198468685150146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20926716923713684, + "step": 17798 + }, + { + "epoch": 0.356, + "grad_norm": 2.015625, + "grad_norm_var": 0.0054840087890625, + "learning_rate": 0.0001, + "loss": 4.2473, + "loss/crossentropy": 2.354674279689789, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23675478994846344, + "step": 17800 + }, + { + "epoch": 0.35604, + "grad_norm": 1.96875, + "grad_norm_var": 0.005610911051432291, + "learning_rate": 0.0001, + "loss": 4.1092, + "loss/crossentropy": 2.055357277393341, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18840713798999786, + "step": 17802 + }, + { + "epoch": 0.35608, + "grad_norm": 1.8359375, + "grad_norm_var": 0.007089996337890625, + "learning_rate": 0.0001, + "loss": 4.1351, + "loss/crossentropy": 2.3461071252822876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22149299830198288, + "step": 17804 + }, + { + "epoch": 0.35612, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0083892822265625, + "learning_rate": 0.0001, + "loss": 4.165, + "loss/crossentropy": 2.2675124406814575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21928898990154266, + "step": 17806 + }, + { + "epoch": 0.35616, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007346343994140625, + "learning_rate": 0.0001, + "loss": 4.2397, + "loss/crossentropy": 2.239398717880249, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20473385602235794, + "step": 17808 + }, + { + "epoch": 0.3562, + "grad_norm": 1.8359375, + "grad_norm_var": 0.009262847900390624, + "learning_rate": 0.0001, + "loss": 3.8152, + "loss/crossentropy": 2.022661864757538, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1910046860575676, + "step": 17810 + }, + { + "epoch": 0.35624, + "grad_norm": 1.8984375, + "grad_norm_var": 0.009236653645833334, + "learning_rate": 0.0001, + "loss": 4.053, + "loss/crossentropy": 1.874055802822113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19950387626886368, + "step": 17812 + }, + { + "epoch": 0.35628, + "grad_norm": 1.9765625, + "grad_norm_var": 0.009178670247395833, + "learning_rate": 0.0001, + "loss": 4.058, + "loss/crossentropy": 1.7601851224899292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18487969785928726, + "step": 17814 + }, + { + "epoch": 0.35632, + "grad_norm": 1.984375, + "grad_norm_var": 0.008348592122395833, + "learning_rate": 0.0001, + "loss": 4.0428, + "loss/crossentropy": 1.8841391801834106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17906954884529114, + "step": 17816 + }, + { + "epoch": 0.35636, + "grad_norm": 1.7890625, + "grad_norm_var": 0.011092122395833333, + "learning_rate": 0.0001, + "loss": 3.6292, + "loss/crossentropy": 1.7313326597213745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16394374519586563, + "step": 17818 + }, + { + "epoch": 0.3564, + "grad_norm": 1.96875, + "grad_norm_var": 0.0076901753743489586, + "learning_rate": 0.0001, + "loss": 3.7718, + "loss/crossentropy": 2.094825506210327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20041480660438538, + "step": 17820 + }, + { + "epoch": 0.35644, + "grad_norm": 1.9296875, + "grad_norm_var": 0.004854329427083333, + "learning_rate": 0.0001, + "loss": 4.075, + "loss/crossentropy": 2.222295045852661, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19912637770175934, + "step": 17822 + }, + { + "epoch": 0.35648, + "grad_norm": 2.015625, + "grad_norm_var": 0.005980428059895833, + "learning_rate": 0.0001, + "loss": 3.971, + "loss/crossentropy": 2.1671608090400696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20070582628250122, + "step": 17824 + }, + { + "epoch": 0.35652, + "grad_norm": 2.03125, + "grad_norm_var": 0.006884511311848958, + "learning_rate": 0.0001, + "loss": 4.0805, + "loss/crossentropy": 2.164097785949707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2535782679915428, + "step": 17826 + }, + { + "epoch": 0.35656, + "grad_norm": 1.8203125, + "grad_norm_var": 0.008405558268229167, + "learning_rate": 0.0001, + "loss": 4.128, + "loss/crossentropy": 1.738038182258606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17469938844442368, + "step": 17828 + }, + { + "epoch": 0.3566, + "grad_norm": 1.96875, + "grad_norm_var": 0.010636393229166667, + "learning_rate": 0.0001, + "loss": 4.3157, + "loss/crossentropy": 2.3383474349975586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2087705433368683, + "step": 17830 + }, + { + "epoch": 0.35664, + "grad_norm": 1.7734375, + "grad_norm_var": 0.012870279947916667, + "learning_rate": 0.0001, + "loss": 3.854, + "loss/crossentropy": 2.0826202034950256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19418098032474518, + "step": 17832 + }, + { + "epoch": 0.35668, + "grad_norm": 2.109375, + "grad_norm_var": 0.011287434895833334, + "learning_rate": 0.0001, + "loss": 4.1733, + "loss/crossentropy": 1.943938970565796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20667240023612976, + "step": 17834 + }, + { + "epoch": 0.35672, + "grad_norm": 1.9375, + "grad_norm_var": 0.011226145426432292, + "learning_rate": 0.0001, + "loss": 4.1312, + "loss/crossentropy": 1.9756113290786743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1973763257265091, + "step": 17836 + }, + { + "epoch": 0.35676, + "grad_norm": 1.8515625, + "grad_norm_var": 0.012898763020833334, + "learning_rate": 0.0001, + "loss": 3.6895, + "loss/crossentropy": 1.8583598732948303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17464379966259003, + "step": 17838 + }, + { + "epoch": 0.3568, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011791737874348958, + "learning_rate": 0.0001, + "loss": 3.9421, + "loss/crossentropy": 1.9786911606788635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20051777362823486, + "step": 17840 + }, + { + "epoch": 0.35684, + "grad_norm": 1.984375, + "grad_norm_var": 0.010414377848307291, + "learning_rate": 0.0001, + "loss": 4.0804, + "loss/crossentropy": 2.1271785497665405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21754157543182373, + "step": 17842 + }, + { + "epoch": 0.35688, + "grad_norm": 1.828125, + "grad_norm_var": 0.009639485677083334, + "learning_rate": 0.0001, + "loss": 3.9739, + "loss/crossentropy": 2.264941096305847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20562931895256042, + "step": 17844 + }, + { + "epoch": 0.35692, + "grad_norm": 2.078125, + "grad_norm_var": 0.0083160400390625, + "learning_rate": 0.0001, + "loss": 4.3865, + "loss/crossentropy": 2.141028881072998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23258531093597412, + "step": 17846 + }, + { + "epoch": 0.35696, + "grad_norm": 2.203125, + "grad_norm_var": 0.009781901041666667, + "learning_rate": 0.0001, + "loss": 4.1484, + "loss/crossentropy": 1.9527946710586548, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24948758631944656, + "step": 17848 + }, + { + "epoch": 0.357, + "grad_norm": 2.03125, + "grad_norm_var": 0.008854166666666666, + "learning_rate": 0.0001, + "loss": 4.0745, + "loss/crossentropy": 1.9595746397972107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19834068417549133, + "step": 17850 + }, + { + "epoch": 0.35704, + "grad_norm": 1.8984375, + "grad_norm_var": 0.010396321614583334, + "learning_rate": 0.0001, + "loss": 4.1407, + "loss/crossentropy": 1.8414466977119446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19133952260017395, + "step": 17852 + }, + { + "epoch": 0.35708, + "grad_norm": 1.984375, + "grad_norm_var": 0.010326894124348958, + "learning_rate": 0.0001, + "loss": 3.6912, + "loss/crossentropy": 1.6589071154594421, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17579880356788635, + "step": 17854 + }, + { + "epoch": 0.35712, + "grad_norm": 1.8671875, + "grad_norm_var": 0.011213175455729167, + "learning_rate": 0.0001, + "loss": 3.8164, + "loss/crossentropy": 1.8462252020835876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19051598757505417, + "step": 17856 + }, + { + "epoch": 0.35716, + "grad_norm": 2.109375, + "grad_norm_var": 0.015083567301432291, + "learning_rate": 0.0001, + "loss": 3.9453, + "loss/crossentropy": 2.0473897457122803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21250004321336746, + "step": 17858 + }, + { + "epoch": 0.3572, + "grad_norm": 2.078125, + "grad_norm_var": 0.014664459228515624, + "learning_rate": 0.0001, + "loss": 4.1836, + "loss/crossentropy": 2.220176875591278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22177650034427643, + "step": 17860 + }, + { + "epoch": 0.35724, + "grad_norm": 1.9140625, + "grad_norm_var": 0.13489176432291666, + "learning_rate": 0.0001, + "loss": 3.5816, + "loss/crossentropy": 1.6774207949638367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16900332272052765, + "step": 17862 + }, + { + "epoch": 0.35728, + "grad_norm": 2.0, + "grad_norm_var": 0.13516616821289062, + "learning_rate": 0.0001, + "loss": 3.8075, + "loss/crossentropy": 1.9064326286315918, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1925061270594597, + "step": 17864 + }, + { + "epoch": 0.35732, + "grad_norm": 2.15625, + "grad_norm_var": 0.1366607666015625, + "learning_rate": 0.0001, + "loss": 4.1698, + "loss/crossentropy": 2.188231110572815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20198386162519455, + "step": 17866 + }, + { + "epoch": 0.35736, + "grad_norm": 2.140625, + "grad_norm_var": 0.13647359212239582, + "learning_rate": 0.0001, + "loss": 4.0141, + "loss/crossentropy": 1.9380639791488647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19545384496450424, + "step": 17868 + }, + { + "epoch": 0.3574, + "grad_norm": 2.015625, + "grad_norm_var": 0.13407389322916666, + "learning_rate": 0.0001, + "loss": 4.1965, + "loss/crossentropy": 1.995704710483551, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19666064530611038, + "step": 17870 + }, + { + "epoch": 0.35744, + "grad_norm": 1.96875, + "grad_norm_var": 0.13017349243164061, + "learning_rate": 0.0001, + "loss": 4.0191, + "loss/crossentropy": 2.2148354053497314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23265502601861954, + "step": 17872 + }, + { + "epoch": 0.35748, + "grad_norm": 2.03125, + "grad_norm_var": 0.1243072509765625, + "learning_rate": 0.0001, + "loss": 4.0021, + "loss/crossentropy": 1.7091269493103027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21242853999137878, + "step": 17874 + }, + { + "epoch": 0.35752, + "grad_norm": 2.109375, + "grad_norm_var": 0.12219950358072916, + "learning_rate": 0.0001, + "loss": 4.3081, + "loss/crossentropy": 2.2395507097244263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21505478769540787, + "step": 17876 + }, + { + "epoch": 0.35756, + "grad_norm": 1.953125, + "grad_norm_var": 0.008512369791666667, + "learning_rate": 0.0001, + "loss": 3.8636, + "loss/crossentropy": 2.014355480670929, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20956294238567352, + "step": 17878 + }, + { + "epoch": 0.3576, + "grad_norm": 2.296875, + "grad_norm_var": 0.012383778889973959, + "learning_rate": 0.0001, + "loss": 4.3642, + "loss/crossentropy": 2.3606066703796387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22014011442661285, + "step": 17880 + }, + { + "epoch": 0.35764, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011368560791015624, + "learning_rate": 0.0001, + "loss": 3.9368, + "loss/crossentropy": 1.6152977347373962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1630660966038704, + "step": 17882 + }, + { + "epoch": 0.35768, + "grad_norm": 1.953125, + "grad_norm_var": 0.011138661702473959, + "learning_rate": 0.0001, + "loss": 4.156, + "loss/crossentropy": 2.1977567076683044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20369882881641388, + "step": 17884 + }, + { + "epoch": 0.35772, + "grad_norm": 1.8359375, + "grad_norm_var": 0.012359364827473959, + "learning_rate": 0.0001, + "loss": 3.9927, + "loss/crossentropy": 1.9582098126411438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20284517109394073, + "step": 17886 + }, + { + "epoch": 0.35776, + "grad_norm": 1.8515625, + "grad_norm_var": 0.013818105061848959, + "learning_rate": 0.0001, + "loss": 4.108, + "loss/crossentropy": 2.1300426721572876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19376185536384583, + "step": 17888 + }, + { + "epoch": 0.3578, + "grad_norm": 2.03125, + "grad_norm_var": 0.01883519490559896, + "learning_rate": 0.0001, + "loss": 4.3648, + "loss/crossentropy": 2.057590961456299, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21069013327360153, + "step": 17890 + }, + { + "epoch": 0.35784, + "grad_norm": 2.046875, + "grad_norm_var": 0.01968994140625, + "learning_rate": 0.0001, + "loss": 4.0006, + "loss/crossentropy": 2.21635901927948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19591110199689865, + "step": 17892 + }, + { + "epoch": 0.35788, + "grad_norm": 2.109375, + "grad_norm_var": 0.019809722900390625, + "learning_rate": 0.0001, + "loss": 4.4868, + "loss/crossentropy": 2.6530216932296753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24127788096666336, + "step": 17894 + }, + { + "epoch": 0.35792, + "grad_norm": 1.9921875, + "grad_norm_var": 0.013256581624348958, + "learning_rate": 0.0001, + "loss": 4.0727, + "loss/crossentropy": 1.9931264519691467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20592481642961502, + "step": 17896 + }, + { + "epoch": 0.35796, + "grad_norm": 2.0, + "grad_norm_var": 0.013305409749348959, + "learning_rate": 0.0001, + "loss": 4.1574, + "loss/crossentropy": 2.178193688392639, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2204412743449211, + "step": 17898 + }, + { + "epoch": 0.358, + "grad_norm": 1.96875, + "grad_norm_var": 0.012878163655598959, + "learning_rate": 0.0001, + "loss": 4.1748, + "loss/crossentropy": 2.192083954811096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22313552349805832, + "step": 17900 + }, + { + "epoch": 0.35804, + "grad_norm": 1.9375, + "grad_norm_var": 0.011281077067057292, + "learning_rate": 0.0001, + "loss": 4.1695, + "loss/crossentropy": 2.228816568851471, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2120378389954567, + "step": 17902 + }, + { + "epoch": 0.35808, + "grad_norm": 1.9140625, + "grad_norm_var": 0.010871378580729167, + "learning_rate": 0.0001, + "loss": 4.1369, + "loss/crossentropy": 2.0921813249588013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19552303105592728, + "step": 17904 + }, + { + "epoch": 0.35812, + "grad_norm": 1.9609375, + "grad_norm_var": 0.004687245686848958, + "learning_rate": 0.0001, + "loss": 4.2206, + "loss/crossentropy": 2.0568217635154724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19668184220790863, + "step": 17906 + }, + { + "epoch": 0.35816, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0038937886555989584, + "learning_rate": 0.0001, + "loss": 3.9946, + "loss/crossentropy": 1.8241485357284546, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15895915031433105, + "step": 17908 + }, + { + "epoch": 0.3582, + "grad_norm": 1.859375, + "grad_norm_var": 0.003082021077473958, + "learning_rate": 0.0001, + "loss": 3.9904, + "loss/crossentropy": 2.2571341395378113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2065516859292984, + "step": 17910 + }, + { + "epoch": 0.35824, + "grad_norm": 1.890625, + "grad_norm_var": 0.0032297770182291665, + "learning_rate": 0.0001, + "loss": 3.9289, + "loss/crossentropy": 2.0472288727760315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21586833149194717, + "step": 17912 + }, + { + "epoch": 0.35828, + "grad_norm": 1.984375, + "grad_norm_var": 0.0035947163899739585, + "learning_rate": 0.0001, + "loss": 4.0652, + "loss/crossentropy": 2.0345569252967834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19726714491844177, + "step": 17914 + }, + { + "epoch": 0.35832, + "grad_norm": 1.984375, + "grad_norm_var": 0.0037127176920572916, + "learning_rate": 0.0001, + "loss": 4.0294, + "loss/crossentropy": 2.15024471282959, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20401738584041595, + "step": 17916 + }, + { + "epoch": 0.35836, + "grad_norm": 2.046875, + "grad_norm_var": 0.004874420166015625, + "learning_rate": 0.0001, + "loss": 4.3122, + "loss/crossentropy": 2.110986351966858, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2299470156431198, + "step": 17918 + }, + { + "epoch": 0.3584, + "grad_norm": 2.03125, + "grad_norm_var": 0.003897857666015625, + "learning_rate": 0.0001, + "loss": 4.2147, + "loss/crossentropy": 1.7297720909118652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18035240471363068, + "step": 17920 + }, + { + "epoch": 0.35844, + "grad_norm": 2.3125, + "grad_norm_var": 0.01236572265625, + "learning_rate": 0.0001, + "loss": 4.3997, + "loss/crossentropy": 1.9483368396759033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2060721442103386, + "step": 17922 + }, + { + "epoch": 0.35848, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0123443603515625, + "learning_rate": 0.0001, + "loss": 4.1527, + "loss/crossentropy": 1.9840999841690063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20700833201408386, + "step": 17924 + }, + { + "epoch": 0.35852, + "grad_norm": 2.0625, + "grad_norm_var": 0.011525217692057292, + "learning_rate": 0.0001, + "loss": 4.1754, + "loss/crossentropy": 2.4118131399154663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20695140957832336, + "step": 17926 + }, + { + "epoch": 0.35856, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011195627848307292, + "learning_rate": 0.0001, + "loss": 3.9363, + "loss/crossentropy": 1.9709432721138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18202026933431625, + "step": 17928 + }, + { + "epoch": 0.3586, + "grad_norm": 2.078125, + "grad_norm_var": 0.010206858317057291, + "learning_rate": 0.0001, + "loss": 4.2972, + "loss/crossentropy": 1.9450251460075378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21454186737537384, + "step": 17930 + }, + { + "epoch": 0.35864, + "grad_norm": 2.03125, + "grad_norm_var": 0.0094390869140625, + "learning_rate": 0.0001, + "loss": 4.3317, + "loss/crossentropy": 2.0738271474838257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21376954019069672, + "step": 17932 + }, + { + "epoch": 0.35868, + "grad_norm": 2.03125, + "grad_norm_var": 0.010109202067057291, + "learning_rate": 0.0001, + "loss": 4.1613, + "loss/crossentropy": 2.1857110261917114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2238282486796379, + "step": 17934 + }, + { + "epoch": 0.35872, + "grad_norm": 1.96875, + "grad_norm_var": 0.010643513997395833, + "learning_rate": 0.0001, + "loss": 4.2208, + "loss/crossentropy": 2.2790093421936035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1987098902463913, + "step": 17936 + }, + { + "epoch": 0.35876, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0054280598958333336, + "learning_rate": 0.0001, + "loss": 3.9261, + "loss/crossentropy": 2.1101399064064026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2034989446401596, + "step": 17938 + }, + { + "epoch": 0.3588, + "grad_norm": 1.8203125, + "grad_norm_var": 0.005793253580729167, + "learning_rate": 0.0001, + "loss": 3.7024, + "loss/crossentropy": 1.6923771500587463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18680892139673233, + "step": 17940 + }, + { + "epoch": 0.35884, + "grad_norm": 1.8515625, + "grad_norm_var": 0.005342356363932292, + "learning_rate": 0.0001, + "loss": 3.9282, + "loss/crossentropy": 1.6772454977035522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17377988994121552, + "step": 17942 + }, + { + "epoch": 0.35888, + "grad_norm": 1.921875, + "grad_norm_var": 0.007722981770833333, + "learning_rate": 0.0001, + "loss": 4.141, + "loss/crossentropy": 2.2448233366012573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21804184466600418, + "step": 17944 + }, + { + "epoch": 0.35892, + "grad_norm": 1.953125, + "grad_norm_var": 0.006688435872395833, + "learning_rate": 0.0001, + "loss": 3.9984, + "loss/crossentropy": 1.934161365032196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1834414005279541, + "step": 17946 + }, + { + "epoch": 0.35896, + "grad_norm": 2.0, + "grad_norm_var": 0.007344563802083333, + "learning_rate": 0.0001, + "loss": 3.9943, + "loss/crossentropy": 2.06991970539093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21084094047546387, + "step": 17948 + }, + { + "epoch": 0.359, + "grad_norm": 1.984375, + "grad_norm_var": 0.007916005452473958, + "learning_rate": 0.0001, + "loss": 4.0493, + "loss/crossentropy": 2.0942054986953735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21652275323867798, + "step": 17950 + }, + { + "epoch": 0.35904, + "grad_norm": 2.078125, + "grad_norm_var": 0.009134928385416666, + "learning_rate": 0.0001, + "loss": 4.1125, + "loss/crossentropy": 2.0005252361297607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.198953777551651, + "step": 17952 + }, + { + "epoch": 0.35908, + "grad_norm": 2.609375, + "grad_norm_var": 0.0327301025390625, + "learning_rate": 0.0001, + "loss": 4.1639, + "loss/crossentropy": 2.1563133597373962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2072513848543167, + "step": 17954 + }, + { + "epoch": 0.35912, + "grad_norm": 2.109375, + "grad_norm_var": 0.0312896728515625, + "learning_rate": 0.0001, + "loss": 4.1789, + "loss/crossentropy": 2.3533977270126343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21483703702688217, + "step": 17956 + }, + { + "epoch": 0.35916, + "grad_norm": 1.9375, + "grad_norm_var": 0.029581705729166668, + "learning_rate": 0.0001, + "loss": 4.0921, + "loss/crossentropy": 2.1915602684020996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2241670861840248, + "step": 17958 + }, + { + "epoch": 0.3592, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0314117431640625, + "learning_rate": 0.0001, + "loss": 3.9225, + "loss/crossentropy": 1.9529941082000732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1920214593410492, + "step": 17960 + }, + { + "epoch": 0.35924, + "grad_norm": 2.015625, + "grad_norm_var": 0.030350748697916666, + "learning_rate": 0.0001, + "loss": 4.4467, + "loss/crossentropy": 2.308731436729431, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2238362655043602, + "step": 17962 + }, + { + "epoch": 0.35928, + "grad_norm": 2.03125, + "grad_norm_var": 0.04334208170572917, + "learning_rate": 0.0001, + "loss": 4.1682, + "loss/crossentropy": 2.0917986631393433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2061680108308792, + "step": 17964 + }, + { + "epoch": 0.35932, + "grad_norm": 2.265625, + "grad_norm_var": 10.265104166666667, + "learning_rate": 0.0001, + "loss": 4.9501, + "loss/crossentropy": 2.16925585269928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20239810645580292, + "step": 17966 + }, + { + "epoch": 0.35936, + "grad_norm": 2.0, + "grad_norm_var": 10.219252268473307, + "learning_rate": 0.0001, + "loss": 4.287, + "loss/crossentropy": 2.1585946083068848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21381714940071106, + "step": 17968 + }, + { + "epoch": 0.3594, + "grad_norm": 1.8359375, + "grad_norm_var": 10.281192016601562, + "learning_rate": 0.0001, + "loss": 4.0133, + "loss/crossentropy": 1.8752552270889282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19667387753725052, + "step": 17970 + }, + { + "epoch": 0.35944, + "grad_norm": 1.9140625, + "grad_norm_var": 10.291275024414062, + "learning_rate": 0.0001, + "loss": 3.9797, + "loss/crossentropy": 1.8533543944358826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1844579428434372, + "step": 17972 + }, + { + "epoch": 0.35948, + "grad_norm": 2.234375, + "grad_norm_var": 10.282754516601562, + "learning_rate": 0.0001, + "loss": 4.5316, + "loss/crossentropy": 2.5859906673431396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23142188042402267, + "step": 17974 + }, + { + "epoch": 0.35952, + "grad_norm": 2.15625, + "grad_norm_var": 10.236201985677083, + "learning_rate": 0.0001, + "loss": 4.2607, + "loss/crossentropy": 2.2086023092269897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21764708310365677, + "step": 17976 + }, + { + "epoch": 0.35956, + "grad_norm": 1.953125, + "grad_norm_var": 10.258858235677083, + "learning_rate": 0.0001, + "loss": 4.2578, + "loss/crossentropy": 2.3860682249069214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22804389894008636, + "step": 17978 + }, + { + "epoch": 0.3596, + "grad_norm": 2.0, + "grad_norm_var": 10.314815266927083, + "learning_rate": 0.0001, + "loss": 4.2055, + "loss/crossentropy": 2.1412216424942017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.200544573366642, + "step": 17980 + }, + { + "epoch": 0.35964, + "grad_norm": 1.9140625, + "grad_norm_var": 0.022395833333333334, + "learning_rate": 0.0001, + "loss": 4.0412, + "loss/crossentropy": 2.198704957962036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2043847218155861, + "step": 17982 + }, + { + "epoch": 0.35968, + "grad_norm": 1.90625, + "grad_norm_var": 0.010249837239583334, + "learning_rate": 0.0001, + "loss": 4.037, + "loss/crossentropy": 2.0053776502609253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2072744369506836, + "step": 17984 + }, + { + "epoch": 0.35972, + "grad_norm": 2.0625, + "grad_norm_var": 0.04221979777018229, + "learning_rate": 0.0001, + "loss": 4.127, + "loss/crossentropy": 2.5015710592269897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23083895444869995, + "step": 17986 + }, + { + "epoch": 0.35976, + "grad_norm": 2.015625, + "grad_norm_var": 0.04145889282226563, + "learning_rate": 0.0001, + "loss": 4.0531, + "loss/crossentropy": 2.037220776081085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19487662613391876, + "step": 17988 + }, + { + "epoch": 0.3598, + "grad_norm": 1.921875, + "grad_norm_var": 0.03911107381184896, + "learning_rate": 0.0001, + "loss": 3.9801, + "loss/crossentropy": 1.9316660165786743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20183449983596802, + "step": 17990 + }, + { + "epoch": 0.35984, + "grad_norm": 1.828125, + "grad_norm_var": 0.04090067545572917, + "learning_rate": 0.0001, + "loss": 3.804, + "loss/crossentropy": 1.892772138118744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17082024365663528, + "step": 17992 + }, + { + "epoch": 0.35988, + "grad_norm": 1.9453125, + "grad_norm_var": 0.04088109334309896, + "learning_rate": 0.0001, + "loss": 4.0732, + "loss/crossentropy": 1.9325169324874878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1900368332862854, + "step": 17994 + }, + { + "epoch": 0.35992, + "grad_norm": 2.421875, + "grad_norm_var": 0.050388336181640625, + "learning_rate": 0.0001, + "loss": 4.1392, + "loss/crossentropy": 1.9746126532554626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21207977831363678, + "step": 17996 + }, + { + "epoch": 0.35996, + "grad_norm": 2.03125, + "grad_norm_var": 0.048620351155598956, + "learning_rate": 0.0001, + "loss": 4.0308, + "loss/crossentropy": 2.1249493956565857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18674498051404953, + "step": 17998 + }, + { + "epoch": 0.36, + "grad_norm": 2.046875, + "grad_norm_var": 0.04625422159830729, + "learning_rate": 0.0001, + "loss": 4.0854, + "loss/crossentropy": 2.1279499530792236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.214716374874115, + "step": 18000 + }, + { + "epoch": 0.36004, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0177398681640625, + "learning_rate": 0.0001, + "loss": 3.9238, + "loss/crossentropy": 2.301763415336609, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20897136628627777, + "step": 18002 + }, + { + "epoch": 0.36008, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01788330078125, + "learning_rate": 0.0001, + "loss": 4.0944, + "loss/crossentropy": 1.8079062104225159, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19506293535232544, + "step": 18004 + }, + { + "epoch": 0.36012, + "grad_norm": 1.890625, + "grad_norm_var": 0.0205078125, + "learning_rate": 0.0001, + "loss": 4.0503, + "loss/crossentropy": 1.9429230093955994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19762147217988968, + "step": 18006 + }, + { + "epoch": 0.36016, + "grad_norm": 1.9296875, + "grad_norm_var": 0.019608561197916666, + "learning_rate": 0.0001, + "loss": 3.9801, + "loss/crossentropy": 2.307368576526642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21261631697416306, + "step": 18008 + }, + { + "epoch": 0.3602, + "grad_norm": 1.8203125, + "grad_norm_var": 0.021952311197916668, + "learning_rate": 0.0001, + "loss": 3.8723, + "loss/crossentropy": 2.2834020853042603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20086131989955902, + "step": 18010 + }, + { + "epoch": 0.36024, + "grad_norm": 1.875, + "grad_norm_var": 0.0117340087890625, + "learning_rate": 0.0001, + "loss": 4.1708, + "loss/crossentropy": 2.1730109453201294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20470792055130005, + "step": 18012 + }, + { + "epoch": 0.36028, + "grad_norm": 2.109375, + "grad_norm_var": 0.012809244791666667, + "learning_rate": 0.0001, + "loss": 4.4047, + "loss/crossentropy": 2.491591691970825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2361597716808319, + "step": 18014 + }, + { + "epoch": 0.36032, + "grad_norm": 1.9296875, + "grad_norm_var": 0.013240305582682292, + "learning_rate": 0.0001, + "loss": 4.167, + "loss/crossentropy": 2.024519979953766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2042919620871544, + "step": 18016 + }, + { + "epoch": 0.36036, + "grad_norm": 1.7734375, + "grad_norm_var": 0.016123199462890626, + "learning_rate": 0.0001, + "loss": 3.9521, + "loss/crossentropy": 2.055600941181183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17253537476062775, + "step": 18018 + }, + { + "epoch": 0.3604, + "grad_norm": 2.046875, + "grad_norm_var": 0.0188629150390625, + "learning_rate": 0.0001, + "loss": 3.9665, + "loss/crossentropy": 1.8796368837356567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21168731898069382, + "step": 18020 + }, + { + "epoch": 0.36044, + "grad_norm": 1.9140625, + "grad_norm_var": 0.014825185139973959, + "learning_rate": 0.0001, + "loss": 4.0291, + "loss/crossentropy": 1.9885223507881165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20676057040691376, + "step": 18022 + }, + { + "epoch": 0.36048, + "grad_norm": 1.90625, + "grad_norm_var": 0.010839589436848958, + "learning_rate": 0.0001, + "loss": 3.8401, + "loss/crossentropy": 2.0412577986717224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18211868405342102, + "step": 18024 + }, + { + "epoch": 0.36052, + "grad_norm": 1.828125, + "grad_norm_var": 0.010445149739583333, + "learning_rate": 0.0001, + "loss": 3.8696, + "loss/crossentropy": 2.1391053199768066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20121745765209198, + "step": 18026 + }, + { + "epoch": 0.36056, + "grad_norm": 2.046875, + "grad_norm_var": 0.010087076822916667, + "learning_rate": 0.0001, + "loss": 3.9683, + "loss/crossentropy": 1.8888981938362122, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19372491538524628, + "step": 18028 + }, + { + "epoch": 0.3606, + "grad_norm": 2.015625, + "grad_norm_var": 0.007087198893229166, + "learning_rate": 0.0001, + "loss": 4.2467, + "loss/crossentropy": 2.002028524875641, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18978165835142136, + "step": 18030 + }, + { + "epoch": 0.36064, + "grad_norm": 1.828125, + "grad_norm_var": 0.007828776041666667, + "learning_rate": 0.0001, + "loss": 4.0703, + "loss/crossentropy": 2.3448036909103394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21162079274654388, + "step": 18032 + }, + { + "epoch": 0.36068, + "grad_norm": 2.5625, + "grad_norm_var": 0.03154474894205729, + "learning_rate": 0.0001, + "loss": 4.283, + "loss/crossentropy": 2.086554765701294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25456031411886215, + "step": 18034 + }, + { + "epoch": 0.36072, + "grad_norm": 1.8671875, + "grad_norm_var": 0.02948582967122396, + "learning_rate": 0.0001, + "loss": 3.8122, + "loss/crossentropy": 1.9166680574417114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18382181227207184, + "step": 18036 + }, + { + "epoch": 0.36076, + "grad_norm": 1.8984375, + "grad_norm_var": 0.029642740885416668, + "learning_rate": 0.0001, + "loss": 3.9937, + "loss/crossentropy": 2.050579786300659, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20126450806856155, + "step": 18038 + }, + { + "epoch": 0.3608, + "grad_norm": 1.9453125, + "grad_norm_var": 0.030049641927083332, + "learning_rate": 0.0001, + "loss": 3.9273, + "loss/crossentropy": 2.2339882850646973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20789660513401031, + "step": 18040 + }, + { + "epoch": 0.36084, + "grad_norm": 1.8984375, + "grad_norm_var": 0.029243977864583333, + "learning_rate": 0.0001, + "loss": 4.0086, + "loss/crossentropy": 2.248544931411743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20925041288137436, + "step": 18042 + }, + { + "epoch": 0.36088, + "grad_norm": 2.046875, + "grad_norm_var": 0.028955078125, + "learning_rate": 0.0001, + "loss": 4.0448, + "loss/crossentropy": 1.8894963264465332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18232882767915726, + "step": 18044 + }, + { + "epoch": 0.36092, + "grad_norm": 2.046875, + "grad_norm_var": 0.0289703369140625, + "learning_rate": 0.0001, + "loss": 4.1838, + "loss/crossentropy": 2.0220844745635986, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18812591582536697, + "step": 18046 + }, + { + "epoch": 0.36096, + "grad_norm": 1.9921875, + "grad_norm_var": 0.027913411458333332, + "learning_rate": 0.0001, + "loss": 4.2088, + "loss/crossentropy": 2.1896166801452637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.205679252743721, + "step": 18048 + }, + { + "epoch": 0.361, + "grad_norm": 2.046875, + "grad_norm_var": 0.004686482747395833, + "learning_rate": 0.0001, + "loss": 4.0511, + "loss/crossentropy": 2.0670089721679688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18881987780332565, + "step": 18050 + }, + { + "epoch": 0.36104, + "grad_norm": 2.046875, + "grad_norm_var": 0.005500284830729166, + "learning_rate": 0.0001, + "loss": 4.2426, + "loss/crossentropy": 2.155470609664917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2147108018398285, + "step": 18052 + }, + { + "epoch": 0.36108, + "grad_norm": 2.15625, + "grad_norm_var": 0.0076812744140625, + "learning_rate": 0.0001, + "loss": 4.2007, + "loss/crossentropy": 1.8182223439216614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18691811710596085, + "step": 18054 + }, + { + "epoch": 0.36112, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0066314697265625, + "learning_rate": 0.0001, + "loss": 3.8601, + "loss/crossentropy": 1.7366089820861816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18361609429121017, + "step": 18056 + }, + { + "epoch": 0.36116, + "grad_norm": 1.96875, + "grad_norm_var": 0.005704498291015625, + "learning_rate": 0.0001, + "loss": 4.027, + "loss/crossentropy": 1.9429153203964233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1974082887172699, + "step": 18058 + }, + { + "epoch": 0.3612, + "grad_norm": 1.9453125, + "grad_norm_var": 0.00562744140625, + "learning_rate": 0.0001, + "loss": 4.2751, + "loss/crossentropy": 2.230627417564392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22216492146253586, + "step": 18060 + }, + { + "epoch": 0.36124, + "grad_norm": 1.9140625, + "grad_norm_var": 0.006371053059895834, + "learning_rate": 0.0001, + "loss": 4.1759, + "loss/crossentropy": 2.0506786704063416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20492341369390488, + "step": 18062 + }, + { + "epoch": 0.36128, + "grad_norm": 1.8984375, + "grad_norm_var": 0.007116444905598958, + "learning_rate": 0.0001, + "loss": 3.778, + "loss/crossentropy": 2.025477647781372, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20034795254468918, + "step": 18064 + }, + { + "epoch": 0.36132, + "grad_norm": 2.046875, + "grad_norm_var": 0.006617991129557291, + "learning_rate": 0.0001, + "loss": 4.2998, + "loss/crossentropy": 1.9828922748565674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19445167481899261, + "step": 18066 + }, + { + "epoch": 0.36136, + "grad_norm": 2.046875, + "grad_norm_var": 0.006485748291015625, + "learning_rate": 0.0001, + "loss": 4.2566, + "loss/crossentropy": 2.2869513630867004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21809721738100052, + "step": 18068 + }, + { + "epoch": 0.3614, + "grad_norm": 1.875, + "grad_norm_var": 0.004129791259765625, + "learning_rate": 0.0001, + "loss": 3.9636, + "loss/crossentropy": 2.1584482192993164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21123671531677246, + "step": 18070 + }, + { + "epoch": 0.36144, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0046078999837239586, + "learning_rate": 0.0001, + "loss": 3.9628, + "loss/crossentropy": 2.112669587135315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20795845240354538, + "step": 18072 + }, + { + "epoch": 0.36148, + "grad_norm": 2.09375, + "grad_norm_var": 0.0059234619140625, + "learning_rate": 0.0001, + "loss": 4.2516, + "loss/crossentropy": 2.3283581733703613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22724319994449615, + "step": 18074 + }, + { + "epoch": 0.36152, + "grad_norm": 1.8671875, + "grad_norm_var": 0.006078084309895833, + "learning_rate": 0.0001, + "loss": 3.8966, + "loss/crossentropy": 2.286331057548523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20937074720859528, + "step": 18076 + }, + { + "epoch": 0.36156, + "grad_norm": 1.7578125, + "grad_norm_var": 0.009110260009765624, + "learning_rate": 0.0001, + "loss": 4.1124, + "loss/crossentropy": 2.1055954694747925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1924988478422165, + "step": 18078 + }, + { + "epoch": 0.3616, + "grad_norm": 1.9921875, + "grad_norm_var": 0.008186594645182291, + "learning_rate": 0.0001, + "loss": 4.0349, + "loss/crossentropy": 2.180675983428955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21886169910430908, + "step": 18080 + }, + { + "epoch": 0.36164, + "grad_norm": 2.078125, + "grad_norm_var": 0.008621978759765624, + "learning_rate": 0.0001, + "loss": 3.9958, + "loss/crossentropy": 1.7595775127410889, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17430071532726288, + "step": 18082 + }, + { + "epoch": 0.36168, + "grad_norm": 1.921875, + "grad_norm_var": 0.011472320556640625, + "learning_rate": 0.0001, + "loss": 4.0306, + "loss/crossentropy": 2.1191208958625793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19574576616287231, + "step": 18084 + }, + { + "epoch": 0.36172, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0110595703125, + "learning_rate": 0.0001, + "loss": 4.1563, + "loss/crossentropy": 2.0964609384536743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19742169976234436, + "step": 18086 + }, + { + "epoch": 0.36176, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0108154296875, + "learning_rate": 0.0001, + "loss": 4.3584, + "loss/crossentropy": 2.3081077337265015, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2177809551358223, + "step": 18088 + }, + { + "epoch": 0.3618, + "grad_norm": 1.9453125, + "grad_norm_var": 0.009928385416666666, + "learning_rate": 0.0001, + "loss": 4.0493, + "loss/crossentropy": 2.340154528617859, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23811470717191696, + "step": 18090 + }, + { + "epoch": 0.36184, + "grad_norm": 1.9140625, + "grad_norm_var": 0.009777577718098958, + "learning_rate": 0.0001, + "loss": 3.828, + "loss/crossentropy": 1.9872968196868896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19391798973083496, + "step": 18092 + }, + { + "epoch": 0.36188, + "grad_norm": 1.984375, + "grad_norm_var": 0.0067779541015625, + "learning_rate": 0.0001, + "loss": 3.9404, + "loss/crossentropy": 1.9162002205848694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.203052818775177, + "step": 18094 + }, + { + "epoch": 0.36192, + "grad_norm": 1.7890625, + "grad_norm_var": 0.009718577067057291, + "learning_rate": 0.0001, + "loss": 3.7458, + "loss/crossentropy": 2.240887403488159, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19722212105989456, + "step": 18096 + }, + { + "epoch": 0.36196, + "grad_norm": 2.078125, + "grad_norm_var": 0.009496053059895834, + "learning_rate": 0.0001, + "loss": 4.1473, + "loss/crossentropy": 1.8601738214492798, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1938677802681923, + "step": 18098 + }, + { + "epoch": 0.362, + "grad_norm": 1.84375, + "grad_norm_var": 0.005729166666666666, + "learning_rate": 0.0001, + "loss": 4.0131, + "loss/crossentropy": 1.9588143229484558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19559185206890106, + "step": 18100 + }, + { + "epoch": 0.36204, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0060618082682291664, + "learning_rate": 0.0001, + "loss": 3.9199, + "loss/crossentropy": 1.909518837928772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1974424198269844, + "step": 18102 + }, + { + "epoch": 0.36208, + "grad_norm": 1.8203125, + "grad_norm_var": 0.005641428629557291, + "learning_rate": 0.0001, + "loss": 4.15, + "loss/crossentropy": 2.0762908458709717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1981927454471588, + "step": 18104 + }, + { + "epoch": 0.36212, + "grad_norm": 2.015625, + "grad_norm_var": 0.006339263916015625, + "learning_rate": 0.0001, + "loss": 4.3226, + "loss/crossentropy": 2.4555106163024902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22001618146896362, + "step": 18106 + }, + { + "epoch": 0.36216, + "grad_norm": 1.7734375, + "grad_norm_var": 0.013533528645833333, + "learning_rate": 0.0001, + "loss": 4.0595, + "loss/crossentropy": 1.9983150959014893, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19945750385522842, + "step": 18108 + }, + { + "epoch": 0.3622, + "grad_norm": 1.90625, + "grad_norm_var": 0.013181304931640625, + "learning_rate": 0.0001, + "loss": 4.0863, + "loss/crossentropy": 2.152313530445099, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21388111263513565, + "step": 18110 + }, + { + "epoch": 0.36224, + "grad_norm": 2.125, + "grad_norm_var": 0.016886393229166668, + "learning_rate": 0.0001, + "loss": 4.1128, + "loss/crossentropy": 1.690669596195221, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1883523240685463, + "step": 18112 + }, + { + "epoch": 0.36228, + "grad_norm": 2.09375, + "grad_norm_var": 0.017179107666015624, + "learning_rate": 0.0001, + "loss": 3.9978, + "loss/crossentropy": 1.9404500126838684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18945956230163574, + "step": 18114 + }, + { + "epoch": 0.36232, + "grad_norm": 1.9765625, + "grad_norm_var": 0.016007486979166666, + "learning_rate": 0.0001, + "loss": 4.4396, + "loss/crossentropy": 2.213471293449402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22550443559885025, + "step": 18116 + }, + { + "epoch": 0.36236, + "grad_norm": 2.015625, + "grad_norm_var": 0.014233144124348958, + "learning_rate": 0.0001, + "loss": 4.3084, + "loss/crossentropy": 2.1008135080337524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2002386674284935, + "step": 18118 + }, + { + "epoch": 0.3624, + "grad_norm": 1.8359375, + "grad_norm_var": 0.013866933186848958, + "learning_rate": 0.0001, + "loss": 3.8664, + "loss/crossentropy": 1.9704426527023315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1806606948375702, + "step": 18120 + }, + { + "epoch": 0.36244, + "grad_norm": 1.890625, + "grad_norm_var": 0.01473388671875, + "learning_rate": 0.0001, + "loss": 3.7456, + "loss/crossentropy": 1.8658949732780457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19861871004104614, + "step": 18122 + }, + { + "epoch": 0.36248, + "grad_norm": 2.0625, + "grad_norm_var": 0.011034901936848958, + "learning_rate": 0.0001, + "loss": 4.0341, + "loss/crossentropy": 1.9327979683876038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19395866990089417, + "step": 18124 + }, + { + "epoch": 0.36252, + "grad_norm": 1.984375, + "grad_norm_var": 0.009641265869140625, + "learning_rate": 0.0001, + "loss": 4.071, + "loss/crossentropy": 2.377102255821228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21928569674491882, + "step": 18126 + }, + { + "epoch": 0.36256, + "grad_norm": 2.0625, + "grad_norm_var": 0.006994374593098958, + "learning_rate": 0.0001, + "loss": 3.9623, + "loss/crossentropy": 2.1650161743164062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2292800396680832, + "step": 18128 + }, + { + "epoch": 0.3626, + "grad_norm": 2.078125, + "grad_norm_var": 0.007458241780598959, + "learning_rate": 0.0001, + "loss": 4.3135, + "loss/crossentropy": 2.281827986240387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2190621793270111, + "step": 18130 + }, + { + "epoch": 0.36264, + "grad_norm": 1.9140625, + "grad_norm_var": 0.00848388671875, + "learning_rate": 0.0001, + "loss": 3.8335, + "loss/crossentropy": 2.0073219537734985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18984179198741913, + "step": 18132 + }, + { + "epoch": 0.36268, + "grad_norm": 2.0625, + "grad_norm_var": 0.01141357421875, + "learning_rate": 0.0001, + "loss": 3.7551, + "loss/crossentropy": 1.7675580978393555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18771683424711227, + "step": 18134 + }, + { + "epoch": 0.36272, + "grad_norm": 1.8671875, + "grad_norm_var": 0.011154937744140624, + "learning_rate": 0.0001, + "loss": 3.8816, + "loss/crossentropy": 2.064103126525879, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20835159718990326, + "step": 18136 + }, + { + "epoch": 0.36276, + "grad_norm": 2.140625, + "grad_norm_var": 0.0126861572265625, + "learning_rate": 0.0001, + "loss": 4.2122, + "loss/crossentropy": 2.1821314096450806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2391536980867386, + "step": 18138 + }, + { + "epoch": 0.3628, + "grad_norm": 1.8984375, + "grad_norm_var": 0.010117340087890624, + "learning_rate": 0.0001, + "loss": 4.2307, + "loss/crossentropy": 2.295500636100769, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.215906023979187, + "step": 18140 + }, + { + "epoch": 0.36284, + "grad_norm": 1.9609375, + "grad_norm_var": 0.010245768229166667, + "learning_rate": 0.0001, + "loss": 4.1144, + "loss/crossentropy": 2.0736570954322815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18491360545158386, + "step": 18142 + }, + { + "epoch": 0.36288, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009496053059895834, + "learning_rate": 0.0001, + "loss": 4.1326, + "loss/crossentropy": 2.0513535737991333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20204297453165054, + "step": 18144 + }, + { + "epoch": 0.36292, + "grad_norm": 2.140625, + "grad_norm_var": 0.010601552327473958, + "learning_rate": 0.0001, + "loss": 4.1648, + "loss/crossentropy": 2.1471269130706787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.204594686627388, + "step": 18146 + }, + { + "epoch": 0.36296, + "grad_norm": 2.125, + "grad_norm_var": 0.011372884114583334, + "learning_rate": 0.0001, + "loss": 4.2728, + "loss/crossentropy": 2.093555986881256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20173487067222595, + "step": 18148 + }, + { + "epoch": 0.363, + "grad_norm": 1.6953125, + "grad_norm_var": 0.013627115885416667, + "learning_rate": 0.0001, + "loss": 3.7183, + "loss/crossentropy": 1.6938685178756714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1703190803527832, + "step": 18150 + }, + { + "epoch": 0.36304, + "grad_norm": 2.046875, + "grad_norm_var": 0.013337961832682292, + "learning_rate": 0.0001, + "loss": 4.116, + "loss/crossentropy": 2.180016875267029, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23546195775270462, + "step": 18152 + }, + { + "epoch": 0.36308, + "grad_norm": 2.015625, + "grad_norm_var": 0.011277008056640624, + "learning_rate": 0.0001, + "loss": 4.1826, + "loss/crossentropy": 2.145975947380066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20554804801940918, + "step": 18154 + }, + { + "epoch": 0.36312, + "grad_norm": 1.953125, + "grad_norm_var": 0.011131795247395833, + "learning_rate": 0.0001, + "loss": 4.0113, + "loss/crossentropy": 2.068985939025879, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20410801470279694, + "step": 18156 + }, + { + "epoch": 0.36316, + "grad_norm": 1.8671875, + "grad_norm_var": 0.012572224934895833, + "learning_rate": 0.0001, + "loss": 3.8273, + "loss/crossentropy": 1.8428975343704224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19165785610675812, + "step": 18158 + }, + { + "epoch": 0.3632, + "grad_norm": 1.875, + "grad_norm_var": 0.0131744384765625, + "learning_rate": 0.0001, + "loss": 3.7808, + "loss/crossentropy": 1.859747588634491, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18524104356765747, + "step": 18160 + }, + { + "epoch": 0.36324, + "grad_norm": 2.0, + "grad_norm_var": 0.011521148681640624, + "learning_rate": 0.0001, + "loss": 4.1458, + "loss/crossentropy": 2.1383343935012817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21649780869483948, + "step": 18162 + }, + { + "epoch": 0.36328, + "grad_norm": 2.078125, + "grad_norm_var": 0.010758209228515624, + "learning_rate": 0.0001, + "loss": 3.9889, + "loss/crossentropy": 2.0153123140335083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2131347879767418, + "step": 18164 + }, + { + "epoch": 0.36332, + "grad_norm": 1.875, + "grad_norm_var": 0.007004547119140625, + "learning_rate": 0.0001, + "loss": 4.0827, + "loss/crossentropy": 1.8382813930511475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18116765469312668, + "step": 18166 + }, + { + "epoch": 0.36336, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007287343343098958, + "learning_rate": 0.0001, + "loss": 3.8883, + "loss/crossentropy": 1.9639039039611816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2014933079481125, + "step": 18168 + }, + { + "epoch": 0.3634, + "grad_norm": 2.21875, + "grad_norm_var": 0.012446848551432292, + "learning_rate": 0.0001, + "loss": 4.2886, + "loss/crossentropy": 2.057366132736206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2134721800684929, + "step": 18170 + }, + { + "epoch": 0.36344, + "grad_norm": 1.953125, + "grad_norm_var": 0.012690989176432292, + "learning_rate": 0.0001, + "loss": 4.0801, + "loss/crossentropy": 2.244979500770569, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.219241164624691, + "step": 18172 + }, + { + "epoch": 0.36348, + "grad_norm": 1.921875, + "grad_norm_var": 0.010625966389973958, + "learning_rate": 0.0001, + "loss": 4.1271, + "loss/crossentropy": 2.1159361600875854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19503474980592728, + "step": 18174 + }, + { + "epoch": 0.36352, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0099761962890625, + "learning_rate": 0.0001, + "loss": 4.2031, + "loss/crossentropy": 2.240646004676819, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22175125032663345, + "step": 18176 + }, + { + "epoch": 0.36356, + "grad_norm": 1.8984375, + "grad_norm_var": 0.011327107747395834, + "learning_rate": 0.0001, + "loss": 3.9508, + "loss/crossentropy": 1.7946885228157043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17505639791488647, + "step": 18178 + }, + { + "epoch": 0.3636, + "grad_norm": 1.9296875, + "grad_norm_var": 0.010420735677083333, + "learning_rate": 0.0001, + "loss": 4.1138, + "loss/crossentropy": 2.2058286666870117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21548640727996826, + "step": 18180 + }, + { + "epoch": 0.36364, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0101226806640625, + "learning_rate": 0.0001, + "loss": 4.0831, + "loss/crossentropy": 2.168944835662842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22227579355239868, + "step": 18182 + }, + { + "epoch": 0.36368, + "grad_norm": 1.96875, + "grad_norm_var": 0.008392079671223959, + "learning_rate": 0.0001, + "loss": 4.1482, + "loss/crossentropy": 2.019882082939148, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20161078870296478, + "step": 18184 + }, + { + "epoch": 0.36372, + "grad_norm": 1.9375, + "grad_norm_var": 0.0034993489583333335, + "learning_rate": 0.0001, + "loss": 4.1305, + "loss/crossentropy": 2.000952959060669, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19938969612121582, + "step": 18186 + }, + { + "epoch": 0.36376, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0024920145670572916, + "learning_rate": 0.0001, + "loss": 3.9158, + "loss/crossentropy": 2.0612798929214478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2022939696907997, + "step": 18188 + }, + { + "epoch": 0.3638, + "grad_norm": 1.78125, + "grad_norm_var": 0.0042307535807291664, + "learning_rate": 0.0001, + "loss": 3.8822, + "loss/crossentropy": 1.664880096912384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16950316727161407, + "step": 18190 + }, + { + "epoch": 0.36384, + "grad_norm": 1.875, + "grad_norm_var": 0.003360748291015625, + "learning_rate": 0.0001, + "loss": 3.9881, + "loss/crossentropy": 1.8047854900360107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1823924407362938, + "step": 18192 + }, + { + "epoch": 0.36388, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0060808817545572914, + "learning_rate": 0.0001, + "loss": 3.7857, + "loss/crossentropy": 2.167177438735962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1963154897093773, + "step": 18194 + }, + { + "epoch": 0.36392, + "grad_norm": 1.875, + "grad_norm_var": 0.006151326497395833, + "learning_rate": 0.0001, + "loss": 4.0372, + "loss/crossentropy": 2.396019458770752, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20359576493501663, + "step": 18196 + }, + { + "epoch": 0.36396, + "grad_norm": 1.9375, + "grad_norm_var": 0.005866495768229166, + "learning_rate": 0.0001, + "loss": 3.9349, + "loss/crossentropy": 2.0866541862487793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2082398235797882, + "step": 18198 + }, + { + "epoch": 0.364, + "grad_norm": 2.015625, + "grad_norm_var": 0.0060791015625, + "learning_rate": 0.0001, + "loss": 4.1046, + "loss/crossentropy": 2.170537829399109, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19030719250440598, + "step": 18200 + }, + { + "epoch": 0.36404, + "grad_norm": 2.328125, + "grad_norm_var": 0.024234771728515625, + "learning_rate": 0.0001, + "loss": 4.3921, + "loss/crossentropy": 1.938852846622467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19586428999900818, + "step": 18202 + }, + { + "epoch": 0.36408, + "grad_norm": 1.9453125, + "grad_norm_var": 0.023996734619140626, + "learning_rate": 0.0001, + "loss": 4.0629, + "loss/crossentropy": 1.8909979462623596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17625273764133453, + "step": 18204 + }, + { + "epoch": 0.36412, + "grad_norm": 1.9296875, + "grad_norm_var": 0.022078450520833334, + "learning_rate": 0.0001, + "loss": 4.1729, + "loss/crossentropy": 1.9127016067504883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1888885200023651, + "step": 18206 + }, + { + "epoch": 0.36416, + "grad_norm": 1.9296875, + "grad_norm_var": 0.03103612263997396, + "learning_rate": 0.0001, + "loss": 4.1826, + "loss/crossentropy": 2.182482957839966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21610775589942932, + "step": 18208 + }, + { + "epoch": 0.3642, + "grad_norm": 1.8828125, + "grad_norm_var": 0.027497355143229166, + "learning_rate": 0.0001, + "loss": 4.1941, + "loss/crossentropy": 2.0900736451148987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20886409282684326, + "step": 18210 + }, + { + "epoch": 0.36424, + "grad_norm": 2.0, + "grad_norm_var": 0.0248199462890625, + "learning_rate": 0.0001, + "loss": 3.9184, + "loss/crossentropy": 2.2132604122161865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21213480830192566, + "step": 18212 + }, + { + "epoch": 0.36428, + "grad_norm": 2.171875, + "grad_norm_var": 0.024825032552083334, + "learning_rate": 0.0001, + "loss": 4.0002, + "loss/crossentropy": 1.7810762524604797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.188262477517128, + "step": 18214 + }, + { + "epoch": 0.36432, + "grad_norm": 1.984375, + "grad_norm_var": 0.02490208943684896, + "learning_rate": 0.0001, + "loss": 3.8339, + "loss/crossentropy": 2.072141647338867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20580865442752838, + "step": 18216 + }, + { + "epoch": 0.36436, + "grad_norm": 2.125, + "grad_norm_var": 0.016778310139973957, + "learning_rate": 0.0001, + "loss": 4.0777, + "loss/crossentropy": 1.9672082662582397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19521142542362213, + "step": 18218 + }, + { + "epoch": 0.3644, + "grad_norm": 1.890625, + "grad_norm_var": 0.017574055989583334, + "learning_rate": 0.0001, + "loss": 4.0763, + "loss/crossentropy": 2.0122682452201843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21602813154459, + "step": 18220 + }, + { + "epoch": 0.36444, + "grad_norm": 2.078125, + "grad_norm_var": 0.0169342041015625, + "learning_rate": 0.0001, + "loss": 4.0574, + "loss/crossentropy": 2.1709738969802856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2267712950706482, + "step": 18222 + }, + { + "epoch": 0.36448, + "grad_norm": 2.03125, + "grad_norm_var": 0.009091949462890625, + "learning_rate": 0.0001, + "loss": 4.2092, + "loss/crossentropy": 2.246786117553711, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2128008008003235, + "step": 18224 + }, + { + "epoch": 0.36452, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0063250223795572914, + "learning_rate": 0.0001, + "loss": 4.1051, + "loss/crossentropy": 1.686498999595642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18233107775449753, + "step": 18226 + }, + { + "epoch": 0.36456, + "grad_norm": 2.078125, + "grad_norm_var": 0.006786855061848959, + "learning_rate": 0.0001, + "loss": 4.3823, + "loss/crossentropy": 2.5849136114120483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23062889277935028, + "step": 18228 + }, + { + "epoch": 0.3646, + "grad_norm": 1.921875, + "grad_norm_var": 0.005804189046223958, + "learning_rate": 0.0001, + "loss": 4.04, + "loss/crossentropy": 2.068212151527405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20198806375265121, + "step": 18230 + }, + { + "epoch": 0.36464, + "grad_norm": 1.9140625, + "grad_norm_var": 0.00618896484375, + "learning_rate": 0.0001, + "loss": 3.8729, + "loss/crossentropy": 1.727245271205902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17305339127779007, + "step": 18232 + }, + { + "epoch": 0.36468, + "grad_norm": 2.078125, + "grad_norm_var": 0.005574289957682292, + "learning_rate": 0.0001, + "loss": 4.0496, + "loss/crossentropy": 2.0269583463668823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2060471773147583, + "step": 18234 + }, + { + "epoch": 0.36472, + "grad_norm": 2.015625, + "grad_norm_var": 0.004801177978515625, + "learning_rate": 0.0001, + "loss": 4.1707, + "loss/crossentropy": 1.9194093346595764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19781427085399628, + "step": 18236 + }, + { + "epoch": 0.36476, + "grad_norm": 2.078125, + "grad_norm_var": 0.0055328369140625, + "learning_rate": 0.0001, + "loss": 4.1438, + "loss/crossentropy": 1.7539438605308533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19985094666481018, + "step": 18238 + }, + { + "epoch": 0.3648, + "grad_norm": 1.890625, + "grad_norm_var": 0.006068674723307291, + "learning_rate": 0.0001, + "loss": 4.1287, + "loss/crossentropy": 2.1092851161956787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23365512490272522, + "step": 18240 + }, + { + "epoch": 0.36484, + "grad_norm": 1.984375, + "grad_norm_var": 0.00771484375, + "learning_rate": 0.0001, + "loss": 3.6435, + "loss/crossentropy": 1.772037386894226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18215186148881912, + "step": 18242 + }, + { + "epoch": 0.36488, + "grad_norm": 2.109375, + "grad_norm_var": 0.007845052083333333, + "learning_rate": 0.0001, + "loss": 4.1017, + "loss/crossentropy": 2.0108843445777893, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20684240013360977, + "step": 18244 + }, + { + "epoch": 0.36492, + "grad_norm": 2.109375, + "grad_norm_var": 0.008348592122395833, + "learning_rate": 0.0001, + "loss": 4.0077, + "loss/crossentropy": 2.0076091289520264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1842496171593666, + "step": 18246 + }, + { + "epoch": 0.36496, + "grad_norm": 1.9921875, + "grad_norm_var": 0.008085123697916667, + "learning_rate": 0.0001, + "loss": 4.1357, + "loss/crossentropy": 2.1824593544006348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19478252530097961, + "step": 18248 + }, + { + "epoch": 0.365, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0068318684895833336, + "learning_rate": 0.0001, + "loss": 4.2231, + "loss/crossentropy": 2.1033846139907837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21361663192510605, + "step": 18250 + }, + { + "epoch": 0.36504, + "grad_norm": 1.84375, + "grad_norm_var": 0.009299468994140626, + "learning_rate": 0.0001, + "loss": 3.7738, + "loss/crossentropy": 2.142418146133423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20618261396884918, + "step": 18252 + }, + { + "epoch": 0.36508, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008131663004557291, + "learning_rate": 0.0001, + "loss": 4.1037, + "loss/crossentropy": 2.061814546585083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20519836992025375, + "step": 18254 + }, + { + "epoch": 0.36512, + "grad_norm": 2.015625, + "grad_norm_var": 0.008512115478515625, + "learning_rate": 0.0001, + "loss": 4.0512, + "loss/crossentropy": 2.0216987133026123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1956964135169983, + "step": 18256 + }, + { + "epoch": 0.36516, + "grad_norm": 2.03125, + "grad_norm_var": 0.006681315104166667, + "learning_rate": 0.0001, + "loss": 4.301, + "loss/crossentropy": 2.2619231939315796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21638523787260056, + "step": 18258 + }, + { + "epoch": 0.3652, + "grad_norm": 2.09375, + "grad_norm_var": 0.006281534830729167, + "learning_rate": 0.0001, + "loss": 4.165, + "loss/crossentropy": 1.8058243989944458, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19046328961849213, + "step": 18260 + }, + { + "epoch": 0.36524, + "grad_norm": 2.046875, + "grad_norm_var": 0.005639394124348958, + "learning_rate": 0.0001, + "loss": 4.1876, + "loss/crossentropy": 2.0969032049179077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20291081070899963, + "step": 18262 + }, + { + "epoch": 0.36528, + "grad_norm": 1.8125, + "grad_norm_var": 0.006794993082682292, + "learning_rate": 0.0001, + "loss": 3.8754, + "loss/crossentropy": 1.9567083716392517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19624605774879456, + "step": 18264 + }, + { + "epoch": 0.36532, + "grad_norm": 1.921875, + "grad_norm_var": 0.006493123372395834, + "learning_rate": 0.0001, + "loss": 3.9861, + "loss/crossentropy": 1.9015939235687256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18771862238645554, + "step": 18266 + }, + { + "epoch": 0.36536, + "grad_norm": 1.9375, + "grad_norm_var": 0.017575836181640624, + "learning_rate": 0.0001, + "loss": 4.0288, + "loss/crossentropy": 1.9432410597801208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1785590723156929, + "step": 18268 + }, + { + "epoch": 0.3654, + "grad_norm": 2.0, + "grad_norm_var": 0.017438761393229165, + "learning_rate": 0.0001, + "loss": 4.3256, + "loss/crossentropy": 2.216074585914612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21463461220264435, + "step": 18270 + }, + { + "epoch": 0.36544, + "grad_norm": 1.8203125, + "grad_norm_var": 0.020401763916015624, + "learning_rate": 0.0001, + "loss": 3.7192, + "loss/crossentropy": 1.5440006256103516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1621650904417038, + "step": 18272 + }, + { + "epoch": 0.36548, + "grad_norm": 1.8203125, + "grad_norm_var": 0.02197850545247396, + "learning_rate": 0.0001, + "loss": 3.9072, + "loss/crossentropy": 1.9478511214256287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20275350660085678, + "step": 18274 + }, + { + "epoch": 0.36552, + "grad_norm": 2.0, + "grad_norm_var": 0.02217381795247396, + "learning_rate": 0.0001, + "loss": 4.1986, + "loss/crossentropy": 2.16433984041214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21180611103773117, + "step": 18276 + }, + { + "epoch": 0.36556, + "grad_norm": 2.015625, + "grad_norm_var": 0.02182184855143229, + "learning_rate": 0.0001, + "loss": 4.0345, + "loss/crossentropy": 1.9628196954727173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.198894202709198, + "step": 18278 + }, + { + "epoch": 0.3656, + "grad_norm": 1.8984375, + "grad_norm_var": 0.020409901936848957, + "learning_rate": 0.0001, + "loss": 4.0616, + "loss/crossentropy": 1.7818017601966858, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19511079788208008, + "step": 18280 + }, + { + "epoch": 0.36564, + "grad_norm": 2.15625, + "grad_norm_var": 0.02315241495768229, + "learning_rate": 0.0001, + "loss": 4.1769, + "loss/crossentropy": 2.0149444341659546, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2128366306424141, + "step": 18282 + }, + { + "epoch": 0.36568, + "grad_norm": 1.8671875, + "grad_norm_var": 0.010351308186848958, + "learning_rate": 0.0001, + "loss": 4.0475, + "loss/crossentropy": 2.1580519676208496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21166586130857468, + "step": 18284 + }, + { + "epoch": 0.36572, + "grad_norm": 1.859375, + "grad_norm_var": 0.03432591756184896, + "learning_rate": 0.0001, + "loss": 3.9196, + "loss/crossentropy": 2.308629631996155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20283856242895126, + "step": 18286 + }, + { + "epoch": 0.36576, + "grad_norm": 1.9296875, + "grad_norm_var": 0.03199055989583333, + "learning_rate": 0.0001, + "loss": 4.0587, + "loss/crossentropy": 2.1761614084243774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.212914377450943, + "step": 18288 + }, + { + "epoch": 0.3658, + "grad_norm": 1.9453125, + "grad_norm_var": 0.029842122395833334, + "learning_rate": 0.0001, + "loss": 4.191, + "loss/crossentropy": 1.9851300120353699, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1898990124464035, + "step": 18290 + }, + { + "epoch": 0.36584, + "grad_norm": 1.859375, + "grad_norm_var": 0.031172688802083334, + "learning_rate": 0.0001, + "loss": 4.028, + "loss/crossentropy": 2.1123871207237244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20982655137777328, + "step": 18292 + }, + { + "epoch": 0.36588, + "grad_norm": 2.0625, + "grad_norm_var": 0.03831761678059896, + "learning_rate": 0.0001, + "loss": 4.3007, + "loss/crossentropy": 2.2666051387786865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23328793793916702, + "step": 18294 + }, + { + "epoch": 0.36592, + "grad_norm": 1.9765625, + "grad_norm_var": 0.03904393513997396, + "learning_rate": 0.0001, + "loss": 3.8741, + "loss/crossentropy": 1.9444871544837952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20335105061531067, + "step": 18296 + }, + { + "epoch": 0.36596, + "grad_norm": 2.046875, + "grad_norm_var": 0.036717732747395836, + "learning_rate": 0.0001, + "loss": 4.0799, + "loss/crossentropy": 2.136604368686676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20185644924640656, + "step": 18298 + }, + { + "epoch": 0.366, + "grad_norm": 2.59375, + "grad_norm_var": 0.059024810791015625, + "learning_rate": 0.0001, + "loss": 4.0605, + "loss/crossentropy": 1.9029142260551453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24649157375097275, + "step": 18300 + }, + { + "epoch": 0.36604, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0400054931640625, + "learning_rate": 0.0001, + "loss": 4.1685, + "loss/crossentropy": 2.0363988876342773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20920708030462265, + "step": 18302 + }, + { + "epoch": 0.36608, + "grad_norm": 1.8984375, + "grad_norm_var": 0.04248860677083333, + "learning_rate": 0.0001, + "loss": 3.5556, + "loss/crossentropy": 1.8357294797897339, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18201576173305511, + "step": 18304 + }, + { + "epoch": 0.36612, + "grad_norm": 2.109375, + "grad_norm_var": 0.04253743489583333, + "learning_rate": 0.0001, + "loss": 4.3381, + "loss/crossentropy": 2.194098114967346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20245323330163956, + "step": 18306 + }, + { + "epoch": 0.36616, + "grad_norm": 2.03125, + "grad_norm_var": 0.0409332275390625, + "learning_rate": 0.0001, + "loss": 4.2501, + "loss/crossentropy": 2.3840869665145874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2243075668811798, + "step": 18308 + }, + { + "epoch": 0.3662, + "grad_norm": 1.9375, + "grad_norm_var": 0.034398396809895836, + "learning_rate": 0.0001, + "loss": 4.1895, + "loss/crossentropy": 2.3093236684799194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22531048208475113, + "step": 18310 + }, + { + "epoch": 0.36624, + "grad_norm": 1.9296875, + "grad_norm_var": 0.03472493489583333, + "learning_rate": 0.0001, + "loss": 3.7629, + "loss/crossentropy": 2.155013680458069, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19654065370559692, + "step": 18312 + }, + { + "epoch": 0.36628, + "grad_norm": 1.9453125, + "grad_norm_var": 0.03486226399739583, + "learning_rate": 0.0001, + "loss": 4.0241, + "loss/crossentropy": 1.8823603391647339, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18333254009485245, + "step": 18314 + }, + { + "epoch": 0.36632, + "grad_norm": 2.046875, + "grad_norm_var": 0.0090972900390625, + "learning_rate": 0.0001, + "loss": 3.8092, + "loss/crossentropy": 1.9217600226402283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18095016479492188, + "step": 18316 + }, + { + "epoch": 0.36636, + "grad_norm": 1.84375, + "grad_norm_var": 0.009004720052083333, + "learning_rate": 0.0001, + "loss": 3.8178, + "loss/crossentropy": 1.7448294758796692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18972519785165787, + "step": 18318 + }, + { + "epoch": 0.3664, + "grad_norm": 1.8359375, + "grad_norm_var": 0.008442942301432292, + "learning_rate": 0.0001, + "loss": 3.9394, + "loss/crossentropy": 1.8730336427688599, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18396812677383423, + "step": 18320 + }, + { + "epoch": 0.36644, + "grad_norm": 1.9921875, + "grad_norm_var": 0.006955718994140625, + "learning_rate": 0.0001, + "loss": 3.8763, + "loss/crossentropy": 2.075575351715088, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20251936465501785, + "step": 18322 + }, + { + "epoch": 0.36648, + "grad_norm": 1.96875, + "grad_norm_var": 0.004131825764973959, + "learning_rate": 0.0001, + "loss": 4.0955, + "loss/crossentropy": 2.083173990249634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2003224566578865, + "step": 18324 + }, + { + "epoch": 0.36652, + "grad_norm": 1.8671875, + "grad_norm_var": 0.00396728515625, + "learning_rate": 0.0001, + "loss": 4.0245, + "loss/crossentropy": 2.4401766061782837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21872484683990479, + "step": 18326 + }, + { + "epoch": 0.36656, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0038533528645833333, + "learning_rate": 0.0001, + "loss": 4.0249, + "loss/crossentropy": 2.2402881383895874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1977379471063614, + "step": 18328 + }, + { + "epoch": 0.3666, + "grad_norm": 1.921875, + "grad_norm_var": 0.004650624593098959, + "learning_rate": 0.0001, + "loss": 4.1047, + "loss/crossentropy": 2.2246369123458862, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21132111549377441, + "step": 18330 + }, + { + "epoch": 0.36664, + "grad_norm": 2.046875, + "grad_norm_var": 0.0043365478515625, + "learning_rate": 0.0001, + "loss": 4.1607, + "loss/crossentropy": 1.8048500418663025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18298982083797455, + "step": 18332 + }, + { + "epoch": 0.36668, + "grad_norm": 2.15625, + "grad_norm_var": 0.006514231363932292, + "learning_rate": 0.0001, + "loss": 4.0987, + "loss/crossentropy": 2.3368008136749268, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21743982285261154, + "step": 18334 + }, + { + "epoch": 0.36672, + "grad_norm": 2.234375, + "grad_norm_var": 0.010188547770182292, + "learning_rate": 0.0001, + "loss": 4.3158, + "loss/crossentropy": 2.3520134687423706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22603602707386017, + "step": 18336 + }, + { + "epoch": 0.36676, + "grad_norm": 2.140625, + "grad_norm_var": 0.011248524983723958, + "learning_rate": 0.0001, + "loss": 4.0037, + "loss/crossentropy": 2.0071199536323547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18584266304969788, + "step": 18338 + }, + { + "epoch": 0.3668, + "grad_norm": 2.234375, + "grad_norm_var": 0.019260406494140625, + "learning_rate": 0.0001, + "loss": 4.2787, + "loss/crossentropy": 1.9329636693000793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21350353956222534, + "step": 18340 + }, + { + "epoch": 0.36684, + "grad_norm": 2.0, + "grad_norm_var": 0.015990193684895834, + "learning_rate": 0.0001, + "loss": 4.0387, + "loss/crossentropy": 1.8231948018074036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2064230963587761, + "step": 18342 + }, + { + "epoch": 0.36688, + "grad_norm": 1.828125, + "grad_norm_var": 0.01605809529622396, + "learning_rate": 0.0001, + "loss": 4.0888, + "loss/crossentropy": 1.9928399324417114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1996312066912651, + "step": 18344 + }, + { + "epoch": 0.36692, + "grad_norm": 1.96875, + "grad_norm_var": 0.015916951497395835, + "learning_rate": 0.0001, + "loss": 4.1736, + "loss/crossentropy": 2.139783501625061, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20407474786043167, + "step": 18346 + }, + { + "epoch": 0.36696, + "grad_norm": 1.8984375, + "grad_norm_var": 0.017166900634765624, + "learning_rate": 0.0001, + "loss": 4.0357, + "loss/crossentropy": 1.8519954681396484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18404167890548706, + "step": 18348 + }, + { + "epoch": 0.367, + "grad_norm": 2.015625, + "grad_norm_var": 0.016361236572265625, + "learning_rate": 0.0001, + "loss": 4.1325, + "loss/crossentropy": 2.037365674972534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2384558469057083, + "step": 18350 + }, + { + "epoch": 0.36704, + "grad_norm": 1.875, + "grad_norm_var": 0.015600331624348958, + "learning_rate": 0.0001, + "loss": 3.887, + "loss/crossentropy": 2.0023937821388245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19700156897306442, + "step": 18352 + }, + { + "epoch": 0.36708, + "grad_norm": 2.125, + "grad_norm_var": 0.015282185872395833, + "learning_rate": 0.0001, + "loss": 4.0475, + "loss/crossentropy": 1.829107940196991, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19559869915246964, + "step": 18354 + }, + { + "epoch": 0.36712, + "grad_norm": 1.9453125, + "grad_norm_var": 0.006012980143229167, + "learning_rate": 0.0001, + "loss": 4.0962, + "loss/crossentropy": 2.206624150276184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2166636437177658, + "step": 18356 + }, + { + "epoch": 0.36716, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005907185872395833, + "learning_rate": 0.0001, + "loss": 4.1191, + "loss/crossentropy": 2.1971875429153442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20924220979213715, + "step": 18358 + }, + { + "epoch": 0.3672, + "grad_norm": 2.1875, + "grad_norm_var": 0.0069539388020833336, + "learning_rate": 0.0001, + "loss": 4.4876, + "loss/crossentropy": 2.554604172706604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2638286352157593, + "step": 18360 + }, + { + "epoch": 0.36724, + "grad_norm": 1.9375, + "grad_norm_var": 0.007981109619140624, + "learning_rate": 0.0001, + "loss": 3.9076, + "loss/crossentropy": 2.088346302509308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1948780044913292, + "step": 18362 + }, + { + "epoch": 0.36728, + "grad_norm": 1.875, + "grad_norm_var": 0.0078033447265625, + "learning_rate": 0.0001, + "loss": 4.0101, + "loss/crossentropy": 2.24162495136261, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21616502106189728, + "step": 18364 + }, + { + "epoch": 0.36732, + "grad_norm": 1.8515625, + "grad_norm_var": 0.008567047119140626, + "learning_rate": 0.0001, + "loss": 3.8693, + "loss/crossentropy": 2.249878764152527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2312733307480812, + "step": 18366 + }, + { + "epoch": 0.36736, + "grad_norm": 1.921875, + "grad_norm_var": 0.0083404541015625, + "learning_rate": 0.0001, + "loss": 4.0711, + "loss/crossentropy": 2.0440531969070435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18561603128910065, + "step": 18368 + }, + { + "epoch": 0.3674, + "grad_norm": 1.9921875, + "grad_norm_var": 0.006696573893229167, + "learning_rate": 0.0001, + "loss": 4.2582, + "loss/crossentropy": 2.2324228286743164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20741964876651764, + "step": 18370 + }, + { + "epoch": 0.36744, + "grad_norm": 2.125, + "grad_norm_var": 0.008634440104166667, + "learning_rate": 0.0001, + "loss": 4.2807, + "loss/crossentropy": 2.168972373008728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.215408056974411, + "step": 18372 + }, + { + "epoch": 0.36748, + "grad_norm": 1.890625, + "grad_norm_var": 0.008955637613932291, + "learning_rate": 0.0001, + "loss": 4.0341, + "loss/crossentropy": 2.076392412185669, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2023996263742447, + "step": 18374 + }, + { + "epoch": 0.36752, + "grad_norm": 2.046875, + "grad_norm_var": 0.005968983968098958, + "learning_rate": 0.0001, + "loss": 4.2958, + "loss/crossentropy": 2.0497928857803345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21541842073202133, + "step": 18376 + }, + { + "epoch": 0.36756, + "grad_norm": 1.6875, + "grad_norm_var": 0.0102294921875, + "learning_rate": 0.0001, + "loss": 3.6889, + "loss/crossentropy": 1.8894451260566711, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18294879794120789, + "step": 18378 + }, + { + "epoch": 0.3676, + "grad_norm": 1.984375, + "grad_norm_var": 0.010545857747395833, + "learning_rate": 0.0001, + "loss": 4.0237, + "loss/crossentropy": 2.1654014587402344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18372679501771927, + "step": 18380 + }, + { + "epoch": 0.36764, + "grad_norm": 2.1875, + "grad_norm_var": 0.013516998291015625, + "learning_rate": 0.0001, + "loss": 4.0657, + "loss/crossentropy": 2.203396499156952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22593200951814651, + "step": 18382 + }, + { + "epoch": 0.36768, + "grad_norm": 2.03125, + "grad_norm_var": 0.014090728759765626, + "learning_rate": 0.0001, + "loss": 3.946, + "loss/crossentropy": 1.7496543526649475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1825375333428383, + "step": 18384 + }, + { + "epoch": 0.36772, + "grad_norm": 1.875, + "grad_norm_var": 0.014249420166015625, + "learning_rate": 0.0001, + "loss": 3.7667, + "loss/crossentropy": 1.9488004446029663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18244057893753052, + "step": 18386 + }, + { + "epoch": 0.36776, + "grad_norm": 1.9453125, + "grad_norm_var": 0.01231689453125, + "learning_rate": 0.0001, + "loss": 3.9406, + "loss/crossentropy": 1.9792875051498413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20204880088567734, + "step": 18388 + }, + { + "epoch": 0.3678, + "grad_norm": 1.890625, + "grad_norm_var": 0.012412261962890626, + "learning_rate": 0.0001, + "loss": 4.0077, + "loss/crossentropy": 1.890614092350006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19333010911941528, + "step": 18390 + }, + { + "epoch": 0.36784, + "grad_norm": 1.96875, + "grad_norm_var": 0.011832427978515626, + "learning_rate": 0.0001, + "loss": 4.0615, + "loss/crossentropy": 2.0901471972465515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20534122735261917, + "step": 18392 + }, + { + "epoch": 0.36788, + "grad_norm": 2.109375, + "grad_norm_var": 0.00716552734375, + "learning_rate": 0.0001, + "loss": 4.2502, + "loss/crossentropy": 2.2931089401245117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2130991816520691, + "step": 18394 + }, + { + "epoch": 0.36792, + "grad_norm": 2.078125, + "grad_norm_var": 0.0077392578125, + "learning_rate": 0.0001, + "loss": 3.9862, + "loss/crossentropy": 2.2144237756729126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22273198515176773, + "step": 18396 + }, + { + "epoch": 0.36796, + "grad_norm": 1.8046875, + "grad_norm_var": 0.006624094645182292, + "learning_rate": 0.0001, + "loss": 3.7365, + "loss/crossentropy": 1.9135422110557556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1983061358332634, + "step": 18398 + }, + { + "epoch": 0.368, + "grad_norm": 2.0, + "grad_norm_var": 0.007456207275390625, + "learning_rate": 0.0001, + "loss": 3.7556, + "loss/crossentropy": 1.692852795124054, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1670849658548832, + "step": 18400 + }, + { + "epoch": 0.36804, + "grad_norm": 2.09375, + "grad_norm_var": 0.0083160400390625, + "learning_rate": 0.0001, + "loss": 4.0043, + "loss/crossentropy": 1.908652126789093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21787738054990768, + "step": 18402 + }, + { + "epoch": 0.36808, + "grad_norm": 2.015625, + "grad_norm_var": 0.008432769775390625, + "learning_rate": 0.0001, + "loss": 4.2343, + "loss/crossentropy": 2.0613549947738647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20495950430631638, + "step": 18404 + }, + { + "epoch": 0.36812, + "grad_norm": 1.890625, + "grad_norm_var": 0.026759592692057292, + "learning_rate": 0.0001, + "loss": 4.0549, + "loss/crossentropy": 1.9765326976776123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18859146535396576, + "step": 18406 + }, + { + "epoch": 0.36816, + "grad_norm": 1.953125, + "grad_norm_var": 0.027147420247395835, + "learning_rate": 0.0001, + "loss": 4.0196, + "loss/crossentropy": 1.8403696417808533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18926746398210526, + "step": 18408 + }, + { + "epoch": 0.3682, + "grad_norm": 2.15625, + "grad_norm_var": 0.029080963134765624, + "learning_rate": 0.0001, + "loss": 4.1859, + "loss/crossentropy": 2.1792644262313843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22263357043266296, + "step": 18410 + }, + { + "epoch": 0.36824, + "grad_norm": 2.0625, + "grad_norm_var": 0.02910334269205729, + "learning_rate": 0.0001, + "loss": 4.2387, + "loss/crossentropy": 2.079226016998291, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20990607887506485, + "step": 18412 + }, + { + "epoch": 0.36828, + "grad_norm": 1.8671875, + "grad_norm_var": 0.027457427978515626, + "learning_rate": 0.0001, + "loss": 4.2093, + "loss/crossentropy": 2.1428889632225037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22140806913375854, + "step": 18414 + }, + { + "epoch": 0.36832, + "grad_norm": 1.984375, + "grad_norm_var": 0.024589029947916667, + "learning_rate": 0.0001, + "loss": 4.3761, + "loss/crossentropy": 2.1461609601974487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20594316720962524, + "step": 18416 + }, + { + "epoch": 0.36836, + "grad_norm": 1.9609375, + "grad_norm_var": 0.02399470011393229, + "learning_rate": 0.0001, + "loss": 4.1597, + "loss/crossentropy": 1.980036735534668, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.201970636844635, + "step": 18418 + }, + { + "epoch": 0.3684, + "grad_norm": 1.9140625, + "grad_norm_var": 0.024773915608723957, + "learning_rate": 0.0001, + "loss": 3.9847, + "loss/crossentropy": 1.6273554563522339, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1938486322760582, + "step": 18420 + }, + { + "epoch": 0.36844, + "grad_norm": 2.03125, + "grad_norm_var": 0.006251780192057291, + "learning_rate": 0.0001, + "loss": 4.1784, + "loss/crossentropy": 1.9100900292396545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18063092976808548, + "step": 18422 + }, + { + "epoch": 0.36848, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0063555399576822914, + "learning_rate": 0.0001, + "loss": 4.1282, + "loss/crossentropy": 2.1882822513580322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21261122822761536, + "step": 18424 + }, + { + "epoch": 0.36852, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0046384175618489586, + "learning_rate": 0.0001, + "loss": 3.9556, + "loss/crossentropy": 2.210257649421692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1929008737206459, + "step": 18426 + }, + { + "epoch": 0.36856, + "grad_norm": 2.078125, + "grad_norm_var": 0.005041249593098958, + "learning_rate": 0.0001, + "loss": 4.1691, + "loss/crossentropy": 2.0247724056243896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2004971206188202, + "step": 18428 + }, + { + "epoch": 0.3686, + "grad_norm": 2.015625, + "grad_norm_var": 0.004133097330729167, + "learning_rate": 0.0001, + "loss": 4.2092, + "loss/crossentropy": 1.9844316244125366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.210312619805336, + "step": 18430 + }, + { + "epoch": 0.36864, + "grad_norm": 1.9453125, + "grad_norm_var": 0.005000559488932291, + "learning_rate": 0.0001, + "loss": 4.0368, + "loss/crossentropy": 2.2416625022888184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21065251529216766, + "step": 18432 + }, + { + "epoch": 0.36868, + "grad_norm": 2.453125, + "grad_norm_var": 0.020104726155598957, + "learning_rate": 0.0001, + "loss": 4.4109, + "loss/crossentropy": 2.0283551812171936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2150302305817604, + "step": 18434 + }, + { + "epoch": 0.36872, + "grad_norm": 2.015625, + "grad_norm_var": 0.01968994140625, + "learning_rate": 0.0001, + "loss": 4.1379, + "loss/crossentropy": 2.210233688354492, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21831902861595154, + "step": 18436 + }, + { + "epoch": 0.36876, + "grad_norm": 1.9375, + "grad_norm_var": 0.0205474853515625, + "learning_rate": 0.0001, + "loss": 3.9993, + "loss/crossentropy": 2.0454147458076477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2432640790939331, + "step": 18438 + }, + { + "epoch": 0.3688, + "grad_norm": 2.078125, + "grad_norm_var": 0.020444488525390624, + "learning_rate": 0.0001, + "loss": 4.1243, + "loss/crossentropy": 2.024592399597168, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20496471971273422, + "step": 18440 + }, + { + "epoch": 0.36884, + "grad_norm": 2.125, + "grad_norm_var": 0.018822987874348957, + "learning_rate": 0.0001, + "loss": 4.3058, + "loss/crossentropy": 2.4205459356307983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22281523793935776, + "step": 18442 + }, + { + "epoch": 0.36888, + "grad_norm": 1.96875, + "grad_norm_var": 0.0170806884765625, + "learning_rate": 0.0001, + "loss": 4.0341, + "loss/crossentropy": 1.9198943376541138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1916896402835846, + "step": 18444 + }, + { + "epoch": 0.36892, + "grad_norm": 2.0625, + "grad_norm_var": 0.0171051025390625, + "learning_rate": 0.0001, + "loss": 4.0475, + "loss/crossentropy": 1.928059160709381, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1899876967072487, + "step": 18446 + }, + { + "epoch": 0.36896, + "grad_norm": 1.921875, + "grad_norm_var": 0.020792388916015626, + "learning_rate": 0.0001, + "loss": 4.3327, + "loss/crossentropy": 2.346290349960327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22388508170843124, + "step": 18448 + }, + { + "epoch": 0.369, + "grad_norm": 1.9140625, + "grad_norm_var": 0.009946441650390625, + "learning_rate": 0.0001, + "loss": 3.9064, + "loss/crossentropy": 2.0108938217163086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19462434202432632, + "step": 18450 + }, + { + "epoch": 0.36904, + "grad_norm": 2.0, + "grad_norm_var": 0.00986328125, + "learning_rate": 0.0001, + "loss": 4.2468, + "loss/crossentropy": 1.7108886241912842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18806710839271545, + "step": 18452 + }, + { + "epoch": 0.36908, + "grad_norm": 5.1875, + "grad_norm_var": 0.6345499674479167, + "learning_rate": 0.0001, + "loss": 4.6972, + "loss/crossentropy": 2.4162802696228027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3217930570244789, + "step": 18454 + }, + { + "epoch": 0.36912, + "grad_norm": 2.1875, + "grad_norm_var": 0.639013671875, + "learning_rate": 0.0001, + "loss": 3.4762, + "loss/crossentropy": 1.6459838151931763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17543485760688782, + "step": 18456 + }, + { + "epoch": 0.36916, + "grad_norm": 2.140625, + "grad_norm_var": 0.640679677327474, + "learning_rate": 0.0001, + "loss": 4.1194, + "loss/crossentropy": 2.2111966013908386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22191359847784042, + "step": 18458 + }, + { + "epoch": 0.3692, + "grad_norm": 2.125, + "grad_norm_var": 0.6325887044270834, + "learning_rate": 0.0001, + "loss": 4.301, + "loss/crossentropy": 2.2514692544937134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21384654194116592, + "step": 18460 + }, + { + "epoch": 0.36924, + "grad_norm": 2.125, + "grad_norm_var": 0.63665771484375, + "learning_rate": 0.0001, + "loss": 4.307, + "loss/crossentropy": 2.097872793674469, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21747223287820816, + "step": 18462 + }, + { + "epoch": 0.36928, + "grad_norm": 2.03125, + "grad_norm_var": 0.6418690999348958, + "learning_rate": 0.0001, + "loss": 4.1033, + "loss/crossentropy": 2.1611807346343994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2042219042778015, + "step": 18464 + }, + { + "epoch": 0.36932, + "grad_norm": 2.15625, + "grad_norm_var": 0.62939453125, + "learning_rate": 0.0001, + "loss": 3.7871, + "loss/crossentropy": 1.8941562175750732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.192266546189785, + "step": 18466 + }, + { + "epoch": 0.36936, + "grad_norm": 1.921875, + "grad_norm_var": 0.72021484375, + "learning_rate": 0.0001, + "loss": 4.0358, + "loss/crossentropy": 1.8930317163467407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19436348974704742, + "step": 18468 + }, + { + "epoch": 0.3694, + "grad_norm": 1.9140625, + "grad_norm_var": 0.15979588826497396, + "learning_rate": 0.0001, + "loss": 4.1046, + "loss/crossentropy": 2.1041005849838257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21314333379268646, + "step": 18470 + }, + { + "epoch": 0.36944, + "grad_norm": 1.9453125, + "grad_norm_var": 0.15750732421875, + "learning_rate": 0.0001, + "loss": 4.2096, + "loss/crossentropy": 1.8747637867927551, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19172174483537674, + "step": 18472 + }, + { + "epoch": 0.36948, + "grad_norm": 1.9453125, + "grad_norm_var": 0.15751113891601562, + "learning_rate": 0.0001, + "loss": 3.9809, + "loss/crossentropy": 2.1883610486984253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19927512109279633, + "step": 18474 + }, + { + "epoch": 0.36952, + "grad_norm": 1.9921875, + "grad_norm_var": 0.1606353759765625, + "learning_rate": 0.0001, + "loss": 3.9303, + "loss/crossentropy": 1.972772240638733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17948149144649506, + "step": 18476 + }, + { + "epoch": 0.36956, + "grad_norm": 2.0625, + "grad_norm_var": 0.14570490519205728, + "learning_rate": 0.0001, + "loss": 4.1246, + "loss/crossentropy": 1.8912597298622131, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18716244399547577, + "step": 18478 + }, + { + "epoch": 0.3696, + "grad_norm": 1.9140625, + "grad_norm_var": 0.1458740234375, + "learning_rate": 0.0001, + "loss": 4.0816, + "loss/crossentropy": 2.044828712940216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2076854333281517, + "step": 18480 + }, + { + "epoch": 0.36964, + "grad_norm": 2.03125, + "grad_norm_var": 0.14629618326822916, + "learning_rate": 0.0001, + "loss": 4.2007, + "loss/crossentropy": 2.0936968326568604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22010911256074905, + "step": 18482 + }, + { + "epoch": 0.36968, + "grad_norm": 1.96875, + "grad_norm_var": 0.0027414957682291665, + "learning_rate": 0.0001, + "loss": 4.08, + "loss/crossentropy": 2.2678394317626953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2073991820216179, + "step": 18484 + }, + { + "epoch": 0.36972, + "grad_norm": 2.078125, + "grad_norm_var": 0.0031064351399739585, + "learning_rate": 0.0001, + "loss": 3.9361, + "loss/crossentropy": 2.281362295150757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2015322744846344, + "step": 18486 + }, + { + "epoch": 0.36976, + "grad_norm": 2.015625, + "grad_norm_var": 0.0034739176432291665, + "learning_rate": 0.0001, + "loss": 4.2318, + "loss/crossentropy": 2.0029674768447876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2155751883983612, + "step": 18488 + }, + { + "epoch": 0.3698, + "grad_norm": 1.984375, + "grad_norm_var": 0.003525543212890625, + "learning_rate": 0.0001, + "loss": 4.3761, + "loss/crossentropy": 2.370519280433655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2308923304080963, + "step": 18490 + }, + { + "epoch": 0.36984, + "grad_norm": 2.09375, + "grad_norm_var": 0.004109446207682292, + "learning_rate": 0.0001, + "loss": 4.3137, + "loss/crossentropy": 1.9416582584381104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18809420615434647, + "step": 18492 + }, + { + "epoch": 0.36988, + "grad_norm": 1.9765625, + "grad_norm_var": 0.003885650634765625, + "learning_rate": 0.0001, + "loss": 4.1381, + "loss/crossentropy": 2.065304160118103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19029072672128677, + "step": 18494 + }, + { + "epoch": 0.36992, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0036516825358072916, + "learning_rate": 0.0001, + "loss": 4.1366, + "loss/crossentropy": 1.9798340201377869, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1716388538479805, + "step": 18496 + }, + { + "epoch": 0.36996, + "grad_norm": 1.984375, + "grad_norm_var": 0.0035540262858072915, + "learning_rate": 0.0001, + "loss": 4.2321, + "loss/crossentropy": 2.336732864379883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19805839657783508, + "step": 18498 + }, + { + "epoch": 0.37, + "grad_norm": 1.984375, + "grad_norm_var": 0.0033444722493489584, + "learning_rate": 0.0001, + "loss": 4.08, + "loss/crossentropy": 1.9556902050971985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19576922804117203, + "step": 18500 + }, + { + "epoch": 0.37004, + "grad_norm": 1.796875, + "grad_norm_var": 0.006581370035807292, + "learning_rate": 0.0001, + "loss": 3.7281, + "loss/crossentropy": 2.0062427520751953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19548972696065903, + "step": 18502 + }, + { + "epoch": 0.37008, + "grad_norm": 1.9375, + "grad_norm_var": 0.0061419169108072914, + "learning_rate": 0.0001, + "loss": 4.0954, + "loss/crossentropy": 2.1030293703079224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22147215902805328, + "step": 18504 + }, + { + "epoch": 0.37012, + "grad_norm": 2.046875, + "grad_norm_var": 0.005163319905598958, + "learning_rate": 0.0001, + "loss": 3.9545, + "loss/crossentropy": 1.7618860006332397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17934025079011917, + "step": 18506 + }, + { + "epoch": 0.37016, + "grad_norm": 2.109375, + "grad_norm_var": 0.005454254150390625, + "learning_rate": 0.0001, + "loss": 4.239, + "loss/crossentropy": 2.4424854516983032, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23131447285413742, + "step": 18508 + }, + { + "epoch": 0.3702, + "grad_norm": 1.96875, + "grad_norm_var": 0.0076904296875, + "learning_rate": 0.0001, + "loss": 4.0365, + "loss/crossentropy": 2.125267446041107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19127625226974487, + "step": 18510 + }, + { + "epoch": 0.37024, + "grad_norm": 1.828125, + "grad_norm_var": 0.008506011962890626, + "learning_rate": 0.0001, + "loss": 3.9498, + "loss/crossentropy": 2.2222214937210083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.204838365316391, + "step": 18512 + }, + { + "epoch": 0.37028, + "grad_norm": 1.8984375, + "grad_norm_var": 0.009056599934895833, + "learning_rate": 0.0001, + "loss": 4.1012, + "loss/crossentropy": 2.1930031776428223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21769095957279205, + "step": 18514 + }, + { + "epoch": 0.37032, + "grad_norm": 1.84375, + "grad_norm_var": 0.010814412434895834, + "learning_rate": 0.0001, + "loss": 4.0068, + "loss/crossentropy": 2.078152060508728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19528701901435852, + "step": 18516 + }, + { + "epoch": 0.37036, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009016927083333333, + "learning_rate": 0.0001, + "loss": 4.0135, + "loss/crossentropy": 1.9786349534988403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19193317741155624, + "step": 18518 + }, + { + "epoch": 0.3704, + "grad_norm": 2.015625, + "grad_norm_var": 0.0119293212890625, + "learning_rate": 0.0001, + "loss": 4.1458, + "loss/crossentropy": 2.279360294342041, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20865458250045776, + "step": 18520 + }, + { + "epoch": 0.37044, + "grad_norm": 2.078125, + "grad_norm_var": 0.012548828125, + "learning_rate": 0.0001, + "loss": 4.195, + "loss/crossentropy": 2.0668978691101074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19060996919870377, + "step": 18522 + }, + { + "epoch": 0.37048, + "grad_norm": 1.90625, + "grad_norm_var": 0.011116536458333333, + "learning_rate": 0.0001, + "loss": 4.2661, + "loss/crossentropy": 2.4433913230895996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2263498529791832, + "step": 18524 + }, + { + "epoch": 0.37052, + "grad_norm": 1.9765625, + "grad_norm_var": 0.009049224853515624, + "learning_rate": 0.0001, + "loss": 3.7953, + "loss/crossentropy": 1.7785582542419434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18417692929506302, + "step": 18526 + }, + { + "epoch": 0.37056, + "grad_norm": 2.015625, + "grad_norm_var": 0.008499908447265624, + "learning_rate": 0.0001, + "loss": 4.2958, + "loss/crossentropy": 1.9753797054290771, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1898263543844223, + "step": 18528 + }, + { + "epoch": 0.3706, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007696278889973958, + "learning_rate": 0.0001, + "loss": 4.1945, + "loss/crossentropy": 2.302052319049835, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21289903670549393, + "step": 18530 + }, + { + "epoch": 0.37064, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007523600260416667, + "learning_rate": 0.0001, + "loss": 4.1132, + "loss/crossentropy": 2.3275071382522583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2265043556690216, + "step": 18532 + }, + { + "epoch": 0.37068, + "grad_norm": 2.046875, + "grad_norm_var": 0.00826416015625, + "learning_rate": 0.0001, + "loss": 4.2004, + "loss/crossentropy": 2.3155715465545654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20101473480463028, + "step": 18534 + }, + { + "epoch": 0.37072, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0075681050618489586, + "learning_rate": 0.0001, + "loss": 3.8546, + "loss/crossentropy": 2.0893908739089966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18266596645116806, + "step": 18536 + }, + { + "epoch": 0.37076, + "grad_norm": 1.9453125, + "grad_norm_var": 0.006546783447265625, + "learning_rate": 0.0001, + "loss": 4.297, + "loss/crossentropy": 2.075433909893036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20270193368196487, + "step": 18538 + }, + { + "epoch": 0.3708, + "grad_norm": 1.90625, + "grad_norm_var": 0.006605784098307292, + "learning_rate": 0.0001, + "loss": 4.186, + "loss/crossentropy": 2.2795485258102417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21547292172908783, + "step": 18540 + }, + { + "epoch": 0.37084, + "grad_norm": 1.859375, + "grad_norm_var": 0.006956990559895833, + "learning_rate": 0.0001, + "loss": 4.0351, + "loss/crossentropy": 2.0038467049598694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19927001744508743, + "step": 18542 + }, + { + "epoch": 0.37088, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0056111653645833336, + "learning_rate": 0.0001, + "loss": 3.979, + "loss/crossentropy": 1.8877951502799988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19283732771873474, + "step": 18544 + }, + { + "epoch": 0.37092, + "grad_norm": 1.90625, + "grad_norm_var": 0.0065305074055989586, + "learning_rate": 0.0001, + "loss": 4.0169, + "loss/crossentropy": 1.8111292719841003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20587582886219025, + "step": 18546 + }, + { + "epoch": 0.37096, + "grad_norm": 1.9609375, + "grad_norm_var": 0.004548136393229167, + "learning_rate": 0.0001, + "loss": 4.0766, + "loss/crossentropy": 2.12824147939682, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1996314972639084, + "step": 18548 + }, + { + "epoch": 0.371, + "grad_norm": 1.953125, + "grad_norm_var": 0.0035845438639322915, + "learning_rate": 0.0001, + "loss": 3.9549, + "loss/crossentropy": 2.22495698928833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20922152698040009, + "step": 18550 + }, + { + "epoch": 0.37104, + "grad_norm": 1.9609375, + "grad_norm_var": 0.003110504150390625, + "learning_rate": 0.0001, + "loss": 4.2245, + "loss/crossentropy": 2.1434414386749268, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2070355862379074, + "step": 18552 + }, + { + "epoch": 0.37108, + "grad_norm": 1.859375, + "grad_norm_var": 0.003979237874348959, + "learning_rate": 0.0001, + "loss": 4.0359, + "loss/crossentropy": 2.248233437538147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21400754898786545, + "step": 18554 + }, + { + "epoch": 0.37112, + "grad_norm": 2.0, + "grad_norm_var": 0.0051513671875, + "learning_rate": 0.0001, + "loss": 4.2072, + "loss/crossentropy": 2.246641755104065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22847777605056763, + "step": 18556 + }, + { + "epoch": 0.37116, + "grad_norm": 1.9765625, + "grad_norm_var": 0.004654693603515625, + "learning_rate": 0.0001, + "loss": 4.0805, + "loss/crossentropy": 2.057798206806183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23668289184570312, + "step": 18558 + }, + { + "epoch": 0.3712, + "grad_norm": 2.0, + "grad_norm_var": 0.004587554931640625, + "learning_rate": 0.0001, + "loss": 3.8454, + "loss/crossentropy": 1.7570822834968567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1820850819349289, + "step": 18560 + }, + { + "epoch": 0.37124, + "grad_norm": 2.390625, + "grad_norm_var": 0.015372467041015626, + "learning_rate": 0.0001, + "loss": 4.0077, + "loss/crossentropy": 1.7747303247451782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1704205796122551, + "step": 18562 + }, + { + "epoch": 0.37128, + "grad_norm": 1.9921875, + "grad_norm_var": 0.015632120768229167, + "learning_rate": 0.0001, + "loss": 4.0611, + "loss/crossentropy": 2.3060861825942993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20002726465463638, + "step": 18564 + }, + { + "epoch": 0.37132, + "grad_norm": 1.921875, + "grad_norm_var": 0.015242258707682291, + "learning_rate": 0.0001, + "loss": 4.258, + "loss/crossentropy": 2.2742475271224976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22524481266736984, + "step": 18566 + }, + { + "epoch": 0.37136, + "grad_norm": 1.859375, + "grad_norm_var": 0.01647923787434896, + "learning_rate": 0.0001, + "loss": 3.9816, + "loss/crossentropy": 1.7929689288139343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1819063350558281, + "step": 18568 + }, + { + "epoch": 0.3714, + "grad_norm": 1.9140625, + "grad_norm_var": 0.015718587239583335, + "learning_rate": 0.0001, + "loss": 4.2487, + "loss/crossentropy": 2.4505950212478638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22857815772294998, + "step": 18570 + }, + { + "epoch": 0.37144, + "grad_norm": 1.828125, + "grad_norm_var": 0.017658487955729166, + "learning_rate": 0.0001, + "loss": 3.5856, + "loss/crossentropy": 1.640372097492218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16958069056272507, + "step": 18572 + }, + { + "epoch": 0.37148, + "grad_norm": 1.984375, + "grad_norm_var": 0.01916681925455729, + "learning_rate": 0.0001, + "loss": 4.1601, + "loss/crossentropy": 2.1556472778320312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20749886333942413, + "step": 18574 + }, + { + "epoch": 0.37152, + "grad_norm": 2.015625, + "grad_norm_var": 0.01846491495768229, + "learning_rate": 0.0001, + "loss": 4.1579, + "loss/crossentropy": 2.1131449937820435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20596522092819214, + "step": 18576 + }, + { + "epoch": 0.37156, + "grad_norm": 1.84375, + "grad_norm_var": 0.008998362223307292, + "learning_rate": 0.0001, + "loss": 3.8036, + "loss/crossentropy": 1.8966050148010254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19289513677358627, + "step": 18578 + }, + { + "epoch": 0.3716, + "grad_norm": 2.109375, + "grad_norm_var": 0.010306803385416667, + "learning_rate": 0.0001, + "loss": 4.1883, + "loss/crossentropy": 2.1016936898231506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1917622685432434, + "step": 18580 + }, + { + "epoch": 0.37164, + "grad_norm": 2.265625, + "grad_norm_var": 0.01512451171875, + "learning_rate": 0.0001, + "loss": 4.2596, + "loss/crossentropy": 2.289618492126465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2033960521221161, + "step": 18582 + }, + { + "epoch": 0.37168, + "grad_norm": 1.9765625, + "grad_norm_var": 0.020287068684895833, + "learning_rate": 0.0001, + "loss": 4.0441, + "loss/crossentropy": 2.205715775489807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20015332102775574, + "step": 18584 + }, + { + "epoch": 0.37172, + "grad_norm": 2.171875, + "grad_norm_var": 0.02072118123372396, + "learning_rate": 0.0001, + "loss": 4.0834, + "loss/crossentropy": 2.3037471771240234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2424507886171341, + "step": 18586 + }, + { + "epoch": 0.37176, + "grad_norm": 1.890625, + "grad_norm_var": 0.016686757405598957, + "learning_rate": 0.0001, + "loss": 4.0718, + "loss/crossentropy": 2.1757054328918457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20777830481529236, + "step": 18588 + }, + { + "epoch": 0.3718, + "grad_norm": 2.03125, + "grad_norm_var": 0.015860748291015626, + "learning_rate": 0.0001, + "loss": 4.2905, + "loss/crossentropy": 2.3257133960723877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22848188877105713, + "step": 18590 + }, + { + "epoch": 0.37184, + "grad_norm": 1.9140625, + "grad_norm_var": 0.017179107666015624, + "learning_rate": 0.0001, + "loss": 3.9132, + "loss/crossentropy": 2.3375465869903564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21214719116687775, + "step": 18592 + }, + { + "epoch": 0.37188, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0162750244140625, + "learning_rate": 0.0001, + "loss": 4.507, + "loss/crossentropy": 2.1896092891693115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2303280457854271, + "step": 18594 + }, + { + "epoch": 0.37192, + "grad_norm": 2.0, + "grad_norm_var": 0.016287994384765626, + "learning_rate": 0.0001, + "loss": 3.9078, + "loss/crossentropy": 1.8913645148277283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17311514914035797, + "step": 18596 + }, + { + "epoch": 0.37196, + "grad_norm": 2.03125, + "grad_norm_var": 0.012442779541015626, + "learning_rate": 0.0001, + "loss": 4.2212, + "loss/crossentropy": 2.5278197526931763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23568174242973328, + "step": 18598 + }, + { + "epoch": 0.372, + "grad_norm": 1.9375, + "grad_norm_var": 0.0101959228515625, + "learning_rate": 0.0001, + "loss": 4.1568, + "loss/crossentropy": 2.073936700820923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21885155141353607, + "step": 18600 + }, + { + "epoch": 0.37204, + "grad_norm": 2.265625, + "grad_norm_var": 0.012684885660807292, + "learning_rate": 0.0001, + "loss": 4.388, + "loss/crossentropy": 2.162986159324646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2012084275484085, + "step": 18602 + }, + { + "epoch": 0.37208, + "grad_norm": 1.921875, + "grad_norm_var": 0.013108062744140624, + "learning_rate": 0.0001, + "loss": 3.9659, + "loss/crossentropy": 1.9332863092422485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18284077942371368, + "step": 18604 + }, + { + "epoch": 0.37212, + "grad_norm": 1.9140625, + "grad_norm_var": 0.013809967041015624, + "learning_rate": 0.0001, + "loss": 4.0279, + "loss/crossentropy": 2.083172380924225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.186289481818676, + "step": 18606 + }, + { + "epoch": 0.37216, + "grad_norm": 1.96875, + "grad_norm_var": 0.013588205973307291, + "learning_rate": 0.0001, + "loss": 3.9932, + "loss/crossentropy": 2.100313901901245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1977698802947998, + "step": 18608 + }, + { + "epoch": 0.3722, + "grad_norm": 1.890625, + "grad_norm_var": 0.016792805989583333, + "learning_rate": 0.0001, + "loss": 4.1086, + "loss/crossentropy": 2.2336236238479614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21973469108343124, + "step": 18610 + }, + { + "epoch": 0.37224, + "grad_norm": 1.984375, + "grad_norm_var": 0.017048136393229166, + "learning_rate": 0.0001, + "loss": 3.9422, + "loss/crossentropy": 2.0263352394104004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19232206791639328, + "step": 18612 + }, + { + "epoch": 0.37228, + "grad_norm": 1.9765625, + "grad_norm_var": 0.01727472941080729, + "learning_rate": 0.0001, + "loss": 4.2295, + "loss/crossentropy": 2.0719032287597656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22896382957696915, + "step": 18614 + }, + { + "epoch": 0.37232, + "grad_norm": 1.84375, + "grad_norm_var": 0.015400950113932292, + "learning_rate": 0.0001, + "loss": 3.881, + "loss/crossentropy": 1.9514707326889038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1899290755391121, + "step": 18616 + }, + { + "epoch": 0.37236, + "grad_norm": 1.921875, + "grad_norm_var": 0.010172526041666666, + "learning_rate": 0.0001, + "loss": 4.0163, + "loss/crossentropy": 2.0810786485671997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1983262002468109, + "step": 18618 + }, + { + "epoch": 0.3724, + "grad_norm": 1.921875, + "grad_norm_var": 0.010302734375, + "learning_rate": 0.0001, + "loss": 4.1311, + "loss/crossentropy": 1.823366403579712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18820683658123016, + "step": 18620 + }, + { + "epoch": 0.37244, + "grad_norm": 2.4375, + "grad_norm_var": 0.023933919270833333, + "learning_rate": 0.0001, + "loss": 4.2361, + "loss/crossentropy": 2.2814120054244995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19500700384378433, + "step": 18622 + }, + { + "epoch": 0.37248, + "grad_norm": 1.9765625, + "grad_norm_var": 0.023339589436848957, + "learning_rate": 0.0001, + "loss": 3.9548, + "loss/crossentropy": 1.5761349201202393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17777466028928757, + "step": 18624 + }, + { + "epoch": 0.37252, + "grad_norm": 1.9375, + "grad_norm_var": 0.018853505452473957, + "learning_rate": 0.0001, + "loss": 3.9134, + "loss/crossentropy": 1.8996286988258362, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21907050907611847, + "step": 18626 + }, + { + "epoch": 0.37256, + "grad_norm": 1.9296875, + "grad_norm_var": 0.020869700113932292, + "learning_rate": 0.0001, + "loss": 4.2361, + "loss/crossentropy": 1.6650620698928833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18808475136756897, + "step": 18628 + }, + { + "epoch": 0.3726, + "grad_norm": 2.125, + "grad_norm_var": 0.02156550089518229, + "learning_rate": 0.0001, + "loss": 4.4004, + "loss/crossentropy": 2.2887942790985107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20416373759508133, + "step": 18630 + }, + { + "epoch": 0.37264, + "grad_norm": 2.03125, + "grad_norm_var": 0.018629709879557293, + "learning_rate": 0.0001, + "loss": 3.9606, + "loss/crossentropy": 1.9479430317878723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18489708751440048, + "step": 18632 + }, + { + "epoch": 0.37268, + "grad_norm": 2.0, + "grad_norm_var": 0.016676584879557293, + "learning_rate": 0.0001, + "loss": 4.3041, + "loss/crossentropy": 2.2156901359558105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20997263491153717, + "step": 18634 + }, + { + "epoch": 0.37272, + "grad_norm": 2.0, + "grad_norm_var": 0.015950520833333332, + "learning_rate": 0.0001, + "loss": 4.1619, + "loss/crossentropy": 2.50583279132843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23347270488739014, + "step": 18636 + }, + { + "epoch": 0.37276, + "grad_norm": 1.8203125, + "grad_norm_var": 0.008650716145833333, + "learning_rate": 0.0001, + "loss": 3.8241, + "loss/crossentropy": 1.7688243985176086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18601343035697937, + "step": 18638 + }, + { + "epoch": 0.3728, + "grad_norm": 2.109375, + "grad_norm_var": 0.009439849853515625, + "learning_rate": 0.0001, + "loss": 4.2241, + "loss/crossentropy": 2.3786444664001465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2287142351269722, + "step": 18640 + }, + { + "epoch": 0.37284, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009479777018229166, + "learning_rate": 0.0001, + "loss": 4.243, + "loss/crossentropy": 2.2686339616775513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21086719632148743, + "step": 18642 + }, + { + "epoch": 0.37288, + "grad_norm": 2.0, + "grad_norm_var": 0.007490793863932292, + "learning_rate": 0.0001, + "loss": 3.8341, + "loss/crossentropy": 1.9386130571365356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19289222359657288, + "step": 18644 + }, + { + "epoch": 0.37292, + "grad_norm": 1.953125, + "grad_norm_var": 0.006819407145182292, + "learning_rate": 0.0001, + "loss": 4.3357, + "loss/crossentropy": 2.3655601739883423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2207036018371582, + "step": 18646 + }, + { + "epoch": 0.37296, + "grad_norm": 1.9375, + "grad_norm_var": 0.007100168863932292, + "learning_rate": 0.0001, + "loss": 3.9039, + "loss/crossentropy": 1.9600831270217896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18018172681331635, + "step": 18648 + }, + { + "epoch": 0.373, + "grad_norm": 2.515625, + "grad_norm_var": 0.024873860677083335, + "learning_rate": 0.0001, + "loss": 3.9492, + "loss/crossentropy": 1.86528480052948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19763591885566711, + "step": 18650 + }, + { + "epoch": 0.37304, + "grad_norm": 1.984375, + "grad_norm_var": 0.024925740559895833, + "learning_rate": 0.0001, + "loss": 4.07, + "loss/crossentropy": 2.0938327312469482, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2051714062690735, + "step": 18652 + }, + { + "epoch": 0.37308, + "grad_norm": 1.9296875, + "grad_norm_var": 0.022454579671223957, + "learning_rate": 0.0001, + "loss": 3.878, + "loss/crossentropy": 1.964399516582489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20198997855186462, + "step": 18654 + }, + { + "epoch": 0.37312, + "grad_norm": 2.09375, + "grad_norm_var": 0.0245849609375, + "learning_rate": 0.0001, + "loss": 3.9929, + "loss/crossentropy": 2.0186336040496826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19310183823108673, + "step": 18656 + }, + { + "epoch": 0.37316, + "grad_norm": 1.921875, + "grad_norm_var": 0.025052642822265624, + "learning_rate": 0.0001, + "loss": 3.8569, + "loss/crossentropy": 1.8805240392684937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20392487198114395, + "step": 18658 + }, + { + "epoch": 0.3732, + "grad_norm": 1.90625, + "grad_norm_var": 0.025480143229166665, + "learning_rate": 0.0001, + "loss": 4.0306, + "loss/crossentropy": 2.45102322101593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2161283940076828, + "step": 18660 + }, + { + "epoch": 0.37324, + "grad_norm": 1.921875, + "grad_norm_var": 0.025679270426432293, + "learning_rate": 0.0001, + "loss": 4.1015, + "loss/crossentropy": 2.2971357107162476, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23155176639556885, + "step": 18662 + }, + { + "epoch": 0.37328, + "grad_norm": 1.953125, + "grad_norm_var": 0.025614166259765626, + "learning_rate": 0.0001, + "loss": 4.285, + "loss/crossentropy": 2.3736867904663086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22897879034280777, + "step": 18664 + }, + { + "epoch": 0.37332, + "grad_norm": 2.0625, + "grad_norm_var": 0.005500284830729166, + "learning_rate": 0.0001, + "loss": 3.9884, + "loss/crossentropy": 1.5138108134269714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1526506468653679, + "step": 18666 + }, + { + "epoch": 0.37336, + "grad_norm": 1.859375, + "grad_norm_var": 0.0061948140462239586, + "learning_rate": 0.0001, + "loss": 3.9507, + "loss/crossentropy": 2.415435791015625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21689960360527039, + "step": 18668 + }, + { + "epoch": 0.3734, + "grad_norm": 1.90625, + "grad_norm_var": 0.006306966145833333, + "learning_rate": 0.0001, + "loss": 4.014, + "loss/crossentropy": 2.1235941648483276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20921900868415833, + "step": 18670 + }, + { + "epoch": 0.37344, + "grad_norm": 2.015625, + "grad_norm_var": 0.004428863525390625, + "learning_rate": 0.0001, + "loss": 4.2549, + "loss/crossentropy": 2.3612579703330994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21255160123109818, + "step": 18672 + }, + { + "epoch": 0.37348, + "grad_norm": 2.09375, + "grad_norm_var": 0.054351552327473955, + "learning_rate": 0.0001, + "loss": 4.1722, + "loss/crossentropy": 2.23550283908844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18890902400016785, + "step": 18674 + }, + { + "epoch": 0.37352, + "grad_norm": 1.9140625, + "grad_norm_var": 0.05468317667643229, + "learning_rate": 0.0001, + "loss": 4.0985, + "loss/crossentropy": 1.7998243570327759, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19070486724376678, + "step": 18676 + }, + { + "epoch": 0.37356, + "grad_norm": 1.984375, + "grad_norm_var": 0.055214182535807295, + "learning_rate": 0.0001, + "loss": 4.151, + "loss/crossentropy": 1.892760992050171, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19817107915878296, + "step": 18678 + }, + { + "epoch": 0.3736, + "grad_norm": 1.9140625, + "grad_norm_var": 0.055757395426432294, + "learning_rate": 0.0001, + "loss": 3.9862, + "loss/crossentropy": 1.9619091153144836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21299303323030472, + "step": 18680 + }, + { + "epoch": 0.37364, + "grad_norm": 1.90625, + "grad_norm_var": 0.057889556884765624, + "learning_rate": 0.0001, + "loss": 3.8419, + "loss/crossentropy": 2.0857014656066895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20634424686431885, + "step": 18682 + }, + { + "epoch": 0.37368, + "grad_norm": 1.796875, + "grad_norm_var": 0.05814793904622396, + "learning_rate": 0.0001, + "loss": 3.9589, + "loss/crossentropy": 2.1093358397483826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1933949589729309, + "step": 18684 + }, + { + "epoch": 0.37372, + "grad_norm": 1.9375, + "grad_norm_var": 0.05987548828125, + "learning_rate": 0.0001, + "loss": 3.9537, + "loss/crossentropy": 2.1040873527526855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2061748430132866, + "step": 18686 + }, + { + "epoch": 0.37376, + "grad_norm": 1.9609375, + "grad_norm_var": 0.059081776936848955, + "learning_rate": 0.0001, + "loss": 4.1676, + "loss/crossentropy": 2.4795751571655273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2075038179755211, + "step": 18688 + }, + { + "epoch": 0.3738, + "grad_norm": 1.8515625, + "grad_norm_var": 0.006599934895833334, + "learning_rate": 0.0001, + "loss": 3.9992, + "loss/crossentropy": 1.864591360092163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17119919508695602, + "step": 18690 + }, + { + "epoch": 0.37384, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007063547770182292, + "learning_rate": 0.0001, + "loss": 3.7565, + "loss/crossentropy": 1.6705753207206726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19292957335710526, + "step": 18692 + }, + { + "epoch": 0.37388, + "grad_norm": 2.03125, + "grad_norm_var": 0.004209136962890625, + "learning_rate": 0.0001, + "loss": 4.0448, + "loss/crossentropy": 2.16109561920166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20515639334917068, + "step": 18694 + }, + { + "epoch": 0.37392, + "grad_norm": 2.28125, + "grad_norm_var": 0.014212799072265626, + "learning_rate": 0.0001, + "loss": 4.1683, + "loss/crossentropy": 1.771731436252594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20304742455482483, + "step": 18696 + }, + { + "epoch": 0.37396, + "grad_norm": 1.8203125, + "grad_norm_var": 0.014330037434895833, + "learning_rate": 0.0001, + "loss": 3.9636, + "loss/crossentropy": 2.046182096004486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1976921409368515, + "step": 18698 + }, + { + "epoch": 0.374, + "grad_norm": 1.78125, + "grad_norm_var": 0.0148345947265625, + "learning_rate": 0.0001, + "loss": 4.0257, + "loss/crossentropy": 2.0343292355537415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19037891179323196, + "step": 18700 + }, + { + "epoch": 0.37404, + "grad_norm": 1.859375, + "grad_norm_var": 0.014782460530598958, + "learning_rate": 0.0001, + "loss": 4.1727, + "loss/crossentropy": 2.034367859363556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20182596892118454, + "step": 18702 + }, + { + "epoch": 0.37408, + "grad_norm": 1.9609375, + "grad_norm_var": 0.015616861979166667, + "learning_rate": 0.0001, + "loss": 3.966, + "loss/crossentropy": 2.048890709877014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19922567903995514, + "step": 18704 + }, + { + "epoch": 0.37412, + "grad_norm": 2.03125, + "grad_norm_var": 0.0163970947265625, + "learning_rate": 0.0001, + "loss": 4.0864, + "loss/crossentropy": 1.8759222626686096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19452380388975143, + "step": 18706 + }, + { + "epoch": 0.37416, + "grad_norm": 1.9609375, + "grad_norm_var": 0.015681966145833334, + "learning_rate": 0.0001, + "loss": 3.9964, + "loss/crossentropy": 1.693844199180603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16944261640310287, + "step": 18708 + }, + { + "epoch": 0.3742, + "grad_norm": 1.9765625, + "grad_norm_var": 0.01727879842122396, + "learning_rate": 0.0001, + "loss": 4.293, + "loss/crossentropy": 2.266264319419861, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2103852555155754, + "step": 18710 + }, + { + "epoch": 0.37424, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008250935872395834, + "learning_rate": 0.0001, + "loss": 3.8547, + "loss/crossentropy": 2.029367506504059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18859465420246124, + "step": 18712 + }, + { + "epoch": 0.37428, + "grad_norm": 2.640625, + "grad_norm_var": 0.039184315999348955, + "learning_rate": 0.0001, + "loss": 4.1141, + "loss/crossentropy": 1.9427701234817505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1960265040397644, + "step": 18714 + }, + { + "epoch": 0.37432, + "grad_norm": 2.078125, + "grad_norm_var": 0.03737360636393229, + "learning_rate": 0.0001, + "loss": 4.3087, + "loss/crossentropy": 2.2656137943267822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22287357598543167, + "step": 18716 + }, + { + "epoch": 0.37436, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0365386962890625, + "learning_rate": 0.0001, + "loss": 4.163, + "loss/crossentropy": 2.0447250604629517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20505183935165405, + "step": 18718 + }, + { + "epoch": 0.3744, + "grad_norm": 2.03125, + "grad_norm_var": 0.04125137329101562, + "learning_rate": 0.0001, + "loss": 3.6507, + "loss/crossentropy": 1.8176262378692627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17877614498138428, + "step": 18720 + }, + { + "epoch": 0.37444, + "grad_norm": 1.890625, + "grad_norm_var": 0.04017333984375, + "learning_rate": 0.0001, + "loss": 3.9368, + "loss/crossentropy": 1.8362378478050232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18403278291225433, + "step": 18722 + }, + { + "epoch": 0.37448, + "grad_norm": 2.140625, + "grad_norm_var": 0.04104410807291667, + "learning_rate": 0.0001, + "loss": 4.4322, + "loss/crossentropy": 2.43450927734375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21623602509498596, + "step": 18724 + }, + { + "epoch": 0.37452, + "grad_norm": 2.09375, + "grad_norm_var": 0.04133275349934896, + "learning_rate": 0.0001, + "loss": 4.3905, + "loss/crossentropy": 2.227471947669983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20000503957271576, + "step": 18726 + }, + { + "epoch": 0.37456, + "grad_norm": 1.8046875, + "grad_norm_var": 0.04317626953125, + "learning_rate": 0.0001, + "loss": 3.9138, + "loss/crossentropy": 1.9392182230949402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19422952830791473, + "step": 18728 + }, + { + "epoch": 0.3746, + "grad_norm": 1.96875, + "grad_norm_var": 0.014469401041666666, + "learning_rate": 0.0001, + "loss": 4.1284, + "loss/crossentropy": 2.0168241262435913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1774912029504776, + "step": 18730 + }, + { + "epoch": 0.37464, + "grad_norm": 1.8984375, + "grad_norm_var": 0.012859853108723958, + "learning_rate": 0.0001, + "loss": 3.8982, + "loss/crossentropy": 1.6861125230789185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16301175951957703, + "step": 18732 + }, + { + "epoch": 0.37468, + "grad_norm": 2.09375, + "grad_norm_var": 0.014111328125, + "learning_rate": 0.0001, + "loss": 4.2366, + "loss/crossentropy": 2.2991716861724854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21094633638858795, + "step": 18734 + }, + { + "epoch": 0.37472, + "grad_norm": 1.96875, + "grad_norm_var": 0.008115386962890625, + "learning_rate": 0.0001, + "loss": 4.0184, + "loss/crossentropy": 1.8746486902236938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19248205423355103, + "step": 18736 + }, + { + "epoch": 0.37476, + "grad_norm": 2.15625, + "grad_norm_var": 0.009642537434895833, + "learning_rate": 0.0001, + "loss": 4.1886, + "loss/crossentropy": 2.137068212032318, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1996043175458908, + "step": 18738 + }, + { + "epoch": 0.3748, + "grad_norm": 1.9296875, + "grad_norm_var": 0.015636952718098958, + "learning_rate": 0.0001, + "loss": 4.1503, + "loss/crossentropy": 1.724601149559021, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23018527776002884, + "step": 18740 + }, + { + "epoch": 0.37484, + "grad_norm": 1.9140625, + "grad_norm_var": 0.015584309895833334, + "learning_rate": 0.0001, + "loss": 4.1593, + "loss/crossentropy": 2.1407381296157837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21157334744930267, + "step": 18742 + }, + { + "epoch": 0.37488, + "grad_norm": 1.9453125, + "grad_norm_var": 0.013090006510416667, + "learning_rate": 0.0001, + "loss": 4.0371, + "loss/crossentropy": 2.2977999448776245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19883693754673004, + "step": 18744 + }, + { + "epoch": 0.37492, + "grad_norm": 2.078125, + "grad_norm_var": 0.018155924479166665, + "learning_rate": 0.0001, + "loss": 3.8912, + "loss/crossentropy": 2.096716046333313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21084215492010117, + "step": 18746 + }, + { + "epoch": 0.37496, + "grad_norm": 1.9453125, + "grad_norm_var": 0.019254302978515624, + "learning_rate": 0.0001, + "loss": 3.8508, + "loss/crossentropy": 1.7292688488960266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18280881643295288, + "step": 18748 + }, + { + "epoch": 0.375, + "grad_norm": 1.984375, + "grad_norm_var": 0.0191314697265625, + "learning_rate": 0.0001, + "loss": 3.9798, + "loss/crossentropy": 1.7562988996505737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1889612227678299, + "step": 18750 + }, + { + "epoch": 0.37504, + "grad_norm": 1.9453125, + "grad_norm_var": 0.02008641560872396, + "learning_rate": 0.0001, + "loss": 4.1498, + "loss/crossentropy": 2.1730951070785522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20177915692329407, + "step": 18752 + }, + { + "epoch": 0.37508, + "grad_norm": 2.0625, + "grad_norm_var": 0.018161773681640625, + "learning_rate": 0.0001, + "loss": 4.0316, + "loss/crossentropy": 1.7671055793762207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17018838971853256, + "step": 18754 + }, + { + "epoch": 0.37512, + "grad_norm": 2.015625, + "grad_norm_var": 0.010228474934895834, + "learning_rate": 0.0001, + "loss": 4.1922, + "loss/crossentropy": 2.2650365829467773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22090423107147217, + "step": 18756 + }, + { + "epoch": 0.37516, + "grad_norm": 1.921875, + "grad_norm_var": 0.01002197265625, + "learning_rate": 0.0001, + "loss": 4.1985, + "loss/crossentropy": 2.252619981765747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20853855460882187, + "step": 18758 + }, + { + "epoch": 0.3752, + "grad_norm": 2.109375, + "grad_norm_var": 0.011766560872395833, + "learning_rate": 0.0001, + "loss": 3.8947, + "loss/crossentropy": 2.022092342376709, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19293325394392014, + "step": 18760 + }, + { + "epoch": 0.37524, + "grad_norm": 2.078125, + "grad_norm_var": 0.010081990559895834, + "learning_rate": 0.0001, + "loss": 4.0289, + "loss/crossentropy": 1.7500890493392944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18832595646381378, + "step": 18762 + }, + { + "epoch": 0.37528, + "grad_norm": 1.8515625, + "grad_norm_var": 0.009220123291015625, + "learning_rate": 0.0001, + "loss": 3.7769, + "loss/crossentropy": 1.8201736211776733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1910964399576187, + "step": 18764 + }, + { + "epoch": 0.37532, + "grad_norm": 1.9375, + "grad_norm_var": 0.009989420572916666, + "learning_rate": 0.0001, + "loss": 4.0508, + "loss/crossentropy": 2.1644541025161743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20023848116397858, + "step": 18766 + }, + { + "epoch": 0.37536, + "grad_norm": 2.078125, + "grad_norm_var": 0.010436757405598959, + "learning_rate": 0.0001, + "loss": 4.163, + "loss/crossentropy": 2.275505781173706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23095671832561493, + "step": 18768 + }, + { + "epoch": 0.3754, + "grad_norm": 2.078125, + "grad_norm_var": 0.010773722330729167, + "learning_rate": 0.0001, + "loss": 4.0557, + "loss/crossentropy": 1.976850986480713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2125934213399887, + "step": 18770 + }, + { + "epoch": 0.37544, + "grad_norm": 2.140625, + "grad_norm_var": 0.012035115559895834, + "learning_rate": 0.0001, + "loss": 4.146, + "loss/crossentropy": 2.150836706161499, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19192685186862946, + "step": 18772 + }, + { + "epoch": 0.37548, + "grad_norm": 2.046875, + "grad_norm_var": 0.012400054931640625, + "learning_rate": 0.0001, + "loss": 4.2302, + "loss/crossentropy": 2.0790088176727295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2147749587893486, + "step": 18774 + }, + { + "epoch": 0.37552, + "grad_norm": 1.9765625, + "grad_norm_var": 0.010697428385416667, + "learning_rate": 0.0001, + "loss": 3.7918, + "loss/crossentropy": 1.8366054892539978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19662949442863464, + "step": 18776 + }, + { + "epoch": 0.37556, + "grad_norm": 2.015625, + "grad_norm_var": 0.008278147379557291, + "learning_rate": 0.0001, + "loss": 3.9056, + "loss/crossentropy": 1.6928801536560059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18292289972305298, + "step": 18778 + }, + { + "epoch": 0.3756, + "grad_norm": 1.96875, + "grad_norm_var": 0.007591756184895834, + "learning_rate": 0.0001, + "loss": 4.2872, + "loss/crossentropy": 2.015128195285797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20940368622541428, + "step": 18780 + }, + { + "epoch": 0.37564, + "grad_norm": 2.125, + "grad_norm_var": 0.0061757405598958336, + "learning_rate": 0.0001, + "loss": 4.3304, + "loss/crossentropy": 2.026209592819214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2227947860956192, + "step": 18782 + }, + { + "epoch": 0.37568, + "grad_norm": 2.0, + "grad_norm_var": 0.004349772135416667, + "learning_rate": 0.0001, + "loss": 4.0591, + "loss/crossentropy": 2.29964280128479, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23132510483264923, + "step": 18784 + }, + { + "epoch": 0.37572, + "grad_norm": 1.859375, + "grad_norm_var": 0.005366770426432291, + "learning_rate": 0.0001, + "loss": 3.9675, + "loss/crossentropy": 1.881381332874298, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17967566847801208, + "step": 18786 + }, + { + "epoch": 0.37576, + "grad_norm": 2.046875, + "grad_norm_var": 0.004198201497395833, + "learning_rate": 0.0001, + "loss": 4.1834, + "loss/crossentropy": 2.2231001257896423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21163878589868546, + "step": 18788 + }, + { + "epoch": 0.3758, + "grad_norm": 1.8984375, + "grad_norm_var": 0.004526519775390625, + "learning_rate": 0.0001, + "loss": 4.0475, + "loss/crossentropy": 2.3531078100204468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21745989471673965, + "step": 18790 + }, + { + "epoch": 0.37584, + "grad_norm": 2.046875, + "grad_norm_var": 0.004801432291666667, + "learning_rate": 0.0001, + "loss": 4.2549, + "loss/crossentropy": 2.1178460121154785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23571017384529114, + "step": 18792 + }, + { + "epoch": 0.37588, + "grad_norm": 1.84375, + "grad_norm_var": 0.005890909830729167, + "learning_rate": 0.0001, + "loss": 3.8373, + "loss/crossentropy": 2.0659135580062866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20817245543003082, + "step": 18794 + }, + { + "epoch": 0.37592, + "grad_norm": 1.9140625, + "grad_norm_var": 0.006318918863932292, + "learning_rate": 0.0001, + "loss": 4.0103, + "loss/crossentropy": 1.579395353794098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16713083535432816, + "step": 18796 + }, + { + "epoch": 0.37596, + "grad_norm": 1.953125, + "grad_norm_var": 0.005940500895182292, + "learning_rate": 0.0001, + "loss": 4.1272, + "loss/crossentropy": 2.102541923522949, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2174636349081993, + "step": 18798 + }, + { + "epoch": 0.376, + "grad_norm": 2.09375, + "grad_norm_var": 0.0067942301432291664, + "learning_rate": 0.0001, + "loss": 4.2622, + "loss/crossentropy": 1.9342190027236938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19780533015727997, + "step": 18800 + }, + { + "epoch": 0.37604, + "grad_norm": 1.9921875, + "grad_norm_var": 0.006121571858723958, + "learning_rate": 0.0001, + "loss": 4.0659, + "loss/crossentropy": 2.140671730041504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21276966482400894, + "step": 18802 + }, + { + "epoch": 0.37608, + "grad_norm": 2.21875, + "grad_norm_var": 0.009590657552083333, + "learning_rate": 0.0001, + "loss": 4.1929, + "loss/crossentropy": 1.9004405736923218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17683426290750504, + "step": 18804 + }, + { + "epoch": 0.37612, + "grad_norm": 1.859375, + "grad_norm_var": 0.010155232747395833, + "learning_rate": 0.0001, + "loss": 3.7163, + "loss/crossentropy": 1.9487649202346802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19463464617729187, + "step": 18806 + }, + { + "epoch": 0.37616, + "grad_norm": 1.8828125, + "grad_norm_var": 0.011262766520182292, + "learning_rate": 0.0001, + "loss": 3.9517, + "loss/crossentropy": 2.15252423286438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20531394332647324, + "step": 18808 + }, + { + "epoch": 0.3762, + "grad_norm": 1.8359375, + "grad_norm_var": 0.011592356363932292, + "learning_rate": 0.0001, + "loss": 4.0494, + "loss/crossentropy": 2.114433467388153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20183662325143814, + "step": 18810 + }, + { + "epoch": 0.37624, + "grad_norm": 2.0, + "grad_norm_var": 0.011205037434895834, + "learning_rate": 0.0001, + "loss": 3.9035, + "loss/crossentropy": 1.9229055047035217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18662934005260468, + "step": 18812 + }, + { + "epoch": 0.37628, + "grad_norm": 2.046875, + "grad_norm_var": 1.8609944661458333, + "learning_rate": 0.0001, + "loss": 4.2516, + "loss/crossentropy": 1.8242689371109009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17858239263296127, + "step": 18814 + }, + { + "epoch": 0.37632, + "grad_norm": 1.9765625, + "grad_norm_var": 1.864818318684896, + "learning_rate": 0.0001, + "loss": 3.8917, + "loss/crossentropy": 2.3987890481948853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2246004194021225, + "step": 18816 + }, + { + "epoch": 0.37636, + "grad_norm": 2.234375, + "grad_norm_var": 1.8565500895182292, + "learning_rate": 0.0001, + "loss": 4.3493, + "loss/crossentropy": 2.4376614093780518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24541915208101273, + "step": 18818 + }, + { + "epoch": 0.3764, + "grad_norm": 1.984375, + "grad_norm_var": 1.865612538655599, + "learning_rate": 0.0001, + "loss": 4.1344, + "loss/crossentropy": 2.2477601766586304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2178046852350235, + "step": 18820 + }, + { + "epoch": 0.37644, + "grad_norm": 1.96875, + "grad_norm_var": 1.8636464436848958, + "learning_rate": 0.0001, + "loss": 3.9685, + "loss/crossentropy": 1.9782747626304626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19459142535924911, + "step": 18822 + }, + { + "epoch": 0.37648, + "grad_norm": 2.0625, + "grad_norm_var": 1.8451738993326823, + "learning_rate": 0.0001, + "loss": 4.043, + "loss/crossentropy": 2.1214100122451782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2082664743065834, + "step": 18824 + }, + { + "epoch": 0.37652, + "grad_norm": 1.90625, + "grad_norm_var": 1.8489461263020834, + "learning_rate": 0.0001, + "loss": 3.8256, + "loss/crossentropy": 1.68446546792984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.167055182158947, + "step": 18826 + }, + { + "epoch": 0.37656, + "grad_norm": 1.9921875, + "grad_norm_var": 1.8380022684733073, + "learning_rate": 0.0001, + "loss": 4.0588, + "loss/crossentropy": 1.7720499634742737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18632127344608307, + "step": 18828 + }, + { + "epoch": 0.3766, + "grad_norm": 2.140625, + "grad_norm_var": 0.013492838541666666, + "learning_rate": 0.0001, + "loss": 4.2288, + "loss/crossentropy": 2.283332347869873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23949767649173737, + "step": 18830 + }, + { + "epoch": 0.37664, + "grad_norm": 1.9609375, + "grad_norm_var": 0.014644114176432292, + "learning_rate": 0.0001, + "loss": 3.9134, + "loss/crossentropy": 1.9924857020378113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20181651413440704, + "step": 18832 + }, + { + "epoch": 0.37668, + "grad_norm": 2.046875, + "grad_norm_var": 0.011091105143229167, + "learning_rate": 0.0001, + "loss": 3.8424, + "loss/crossentropy": 2.4015761613845825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2279737964272499, + "step": 18834 + }, + { + "epoch": 0.37672, + "grad_norm": 1.8515625, + "grad_norm_var": 0.014152018229166667, + "learning_rate": 0.0001, + "loss": 3.8636, + "loss/crossentropy": 2.0717111229896545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19089852273464203, + "step": 18836 + }, + { + "epoch": 0.37676, + "grad_norm": 2.09375, + "grad_norm_var": 0.0149566650390625, + "learning_rate": 0.0001, + "loss": 4.2694, + "loss/crossentropy": 1.9993728995323181, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19489262998104095, + "step": 18838 + }, + { + "epoch": 0.3768, + "grad_norm": 1.8125, + "grad_norm_var": 0.015949503580729166, + "learning_rate": 0.0001, + "loss": 3.7009, + "loss/crossentropy": 1.915448248386383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1923104077577591, + "step": 18840 + }, + { + "epoch": 0.37684, + "grad_norm": 1.890625, + "grad_norm_var": 0.016511027018229166, + "learning_rate": 0.0001, + "loss": 4.0424, + "loss/crossentropy": 2.292284607887268, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22159253805875778, + "step": 18842 + }, + { + "epoch": 0.37688, + "grad_norm": 1.9921875, + "grad_norm_var": 0.012784830729166667, + "learning_rate": 0.0001, + "loss": 3.9904, + "loss/crossentropy": 1.9938938617706299, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19937817752361298, + "step": 18844 + }, + { + "epoch": 0.37692, + "grad_norm": 1.9453125, + "grad_norm_var": 0.016454060872395832, + "learning_rate": 0.0001, + "loss": 4.1914, + "loss/crossentropy": 2.045258641242981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2652823179960251, + "step": 18846 + }, + { + "epoch": 0.37696, + "grad_norm": 1.9921875, + "grad_norm_var": 0.016707356770833334, + "learning_rate": 0.0001, + "loss": 4.0391, + "loss/crossentropy": 2.1330259442329407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19155749678611755, + "step": 18848 + }, + { + "epoch": 0.377, + "grad_norm": 1.921875, + "grad_norm_var": 0.017101796468098958, + "learning_rate": 0.0001, + "loss": 4.1422, + "loss/crossentropy": 2.2135089635849, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20062117278575897, + "step": 18850 + }, + { + "epoch": 0.37704, + "grad_norm": 2.046875, + "grad_norm_var": 0.013826243082682292, + "learning_rate": 0.0001, + "loss": 4.1188, + "loss/crossentropy": 2.073060691356659, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19443543255329132, + "step": 18852 + }, + { + "epoch": 0.37708, + "grad_norm": 1.96875, + "grad_norm_var": 0.014170074462890625, + "learning_rate": 0.0001, + "loss": 4.089, + "loss/crossentropy": 1.930641233921051, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19604724645614624, + "step": 18854 + }, + { + "epoch": 0.37712, + "grad_norm": 2.015625, + "grad_norm_var": 0.011252593994140626, + "learning_rate": 0.0001, + "loss": 4.0238, + "loss/crossentropy": 2.0813130140304565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21019181609153748, + "step": 18856 + }, + { + "epoch": 0.37716, + "grad_norm": 2.03125, + "grad_norm_var": 0.010080718994140625, + "learning_rate": 0.0001, + "loss": 3.9855, + "loss/crossentropy": 1.8528355956077576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19945576786994934, + "step": 18858 + }, + { + "epoch": 0.3772, + "grad_norm": 2.0, + "grad_norm_var": 0.011896769205729166, + "learning_rate": 0.0001, + "loss": 3.9655, + "loss/crossentropy": 2.32794725894928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22688604146242142, + "step": 18860 + }, + { + "epoch": 0.37724, + "grad_norm": 2.046875, + "grad_norm_var": 0.007264963785807292, + "learning_rate": 0.0001, + "loss": 4.25, + "loss/crossentropy": 2.061101734638214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1961553767323494, + "step": 18862 + }, + { + "epoch": 0.37728, + "grad_norm": 1.9609375, + "grad_norm_var": 0.00604248046875, + "learning_rate": 0.0001, + "loss": 3.7839, + "loss/crossentropy": 1.9127929210662842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20082338899374008, + "step": 18864 + }, + { + "epoch": 0.37732, + "grad_norm": 2.046875, + "grad_norm_var": 0.14662272135416668, + "learning_rate": 0.0001, + "loss": 4.289, + "loss/crossentropy": 2.001932919025421, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1881283074617386, + "step": 18866 + }, + { + "epoch": 0.37736, + "grad_norm": 1.9609375, + "grad_norm_var": 0.14527180989583333, + "learning_rate": 0.0001, + "loss": 4.0851, + "loss/crossentropy": 1.693075716495514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18698371946811676, + "step": 18868 + }, + { + "epoch": 0.3774, + "grad_norm": 1.984375, + "grad_norm_var": 0.1481402079264323, + "learning_rate": 0.0001, + "loss": 4.0484, + "loss/crossentropy": 1.879252314567566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18412816524505615, + "step": 18870 + }, + { + "epoch": 0.37744, + "grad_norm": 2.015625, + "grad_norm_var": 0.1550066630045573, + "learning_rate": 0.0001, + "loss": 4.3488, + "loss/crossentropy": 2.1819299459457397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20243355631828308, + "step": 18872 + }, + { + "epoch": 0.37748, + "grad_norm": 2.21875, + "grad_norm_var": 0.1576568603515625, + "learning_rate": 0.0001, + "loss": 4.0889, + "loss/crossentropy": 2.2487794160842896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20594948530197144, + "step": 18874 + }, + { + "epoch": 0.37752, + "grad_norm": 1.8984375, + "grad_norm_var": 0.15466079711914063, + "learning_rate": 0.0001, + "loss": 4.303, + "loss/crossentropy": 1.990889549255371, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19230055809020996, + "step": 18876 + }, + { + "epoch": 0.37756, + "grad_norm": 1.9765625, + "grad_norm_var": 0.1582763671875, + "learning_rate": 0.0001, + "loss": 4.0435, + "loss/crossentropy": 2.091560959815979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19504332542419434, + "step": 18878 + }, + { + "epoch": 0.3776, + "grad_norm": 2.109375, + "grad_norm_var": 0.1599273681640625, + "learning_rate": 0.0001, + "loss": 4.0776, + "loss/crossentropy": 1.9213955998420715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1994621828198433, + "step": 18880 + }, + { + "epoch": 0.37764, + "grad_norm": 1.8671875, + "grad_norm_var": 0.02360814412434896, + "learning_rate": 0.0001, + "loss": 4.0103, + "loss/crossentropy": 2.1728278398513794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21039214730262756, + "step": 18882 + }, + { + "epoch": 0.37768, + "grad_norm": 1.78125, + "grad_norm_var": 0.026341756184895832, + "learning_rate": 0.0001, + "loss": 4.052, + "loss/crossentropy": 1.889222264289856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17232363671064377, + "step": 18884 + }, + { + "epoch": 0.37772, + "grad_norm": 1.9921875, + "grad_norm_var": 0.026228841145833334, + "learning_rate": 0.0001, + "loss": 4.118, + "loss/crossentropy": 2.264981746673584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.213841512799263, + "step": 18886 + }, + { + "epoch": 0.37776, + "grad_norm": 1.78125, + "grad_norm_var": 0.013598378499348958, + "learning_rate": 0.0001, + "loss": 3.8843, + "loss/crossentropy": 1.8615361452102661, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1745932251214981, + "step": 18888 + }, + { + "epoch": 0.3778, + "grad_norm": 1.765625, + "grad_norm_var": 0.009266916910807292, + "learning_rate": 0.0001, + "loss": 3.5687, + "loss/crossentropy": 2.1459723711013794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21431762725114822, + "step": 18890 + }, + { + "epoch": 0.37784, + "grad_norm": 2.0625, + "grad_norm_var": 0.010479482014973958, + "learning_rate": 0.0001, + "loss": 4.2125, + "loss/crossentropy": 1.8859283328056335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20627497136592865, + "step": 18892 + }, + { + "epoch": 0.37788, + "grad_norm": 2.046875, + "grad_norm_var": 0.012211100260416666, + "learning_rate": 0.0001, + "loss": 3.9978, + "loss/crossentropy": 1.824287474155426, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17877592146396637, + "step": 18894 + }, + { + "epoch": 0.37792, + "grad_norm": 1.953125, + "grad_norm_var": 0.012414296468098959, + "learning_rate": 0.0001, + "loss": 3.9857, + "loss/crossentropy": 2.486180305480957, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2033834606409073, + "step": 18896 + }, + { + "epoch": 0.37796, + "grad_norm": 2.046875, + "grad_norm_var": 0.012981923421223958, + "learning_rate": 0.0001, + "loss": 4.2339, + "loss/crossentropy": 2.0677965879440308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21111362427473068, + "step": 18898 + }, + { + "epoch": 0.378, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011107381184895833, + "learning_rate": 0.0001, + "loss": 4.2559, + "loss/crossentropy": 1.941792368888855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18762809038162231, + "step": 18900 + }, + { + "epoch": 0.37804, + "grad_norm": 2.078125, + "grad_norm_var": 0.012504069010416667, + "learning_rate": 0.0001, + "loss": 3.8028, + "loss/crossentropy": 1.907107174396515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2050766795873642, + "step": 18902 + }, + { + "epoch": 0.37808, + "grad_norm": 1.9453125, + "grad_norm_var": 0.009242502848307292, + "learning_rate": 0.0001, + "loss": 3.9742, + "loss/crossentropy": 1.5920222997665405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17366845160722733, + "step": 18904 + }, + { + "epoch": 0.37812, + "grad_norm": 1.9375, + "grad_norm_var": 0.0066650390625, + "learning_rate": 0.0001, + "loss": 3.9581, + "loss/crossentropy": 2.054854154586792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20102889090776443, + "step": 18906 + }, + { + "epoch": 0.37816, + "grad_norm": 1.90625, + "grad_norm_var": 0.0067789713541666664, + "learning_rate": 0.0001, + "loss": 4.2582, + "loss/crossentropy": 2.42835795879364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2154095396399498, + "step": 18908 + }, + { + "epoch": 0.3782, + "grad_norm": 2.09375, + "grad_norm_var": 0.007169596354166667, + "learning_rate": 0.0001, + "loss": 4.1466, + "loss/crossentropy": 2.1060370206832886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20842333883047104, + "step": 18910 + }, + { + "epoch": 0.37824, + "grad_norm": 1.96875, + "grad_norm_var": 0.008194986979166667, + "learning_rate": 0.0001, + "loss": 4.1587, + "loss/crossentropy": 2.0559674501419067, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2195875570178032, + "step": 18912 + }, + { + "epoch": 0.37828, + "grad_norm": 2.03125, + "grad_norm_var": 0.008888498942057291, + "learning_rate": 0.0001, + "loss": 4.2786, + "loss/crossentropy": 2.240954041481018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20908969640731812, + "step": 18914 + }, + { + "epoch": 0.37832, + "grad_norm": 1.96875, + "grad_norm_var": 0.008747355143229166, + "learning_rate": 0.0001, + "loss": 4.0729, + "loss/crossentropy": 2.0393139123916626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19933026283979416, + "step": 18916 + }, + { + "epoch": 0.37836, + "grad_norm": 2.109375, + "grad_norm_var": 0.007682291666666666, + "learning_rate": 0.0001, + "loss": 4.0969, + "loss/crossentropy": 2.0294516682624817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20086780190467834, + "step": 18918 + }, + { + "epoch": 0.3784, + "grad_norm": 1.96875, + "grad_norm_var": 0.007425944010416667, + "learning_rate": 0.0001, + "loss": 3.9138, + "loss/crossentropy": 1.8509765267372131, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18834587186574936, + "step": 18920 + }, + { + "epoch": 0.37844, + "grad_norm": 1.875, + "grad_norm_var": 0.0068267822265625, + "learning_rate": 0.0001, + "loss": 3.8669, + "loss/crossentropy": 1.8817378878593445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17679119110107422, + "step": 18922 + }, + { + "epoch": 0.37848, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005975087483723958, + "learning_rate": 0.0001, + "loss": 3.9276, + "loss/crossentropy": 1.766166627407074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16948368400335312, + "step": 18924 + }, + { + "epoch": 0.37852, + "grad_norm": 1.8671875, + "grad_norm_var": 0.006786855061848959, + "learning_rate": 0.0001, + "loss": 3.8948, + "loss/crossentropy": 2.251446485519409, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22272750735282898, + "step": 18926 + }, + { + "epoch": 0.37856, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0045125325520833336, + "learning_rate": 0.0001, + "loss": 4.1119, + "loss/crossentropy": 2.0006097555160522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21003134548664093, + "step": 18928 + }, + { + "epoch": 0.3786, + "grad_norm": 2.03125, + "grad_norm_var": 0.0036936442057291666, + "learning_rate": 0.0001, + "loss": 3.7731, + "loss/crossentropy": 2.2906641960144043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23650038242340088, + "step": 18930 + }, + { + "epoch": 0.37864, + "grad_norm": 2.03125, + "grad_norm_var": 0.0039866129557291664, + "learning_rate": 0.0001, + "loss": 4.1848, + "loss/crossentropy": 2.185898005962372, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21387950330972672, + "step": 18932 + }, + { + "epoch": 0.37868, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0028928120930989585, + "learning_rate": 0.0001, + "loss": 4.1808, + "loss/crossentropy": 2.2457324266433716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21054718643426895, + "step": 18934 + }, + { + "epoch": 0.37872, + "grad_norm": 2.046875, + "grad_norm_var": 0.0032793680826822915, + "learning_rate": 0.0001, + "loss": 4.2711, + "loss/crossentropy": 2.1835416555404663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23258651047945023, + "step": 18936 + }, + { + "epoch": 0.37876, + "grad_norm": 2.078125, + "grad_norm_var": 0.0029436747233072915, + "learning_rate": 0.0001, + "loss": 4.1555, + "loss/crossentropy": 2.1908507347106934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20261384546756744, + "step": 18938 + }, + { + "epoch": 0.3788, + "grad_norm": 2.078125, + "grad_norm_var": 0.0035336812337239585, + "learning_rate": 0.0001, + "loss": 4.1359, + "loss/crossentropy": 2.276697278022766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20843130350112915, + "step": 18940 + }, + { + "epoch": 0.37884, + "grad_norm": 1.9375, + "grad_norm_var": 0.0023251851399739582, + "learning_rate": 0.0001, + "loss": 4.1309, + "loss/crossentropy": 2.16153222322464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19693513214588165, + "step": 18942 + }, + { + "epoch": 0.37888, + "grad_norm": 1.96875, + "grad_norm_var": 0.0023590087890625, + "learning_rate": 0.0001, + "loss": 3.9822, + "loss/crossentropy": 2.293270707130432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21085364371538162, + "step": 18944 + }, + { + "epoch": 0.37892, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0025042215983072918, + "learning_rate": 0.0001, + "loss": 4.1291, + "loss/crossentropy": 2.0382518768310547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21567383408546448, + "step": 18946 + }, + { + "epoch": 0.37896, + "grad_norm": 1.8359375, + "grad_norm_var": 0.00894775390625, + "learning_rate": 0.0001, + "loss": 3.9993, + "loss/crossentropy": 2.038296341896057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19963379949331284, + "step": 18948 + }, + { + "epoch": 0.379, + "grad_norm": 1.953125, + "grad_norm_var": 0.00994873046875, + "learning_rate": 0.0001, + "loss": 4.2363, + "loss/crossentropy": 2.1765074729919434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19551265239715576, + "step": 18950 + }, + { + "epoch": 0.37904, + "grad_norm": 2.015625, + "grad_norm_var": 0.010591379801432292, + "learning_rate": 0.0001, + "loss": 3.8641, + "loss/crossentropy": 1.870033621788025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17858022451400757, + "step": 18952 + }, + { + "epoch": 0.37908, + "grad_norm": 2.0625, + "grad_norm_var": 0.011083730061848958, + "learning_rate": 0.0001, + "loss": 4.1877, + "loss/crossentropy": 1.8716632723808289, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20534101128578186, + "step": 18954 + }, + { + "epoch": 0.37912, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0111236572265625, + "learning_rate": 0.0001, + "loss": 4.1662, + "loss/crossentropy": 1.925970435142517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19596435129642487, + "step": 18956 + }, + { + "epoch": 0.37916, + "grad_norm": 1.890625, + "grad_norm_var": 0.011800130208333334, + "learning_rate": 0.0001, + "loss": 4.126, + "loss/crossentropy": 2.2280589938163757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21037188172340393, + "step": 18958 + }, + { + "epoch": 0.3792, + "grad_norm": 2.0, + "grad_norm_var": 0.0150299072265625, + "learning_rate": 0.0001, + "loss": 3.7683, + "loss/crossentropy": 1.8350458145141602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1821446716785431, + "step": 18960 + }, + { + "epoch": 0.37924, + "grad_norm": 1.953125, + "grad_norm_var": 0.015018463134765625, + "learning_rate": 0.0001, + "loss": 4.0376, + "loss/crossentropy": 2.0393940210342407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18378175050020218, + "step": 18962 + }, + { + "epoch": 0.37928, + "grad_norm": 1.9375, + "grad_norm_var": 0.00897216796875, + "learning_rate": 0.0001, + "loss": 4.0018, + "loss/crossentropy": 1.7470228672027588, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19222351908683777, + "step": 18964 + }, + { + "epoch": 0.37932, + "grad_norm": 2.203125, + "grad_norm_var": 0.011742146809895833, + "learning_rate": 0.0001, + "loss": 4.1791, + "loss/crossentropy": 2.3943980932235718, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23214909434318542, + "step": 18966 + }, + { + "epoch": 0.37936, + "grad_norm": 1.90625, + "grad_norm_var": 0.012239329020182292, + "learning_rate": 0.0001, + "loss": 3.7877, + "loss/crossentropy": 1.8153263330459595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19808300584554672, + "step": 18968 + }, + { + "epoch": 0.3794, + "grad_norm": 2.0625, + "grad_norm_var": 0.011171213785807292, + "learning_rate": 0.0001, + "loss": 4.0122, + "loss/crossentropy": 2.0037755370140076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20173701643943787, + "step": 18970 + }, + { + "epoch": 0.37944, + "grad_norm": 1.8828125, + "grad_norm_var": 0.012168121337890626, + "learning_rate": 0.0001, + "loss": 4.1069, + "loss/crossentropy": 2.144823908805847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21421342343091965, + "step": 18972 + }, + { + "epoch": 0.37948, + "grad_norm": 2.015625, + "grad_norm_var": 0.011565907796223959, + "learning_rate": 0.0001, + "loss": 4.2788, + "loss/crossentropy": 2.2151081562042236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20057211816310883, + "step": 18974 + }, + { + "epoch": 0.37952, + "grad_norm": 2.03125, + "grad_norm_var": 0.008976236979166666, + "learning_rate": 0.0001, + "loss": 4.5061, + "loss/crossentropy": 2.341770827770233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21535535156726837, + "step": 18976 + }, + { + "epoch": 0.37956, + "grad_norm": 2.109375, + "grad_norm_var": 0.009520467122395833, + "learning_rate": 0.0001, + "loss": 4.1399, + "loss/crossentropy": 1.8952747583389282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17530933022499084, + "step": 18978 + }, + { + "epoch": 0.3796, + "grad_norm": 2.03125, + "grad_norm_var": 0.008885701497395834, + "learning_rate": 0.0001, + "loss": 4.141, + "loss/crossentropy": 2.2774226665496826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20682721585035324, + "step": 18980 + }, + { + "epoch": 0.37964, + "grad_norm": 2.078125, + "grad_norm_var": 0.008508046468098959, + "learning_rate": 0.0001, + "loss": 3.982, + "loss/crossentropy": 1.9596800208091736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18774595111608505, + "step": 18982 + }, + { + "epoch": 0.37968, + "grad_norm": 1.8359375, + "grad_norm_var": 0.009309895833333333, + "learning_rate": 0.0001, + "loss": 4.008, + "loss/crossentropy": 2.1485220193862915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19221040606498718, + "step": 18984 + }, + { + "epoch": 0.37972, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009039052327473958, + "learning_rate": 0.0001, + "loss": 4.142, + "loss/crossentropy": 2.2140207290649414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2012096345424652, + "step": 18986 + }, + { + "epoch": 0.37976, + "grad_norm": 1.890625, + "grad_norm_var": 0.008063761393229167, + "learning_rate": 0.0001, + "loss": 4.1088, + "loss/crossentropy": 1.9045360684394836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19290773570537567, + "step": 18988 + }, + { + "epoch": 0.3798, + "grad_norm": 2.03125, + "grad_norm_var": 0.008072662353515624, + "learning_rate": 0.0001, + "loss": 4.0349, + "loss/crossentropy": 1.8351407051086426, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18892163783311844, + "step": 18990 + }, + { + "epoch": 0.37984, + "grad_norm": 1.8984375, + "grad_norm_var": 0.007865397135416667, + "learning_rate": 0.0001, + "loss": 3.9136, + "loss/crossentropy": 2.021119713783264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19214972108602524, + "step": 18992 + }, + { + "epoch": 0.37988, + "grad_norm": 2.0, + "grad_norm_var": 0.006394195556640625, + "learning_rate": 0.0001, + "loss": 4.2961, + "loss/crossentropy": 2.0591527223587036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20295391231775284, + "step": 18994 + }, + { + "epoch": 0.37992, + "grad_norm": 2.0625, + "grad_norm_var": 0.006566365559895833, + "learning_rate": 0.0001, + "loss": 4.1135, + "loss/crossentropy": 1.9794987440109253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20964892208576202, + "step": 18996 + }, + { + "epoch": 0.37996, + "grad_norm": 2.078125, + "grad_norm_var": 0.0042111714680989586, + "learning_rate": 0.0001, + "loss": 4.0699, + "loss/crossentropy": 2.116178512573242, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1962205320596695, + "step": 18998 + }, + { + "epoch": 0.38, + "grad_norm": 1.8828125, + "grad_norm_var": 0.003714752197265625, + "learning_rate": 0.0001, + "loss": 4.0683, + "loss/crossentropy": 2.2458595037460327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1981743946671486, + "step": 19000 + }, + { + "epoch": 0.38004, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0038266499837239583, + "learning_rate": 0.0001, + "loss": 4.2085, + "loss/crossentropy": 2.0612659454345703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20526322722434998, + "step": 19002 + }, + { + "epoch": 0.38008, + "grad_norm": 1.9296875, + "grad_norm_var": 0.00347900390625, + "learning_rate": 0.0001, + "loss": 4.191, + "loss/crossentropy": 2.0810243487358093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.222077377140522, + "step": 19004 + }, + { + "epoch": 0.38012, + "grad_norm": 1.890625, + "grad_norm_var": 0.003940582275390625, + "learning_rate": 0.0001, + "loss": 4.0957, + "loss/crossentropy": 2.2651617527008057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2106148600578308, + "step": 19006 + }, + { + "epoch": 0.38016, + "grad_norm": 1.8125, + "grad_norm_var": 0.0056793212890625, + "learning_rate": 0.0001, + "loss": 3.9671, + "loss/crossentropy": 1.9701108932495117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18778277933597565, + "step": 19008 + }, + { + "epoch": 0.3802, + "grad_norm": 1.859375, + "grad_norm_var": 0.007024892171223958, + "learning_rate": 0.0001, + "loss": 4.1265, + "loss/crossentropy": 1.9822896122932434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25280050933361053, + "step": 19010 + }, + { + "epoch": 0.38024, + "grad_norm": 2.0, + "grad_norm_var": 0.0063435872395833336, + "learning_rate": 0.0001, + "loss": 3.9577, + "loss/crossentropy": 2.020545542240143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19008295983076096, + "step": 19012 + }, + { + "epoch": 0.38028, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007336171468098959, + "learning_rate": 0.0001, + "loss": 4.2044, + "loss/crossentropy": 2.0922394394874573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1972956582903862, + "step": 19014 + }, + { + "epoch": 0.38032, + "grad_norm": 1.859375, + "grad_norm_var": 0.007067616780598958, + "learning_rate": 0.0001, + "loss": 3.9726, + "loss/crossentropy": 2.028052031993866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18549586832523346, + "step": 19016 + }, + { + "epoch": 0.38036, + "grad_norm": 1.6796875, + "grad_norm_var": 0.011395009358723958, + "learning_rate": 0.0001, + "loss": 4.0118, + "loss/crossentropy": 1.9518752098083496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17747169733047485, + "step": 19018 + }, + { + "epoch": 0.3804, + "grad_norm": 2.078125, + "grad_norm_var": 0.013492838541666666, + "learning_rate": 0.0001, + "loss": 4.1404, + "loss/crossentropy": 1.8896766901016235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20509368181228638, + "step": 19020 + }, + { + "epoch": 0.38044, + "grad_norm": 2.046875, + "grad_norm_var": 0.01407470703125, + "learning_rate": 0.0001, + "loss": 4.1159, + "loss/crossentropy": 2.10469388961792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2058955579996109, + "step": 19022 + }, + { + "epoch": 0.38048, + "grad_norm": 1.875, + "grad_norm_var": 0.012717437744140626, + "learning_rate": 0.0001, + "loss": 3.8601, + "loss/crossentropy": 1.7341394424438477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19358114898204803, + "step": 19024 + }, + { + "epoch": 0.38052, + "grad_norm": 1.7734375, + "grad_norm_var": 0.013634999593098959, + "learning_rate": 0.0001, + "loss": 3.8528, + "loss/crossentropy": 2.186724543571472, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1981809437274933, + "step": 19026 + }, + { + "epoch": 0.38056, + "grad_norm": 1.8984375, + "grad_norm_var": 0.014070638020833333, + "learning_rate": 0.0001, + "loss": 4.2421, + "loss/crossentropy": 2.228869318962097, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20712755620479584, + "step": 19028 + }, + { + "epoch": 0.3806, + "grad_norm": 2.125, + "grad_norm_var": 0.0140869140625, + "learning_rate": 0.0001, + "loss": 4.126, + "loss/crossentropy": 2.2783373594284058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20916558057069778, + "step": 19030 + }, + { + "epoch": 0.38064, + "grad_norm": 1.96875, + "grad_norm_var": 0.013270823160807292, + "learning_rate": 0.0001, + "loss": 4.2744, + "loss/crossentropy": 2.2671823501586914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19389723241329193, + "step": 19032 + }, + { + "epoch": 0.38068, + "grad_norm": 1.859375, + "grad_norm_var": 0.008540852864583334, + "learning_rate": 0.0001, + "loss": 4.0267, + "loss/crossentropy": 1.948801040649414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18766260892152786, + "step": 19034 + }, + { + "epoch": 0.38072, + "grad_norm": 1.90625, + "grad_norm_var": 0.007818349202473958, + "learning_rate": 0.0001, + "loss": 4.0856, + "loss/crossentropy": 1.895260751247406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1848379746079445, + "step": 19036 + }, + { + "epoch": 0.38076, + "grad_norm": 1.8046875, + "grad_norm_var": 0.008504231770833334, + "learning_rate": 0.0001, + "loss": 3.8189, + "loss/crossentropy": 1.8295226097106934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1816689372062683, + "step": 19038 + }, + { + "epoch": 0.3808, + "grad_norm": 2.0, + "grad_norm_var": 0.007830556233723958, + "learning_rate": 0.0001, + "loss": 4.0363, + "loss/crossentropy": 1.6487661004066467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.178475059568882, + "step": 19040 + }, + { + "epoch": 0.38084, + "grad_norm": 1.921875, + "grad_norm_var": 0.0058095296223958336, + "learning_rate": 0.0001, + "loss": 3.828, + "loss/crossentropy": 1.9264054894447327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2009180784225464, + "step": 19042 + }, + { + "epoch": 0.38088, + "grad_norm": 1.84375, + "grad_norm_var": 0.007989247639973959, + "learning_rate": 0.0001, + "loss": 4.2636, + "loss/crossentropy": 2.3018531799316406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2039026990532875, + "step": 19044 + }, + { + "epoch": 0.38092, + "grad_norm": 2.15625, + "grad_norm_var": 0.009093983968098959, + "learning_rate": 0.0001, + "loss": 4.0201, + "loss/crossentropy": 2.135006010532379, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.172908216714859, + "step": 19046 + }, + { + "epoch": 0.38096, + "grad_norm": 1.9140625, + "grad_norm_var": 0.009476725260416667, + "learning_rate": 0.0001, + "loss": 4.2064, + "loss/crossentropy": 2.1111900806427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19583145529031754, + "step": 19048 + }, + { + "epoch": 0.381, + "grad_norm": 2.015625, + "grad_norm_var": 0.0096099853515625, + "learning_rate": 0.0001, + "loss": 4.1165, + "loss/crossentropy": 2.208159327507019, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21107421815395355, + "step": 19050 + }, + { + "epoch": 0.38104, + "grad_norm": 2.203125, + "grad_norm_var": 0.012894694010416667, + "learning_rate": 0.0001, + "loss": 4.1722, + "loss/crossentropy": 2.144998788833618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2108323574066162, + "step": 19052 + }, + { + "epoch": 0.38108, + "grad_norm": 2.125, + "grad_norm_var": 0.0116455078125, + "learning_rate": 0.0001, + "loss": 4.0938, + "loss/crossentropy": 2.0046940445899963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20454590767621994, + "step": 19054 + }, + { + "epoch": 0.38112, + "grad_norm": 2.03125, + "grad_norm_var": 0.010992177327473958, + "learning_rate": 0.0001, + "loss": 4.2219, + "loss/crossentropy": 2.042281448841095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21530431509017944, + "step": 19056 + }, + { + "epoch": 0.38116, + "grad_norm": 2.09375, + "grad_norm_var": 0.0102691650390625, + "learning_rate": 0.0001, + "loss": 4.2701, + "loss/crossentropy": 2.0579177141189575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20769091695547104, + "step": 19058 + }, + { + "epoch": 0.3812, + "grad_norm": 2.3125, + "grad_norm_var": 0.012719472249348959, + "learning_rate": 0.0001, + "loss": 4.0561, + "loss/crossentropy": 1.8340198993682861, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1927536353468895, + "step": 19060 + }, + { + "epoch": 0.38124, + "grad_norm": 2.375, + "grad_norm_var": 0.016584269205729165, + "learning_rate": 0.0001, + "loss": 4.0412, + "loss/crossentropy": 2.146699070930481, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18840950727462769, + "step": 19062 + }, + { + "epoch": 0.38128, + "grad_norm": 1.90625, + "grad_norm_var": 0.017600250244140626, + "learning_rate": 0.0001, + "loss": 3.7448, + "loss/crossentropy": 2.0206886529922485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20354034006595612, + "step": 19064 + }, + { + "epoch": 0.38132, + "grad_norm": 1.78125, + "grad_norm_var": 0.023372141520182292, + "learning_rate": 0.0001, + "loss": 4.0047, + "loss/crossentropy": 1.9568690061569214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.183781698346138, + "step": 19066 + }, + { + "epoch": 0.38136, + "grad_norm": 2.109375, + "grad_norm_var": 0.022855631510416665, + "learning_rate": 0.0001, + "loss": 4.0898, + "loss/crossentropy": 2.092573404312134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20443417131900787, + "step": 19068 + }, + { + "epoch": 0.3814, + "grad_norm": 1.84375, + "grad_norm_var": 0.025770823160807293, + "learning_rate": 0.0001, + "loss": 4.2253, + "loss/crossentropy": 2.17366099357605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2064819559454918, + "step": 19070 + }, + { + "epoch": 0.38144, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0265777587890625, + "learning_rate": 0.0001, + "loss": 4.0398, + "loss/crossentropy": 1.7935467958450317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19256697595119476, + "step": 19072 + }, + { + "epoch": 0.38148, + "grad_norm": 1.9765625, + "grad_norm_var": 0.026554107666015625, + "learning_rate": 0.0001, + "loss": 3.9846, + "loss/crossentropy": 1.995141625404358, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19474036246538162, + "step": 19074 + }, + { + "epoch": 0.38152, + "grad_norm": 1.8203125, + "grad_norm_var": 0.023221588134765624, + "learning_rate": 0.0001, + "loss": 3.9058, + "loss/crossentropy": 2.140002131462097, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19610393047332764, + "step": 19076 + }, + { + "epoch": 0.38156, + "grad_norm": 2.0, + "grad_norm_var": 0.013288370768229167, + "learning_rate": 0.0001, + "loss": 4.1176, + "loss/crossentropy": 2.1740564107894897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20218566060066223, + "step": 19078 + }, + { + "epoch": 0.3816, + "grad_norm": 2.1875, + "grad_norm_var": 0.015925089518229168, + "learning_rate": 0.0001, + "loss": 4.3043, + "loss/crossentropy": 2.065530776977539, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21683599054813385, + "step": 19080 + }, + { + "epoch": 0.38164, + "grad_norm": 2.0625, + "grad_norm_var": 0.0127349853515625, + "learning_rate": 0.0001, + "loss": 4.2859, + "loss/crossentropy": 1.844248354434967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19479060918092728, + "step": 19082 + }, + { + "epoch": 0.38168, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011136627197265625, + "learning_rate": 0.0001, + "loss": 3.886, + "loss/crossentropy": 1.8596556186676025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2059667930006981, + "step": 19084 + }, + { + "epoch": 0.38172, + "grad_norm": 2.234375, + "grad_norm_var": 0.014731597900390626, + "learning_rate": 0.0001, + "loss": 4.0982, + "loss/crossentropy": 2.08061683177948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20623548328876495, + "step": 19086 + }, + { + "epoch": 0.38176, + "grad_norm": 2.03125, + "grad_norm_var": 0.014615631103515625, + "learning_rate": 0.0001, + "loss": 4.0013, + "loss/crossentropy": 1.888766884803772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18000122904777527, + "step": 19088 + }, + { + "epoch": 0.3818, + "grad_norm": 1.9140625, + "grad_norm_var": 0.017032877604166666, + "learning_rate": 0.0001, + "loss": 3.8784, + "loss/crossentropy": 2.021374225616455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21468255668878555, + "step": 19090 + }, + { + "epoch": 0.38184, + "grad_norm": 2.0625, + "grad_norm_var": 0.016805013020833332, + "learning_rate": 0.0001, + "loss": 3.8327, + "loss/crossentropy": 2.083697557449341, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21188172698020935, + "step": 19092 + }, + { + "epoch": 0.38188, + "grad_norm": 2.171875, + "grad_norm_var": 0.02019017537434896, + "learning_rate": 0.0001, + "loss": 3.9617, + "loss/crossentropy": 2.1484656929969788, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21978207677602768, + "step": 19094 + }, + { + "epoch": 0.38192, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0218414306640625, + "learning_rate": 0.0001, + "loss": 4.1728, + "loss/crossentropy": 2.321221709251404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22688252478837967, + "step": 19096 + }, + { + "epoch": 0.38196, + "grad_norm": 2.703125, + "grad_norm_var": 0.0518218994140625, + "learning_rate": 0.0001, + "loss": 4.157, + "loss/crossentropy": 2.0623167753219604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2148035168647766, + "step": 19098 + }, + { + "epoch": 0.382, + "grad_norm": 2.234375, + "grad_norm_var": 0.05501708984375, + "learning_rate": 0.0001, + "loss": 4.0255, + "loss/crossentropy": 1.8845015168190002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1880025863647461, + "step": 19100 + }, + { + "epoch": 0.38204, + "grad_norm": 1.9140625, + "grad_norm_var": 0.05045750935872396, + "learning_rate": 0.0001, + "loss": 3.8361, + "loss/crossentropy": 1.927463173866272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17446774244308472, + "step": 19102 + }, + { + "epoch": 0.38208, + "grad_norm": 2.0, + "grad_norm_var": 0.0536529541015625, + "learning_rate": 0.0001, + "loss": 4.2909, + "loss/crossentropy": 2.6780699491500854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22951193153858185, + "step": 19104 + }, + { + "epoch": 0.38212, + "grad_norm": 1.875, + "grad_norm_var": 0.05358250935872396, + "learning_rate": 0.0001, + "loss": 3.9188, + "loss/crossentropy": 2.2530760765075684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2141779437661171, + "step": 19106 + }, + { + "epoch": 0.38216, + "grad_norm": 1.90625, + "grad_norm_var": 0.07016499837239583, + "learning_rate": 0.0001, + "loss": 4.1647, + "loss/crossentropy": 2.0292217135429382, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20607471466064453, + "step": 19108 + }, + { + "epoch": 0.3822, + "grad_norm": 2.015625, + "grad_norm_var": 0.07394790649414062, + "learning_rate": 0.0001, + "loss": 3.8144, + "loss/crossentropy": 1.8341345191001892, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18437029421329498, + "step": 19110 + }, + { + "epoch": 0.38224, + "grad_norm": 2.078125, + "grad_norm_var": 0.06982421875, + "learning_rate": 0.0001, + "loss": 4.1782, + "loss/crossentropy": 1.9262341260910034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1894589066505432, + "step": 19112 + }, + { + "epoch": 0.38228, + "grad_norm": 1.9609375, + "grad_norm_var": 0.04212824503580729, + "learning_rate": 0.0001, + "loss": 4.2473, + "loss/crossentropy": 1.9369717240333557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17737293988466263, + "step": 19114 + }, + { + "epoch": 0.38232, + "grad_norm": 1.953125, + "grad_norm_var": 0.03845621744791667, + "learning_rate": 0.0001, + "loss": 4.0904, + "loss/crossentropy": 1.7547513842582703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1895698457956314, + "step": 19116 + }, + { + "epoch": 0.38236, + "grad_norm": 2.15625, + "grad_norm_var": 0.042789459228515625, + "learning_rate": 0.0001, + "loss": 4.2814, + "loss/crossentropy": 2.2367511987686157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22208665311336517, + "step": 19118 + }, + { + "epoch": 0.3824, + "grad_norm": 1.890625, + "grad_norm_var": 0.040169016520182295, + "learning_rate": 0.0001, + "loss": 4.1671, + "loss/crossentropy": 2.248104691505432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2132478505373001, + "step": 19120 + }, + { + "epoch": 0.38244, + "grad_norm": 1.7890625, + "grad_norm_var": 0.040415191650390626, + "learning_rate": 0.0001, + "loss": 4.0371, + "loss/crossentropy": 2.237170696258545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21920116990804672, + "step": 19122 + }, + { + "epoch": 0.38248, + "grad_norm": 1.84375, + "grad_norm_var": 0.018027496337890626, + "learning_rate": 0.0001, + "loss": 4.1011, + "loss/crossentropy": 1.82357919216156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1723759025335312, + "step": 19124 + }, + { + "epoch": 0.38252, + "grad_norm": 1.8046875, + "grad_norm_var": 0.018668619791666667, + "learning_rate": 0.0001, + "loss": 3.8541, + "loss/crossentropy": 1.9371765851974487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19782865047454834, + "step": 19126 + }, + { + "epoch": 0.38256, + "grad_norm": 2.1875, + "grad_norm_var": 0.0208404541015625, + "learning_rate": 0.0001, + "loss": 4.1305, + "loss/crossentropy": 1.8295999765396118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1756351813673973, + "step": 19128 + }, + { + "epoch": 0.3826, + "grad_norm": 2.046875, + "grad_norm_var": 0.044077301025390626, + "learning_rate": 0.0001, + "loss": 4.0657, + "loss/crossentropy": 2.0991050601005554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17175115644931793, + "step": 19130 + }, + { + "epoch": 0.38264, + "grad_norm": 2.078125, + "grad_norm_var": 0.04390640258789062, + "learning_rate": 0.0001, + "loss": 4.1359, + "loss/crossentropy": 2.2707191705703735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22085107117891312, + "step": 19132 + }, + { + "epoch": 0.38268, + "grad_norm": 2.015625, + "grad_norm_var": 0.03630345662434896, + "learning_rate": 0.0001, + "loss": 4.352, + "loss/crossentropy": 2.237929582595825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2524951994419098, + "step": 19134 + }, + { + "epoch": 0.38272, + "grad_norm": 1.9609375, + "grad_norm_var": 0.03629735310872396, + "learning_rate": 0.0001, + "loss": 4.0293, + "loss/crossentropy": 1.9790211915969849, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18156784772872925, + "step": 19136 + }, + { + "epoch": 0.38276, + "grad_norm": 1.890625, + "grad_norm_var": 0.034795888264973956, + "learning_rate": 0.0001, + "loss": 3.9467, + "loss/crossentropy": 1.9819305539131165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18941760063171387, + "step": 19138 + }, + { + "epoch": 0.3828, + "grad_norm": 2.046875, + "grad_norm_var": 0.032022857666015626, + "learning_rate": 0.0001, + "loss": 3.9699, + "loss/crossentropy": 1.9541404843330383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.195252887904644, + "step": 19140 + }, + { + "epoch": 0.38284, + "grad_norm": 2.0, + "grad_norm_var": 0.028824869791666666, + "learning_rate": 0.0001, + "loss": 3.9563, + "loss/crossentropy": 2.1265366673469543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20113253593444824, + "step": 19142 + }, + { + "epoch": 0.38288, + "grad_norm": 1.8203125, + "grad_norm_var": 0.0292724609375, + "learning_rate": 0.0001, + "loss": 4.0248, + "loss/crossentropy": 1.767483413219452, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18520043045282364, + "step": 19144 + }, + { + "epoch": 0.38292, + "grad_norm": 2.0625, + "grad_norm_var": 0.005352528889973959, + "learning_rate": 0.0001, + "loss": 4.2516, + "loss/crossentropy": 2.1124974489212036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20909518748521805, + "step": 19146 + }, + { + "epoch": 0.38296, + "grad_norm": 2.046875, + "grad_norm_var": 0.005181630452473958, + "learning_rate": 0.0001, + "loss": 4.3104, + "loss/crossentropy": 2.2959529161453247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2183123677968979, + "step": 19148 + }, + { + "epoch": 0.383, + "grad_norm": 2.0625, + "grad_norm_var": 0.005078125, + "learning_rate": 0.0001, + "loss": 4.3217, + "loss/crossentropy": 2.3522391319274902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22062842547893524, + "step": 19150 + }, + { + "epoch": 0.38304, + "grad_norm": 1.953125, + "grad_norm_var": 0.0048095703125, + "learning_rate": 0.0001, + "loss": 3.7375, + "loss/crossentropy": 1.6673399806022644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16702746599912643, + "step": 19152 + }, + { + "epoch": 0.38308, + "grad_norm": 1.984375, + "grad_norm_var": 0.004073079427083333, + "learning_rate": 0.0001, + "loss": 4.2058, + "loss/crossentropy": 2.021436333656311, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19833282381296158, + "step": 19154 + }, + { + "epoch": 0.38312, + "grad_norm": 2.0, + "grad_norm_var": 0.007439931233723958, + "learning_rate": 0.0001, + "loss": 3.7624, + "loss/crossentropy": 1.4123128056526184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1485099121928215, + "step": 19156 + }, + { + "epoch": 0.38316, + "grad_norm": 1.8125, + "grad_norm_var": 0.009959920247395834, + "learning_rate": 0.0001, + "loss": 4.2364, + "loss/crossentropy": 1.9472095966339111, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16975026577711105, + "step": 19158 + }, + { + "epoch": 0.3832, + "grad_norm": 2.078125, + "grad_norm_var": 0.0094390869140625, + "learning_rate": 0.0001, + "loss": 4.2025, + "loss/crossentropy": 2.112083077430725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2198200300335884, + "step": 19160 + }, + { + "epoch": 0.38324, + "grad_norm": 2.015625, + "grad_norm_var": 0.0121246337890625, + "learning_rate": 0.0001, + "loss": 3.9661, + "loss/crossentropy": 2.1301331520080566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2039676010608673, + "step": 19162 + }, + { + "epoch": 0.38328, + "grad_norm": 2.109375, + "grad_norm_var": 0.019456990559895835, + "learning_rate": 0.0001, + "loss": 4.0655, + "loss/crossentropy": 1.8461318016052246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17811527848243713, + "step": 19164 + }, + { + "epoch": 0.38332, + "grad_norm": 1.984375, + "grad_norm_var": 0.018973795572916667, + "learning_rate": 0.0001, + "loss": 4.0408, + "loss/crossentropy": 2.102015793323517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21166063100099564, + "step": 19166 + }, + { + "epoch": 0.38336, + "grad_norm": 1.8359375, + "grad_norm_var": 0.020167795817057292, + "learning_rate": 0.0001, + "loss": 3.9572, + "loss/crossentropy": 1.9496545791625977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17148377001285553, + "step": 19168 + }, + { + "epoch": 0.3834, + "grad_norm": 1.890625, + "grad_norm_var": 0.020643870035807293, + "learning_rate": 0.0001, + "loss": 4.0675, + "loss/crossentropy": 2.059878885746002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1984856054186821, + "step": 19170 + }, + { + "epoch": 0.38344, + "grad_norm": 1.8359375, + "grad_norm_var": 0.017923990885416668, + "learning_rate": 0.0001, + "loss": 3.6476, + "loss/crossentropy": 1.7966619729995728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17823103070259094, + "step": 19172 + }, + { + "epoch": 0.38348, + "grad_norm": 1.8359375, + "grad_norm_var": 0.01645075480143229, + "learning_rate": 0.0001, + "loss": 3.6671, + "loss/crossentropy": 2.0173062086105347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19681749492883682, + "step": 19174 + }, + { + "epoch": 0.38352, + "grad_norm": 1.7265625, + "grad_norm_var": 0.019108072916666666, + "learning_rate": 0.0001, + "loss": 3.6439, + "loss/crossentropy": 1.6330417394638062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17260564863681793, + "step": 19176 + }, + { + "epoch": 0.38356, + "grad_norm": 1.8984375, + "grad_norm_var": 0.01646728515625, + "learning_rate": 0.0001, + "loss": 3.8957, + "loss/crossentropy": 1.6575458645820618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17257437109947205, + "step": 19178 + }, + { + "epoch": 0.3836, + "grad_norm": 2.0625, + "grad_norm_var": 0.006266021728515625, + "learning_rate": 0.0001, + "loss": 3.8633, + "loss/crossentropy": 2.094203770160675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2039681375026703, + "step": 19180 + }, + { + "epoch": 0.38364, + "grad_norm": 1.9921875, + "grad_norm_var": 0.006306966145833333, + "learning_rate": 0.0001, + "loss": 3.9579, + "loss/crossentropy": 1.9561032056808472, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19183862209320068, + "step": 19182 + }, + { + "epoch": 0.38368, + "grad_norm": 2.609375, + "grad_norm_var": 0.03765055338541667, + "learning_rate": 0.0001, + "loss": 4.4288, + "loss/crossentropy": 1.950503408908844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.198556549847126, + "step": 19184 + }, + { + "epoch": 0.38372, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0398193359375, + "learning_rate": 0.0001, + "loss": 4.2354, + "loss/crossentropy": 2.421600103378296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.208856999874115, + "step": 19186 + }, + { + "epoch": 0.38376, + "grad_norm": 1.96875, + "grad_norm_var": 0.04036356608072917, + "learning_rate": 0.0001, + "loss": 3.9318, + "loss/crossentropy": 1.7086477279663086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19327964633703232, + "step": 19188 + }, + { + "epoch": 0.3838, + "grad_norm": 2.171875, + "grad_norm_var": 0.041071573893229164, + "learning_rate": 0.0001, + "loss": 4.1488, + "loss/crossentropy": 2.1670665740966797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19333725422620773, + "step": 19190 + }, + { + "epoch": 0.38384, + "grad_norm": 1.9609375, + "grad_norm_var": 0.03430582682291667, + "learning_rate": 0.0001, + "loss": 4.3145, + "loss/crossentropy": 2.218670129776001, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20835422724485397, + "step": 19192 + }, + { + "epoch": 0.38388, + "grad_norm": 1.828125, + "grad_norm_var": 0.037699127197265626, + "learning_rate": 0.0001, + "loss": 3.7753, + "loss/crossentropy": 1.909380555152893, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18018050491809845, + "step": 19194 + }, + { + "epoch": 0.38392, + "grad_norm": 1.953125, + "grad_norm_var": 0.038211822509765625, + "learning_rate": 0.0001, + "loss": 4.1049, + "loss/crossentropy": 1.9840248227119446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19120849668979645, + "step": 19196 + }, + { + "epoch": 0.38396, + "grad_norm": 2.265625, + "grad_norm_var": 0.042012532552083336, + "learning_rate": 0.0001, + "loss": 4.4124, + "loss/crossentropy": 2.3527809381484985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20009320974349976, + "step": 19198 + }, + { + "epoch": 0.384, + "grad_norm": 1.96875, + "grad_norm_var": 0.0181793212890625, + "learning_rate": 0.0001, + "loss": 4.1011, + "loss/crossentropy": 2.2111377716064453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19603972136974335, + "step": 19200 + }, + { + "epoch": 0.38404, + "grad_norm": 1.9375, + "grad_norm_var": 0.01685358683268229, + "learning_rate": 0.0001, + "loss": 3.9995, + "loss/crossentropy": 1.6832327842712402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18738068640232086, + "step": 19202 + }, + { + "epoch": 0.38408, + "grad_norm": 2.046875, + "grad_norm_var": 0.015805816650390624, + "learning_rate": 0.0001, + "loss": 3.9789, + "loss/crossentropy": 1.9412715435028076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1835481896996498, + "step": 19204 + }, + { + "epoch": 0.38412, + "grad_norm": 2.046875, + "grad_norm_var": 0.013451131184895833, + "learning_rate": 0.0001, + "loss": 4.203, + "loss/crossentropy": 2.0641706585884094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20165929198265076, + "step": 19206 + }, + { + "epoch": 0.38416, + "grad_norm": 1.9765625, + "grad_norm_var": 0.011400349934895833, + "learning_rate": 0.0001, + "loss": 4.0054, + "loss/crossentropy": 2.2053582668304443, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2108134999871254, + "step": 19208 + }, + { + "epoch": 0.3842, + "grad_norm": 1.828125, + "grad_norm_var": 0.010015614827473958, + "learning_rate": 0.0001, + "loss": 3.8549, + "loss/crossentropy": 2.0534805059432983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19088375568389893, + "step": 19210 + }, + { + "epoch": 0.38424, + "grad_norm": 1.8203125, + "grad_norm_var": 0.010969034830729167, + "learning_rate": 0.0001, + "loss": 4.06, + "loss/crossentropy": 1.8782867789268494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17865915596485138, + "step": 19212 + }, + { + "epoch": 0.38428, + "grad_norm": 2.078125, + "grad_norm_var": 0.0059201558430989586, + "learning_rate": 0.0001, + "loss": 4.0641, + "loss/crossentropy": 1.9773831963539124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20622816681861877, + "step": 19214 + }, + { + "epoch": 0.38432, + "grad_norm": 2.078125, + "grad_norm_var": 0.0067942301432291664, + "learning_rate": 0.0001, + "loss": 4.1479, + "loss/crossentropy": 2.0890401005744934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2173013687133789, + "step": 19216 + }, + { + "epoch": 0.38436, + "grad_norm": 1.90625, + "grad_norm_var": 0.008676910400390625, + "learning_rate": 0.0001, + "loss": 3.7221, + "loss/crossentropy": 1.858555793762207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15820113569498062, + "step": 19218 + }, + { + "epoch": 0.3844, + "grad_norm": 2.078125, + "grad_norm_var": 0.00911865234375, + "learning_rate": 0.0001, + "loss": 3.9924, + "loss/crossentropy": 2.2149851322174072, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2221473827958107, + "step": 19220 + }, + { + "epoch": 0.38444, + "grad_norm": 1.8515625, + "grad_norm_var": 0.009490712483723959, + "learning_rate": 0.0001, + "loss": 3.7057, + "loss/crossentropy": 2.086311161518097, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1791597157716751, + "step": 19222 + }, + { + "epoch": 0.38448, + "grad_norm": 1.90625, + "grad_norm_var": 0.009112294514973958, + "learning_rate": 0.0001, + "loss": 3.9117, + "loss/crossentropy": 1.810127079486847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2018662691116333, + "step": 19224 + }, + { + "epoch": 0.38452, + "grad_norm": 1.984375, + "grad_norm_var": 0.009064737955729167, + "learning_rate": 0.0001, + "loss": 4.3156, + "loss/crossentropy": 2.132863163948059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22217299789190292, + "step": 19226 + }, + { + "epoch": 0.38456, + "grad_norm": 1.8828125, + "grad_norm_var": 0.008299763997395833, + "learning_rate": 0.0001, + "loss": 3.8697, + "loss/crossentropy": 2.0617652535438538, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1968286782503128, + "step": 19228 + }, + { + "epoch": 0.3846, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007250722249348958, + "learning_rate": 0.0001, + "loss": 3.8566, + "loss/crossentropy": 1.9286837577819824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20616496354341507, + "step": 19230 + }, + { + "epoch": 0.38464, + "grad_norm": 2.125, + "grad_norm_var": 0.008150227864583333, + "learning_rate": 0.0001, + "loss": 4.2854, + "loss/crossentropy": 2.009112238883972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19038759917020798, + "step": 19232 + }, + { + "epoch": 0.38468, + "grad_norm": 1.8984375, + "grad_norm_var": 0.006818644205729167, + "learning_rate": 0.0001, + "loss": 4.0879, + "loss/crossentropy": 2.1937737464904785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20164573192596436, + "step": 19234 + }, + { + "epoch": 0.38472, + "grad_norm": 1.9296875, + "grad_norm_var": 0.00587158203125, + "learning_rate": 0.0001, + "loss": 4.2356, + "loss/crossentropy": 1.981977641582489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20445218682289124, + "step": 19236 + }, + { + "epoch": 0.38476, + "grad_norm": 1.9609375, + "grad_norm_var": 0.004648590087890625, + "learning_rate": 0.0001, + "loss": 3.9515, + "loss/crossentropy": 1.6332372426986694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1629931926727295, + "step": 19238 + }, + { + "epoch": 0.3848, + "grad_norm": 1.9375, + "grad_norm_var": 0.003952789306640625, + "learning_rate": 0.0001, + "loss": 3.9061, + "loss/crossentropy": 1.8656854629516602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18546659499406815, + "step": 19240 + }, + { + "epoch": 0.38484, + "grad_norm": 2.09375, + "grad_norm_var": 0.005594635009765625, + "learning_rate": 0.0001, + "loss": 4.1251, + "loss/crossentropy": 2.219391703605652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21327267587184906, + "step": 19242 + }, + { + "epoch": 0.38488, + "grad_norm": 2.171875, + "grad_norm_var": 0.007883453369140625, + "learning_rate": 0.0001, + "loss": 4.448, + "loss/crossentropy": 1.9518468976020813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23419301211833954, + "step": 19244 + }, + { + "epoch": 0.38492, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007411448160807291, + "learning_rate": 0.0001, + "loss": 4.2947, + "loss/crossentropy": 2.190311014652252, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2218230664730072, + "step": 19246 + }, + { + "epoch": 0.38496, + "grad_norm": 1.765625, + "grad_norm_var": 0.009633127848307292, + "learning_rate": 0.0001, + "loss": 3.8475, + "loss/crossentropy": 1.8347881436347961, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20036222785711288, + "step": 19248 + }, + { + "epoch": 0.385, + "grad_norm": 2.0625, + "grad_norm_var": 0.012157185872395834, + "learning_rate": 0.0001, + "loss": 3.9732, + "loss/crossentropy": 1.949516236782074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20307819545269012, + "step": 19250 + }, + { + "epoch": 0.38504, + "grad_norm": 1.9375, + "grad_norm_var": 0.012892405192057291, + "learning_rate": 0.0001, + "loss": 4.007, + "loss/crossentropy": 1.883664846420288, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1825740560889244, + "step": 19252 + }, + { + "epoch": 0.38508, + "grad_norm": 2.140625, + "grad_norm_var": 0.0146881103515625, + "learning_rate": 0.0001, + "loss": 4.1794, + "loss/crossentropy": 2.1245557069778442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20935190469026566, + "step": 19254 + }, + { + "epoch": 0.38512, + "grad_norm": 2.078125, + "grad_norm_var": 0.015166982014973959, + "learning_rate": 0.0001, + "loss": 3.9125, + "loss/crossentropy": 1.546354353427887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15509265661239624, + "step": 19256 + }, + { + "epoch": 0.38516, + "grad_norm": 1.96875, + "grad_norm_var": 0.014290110270182291, + "learning_rate": 0.0001, + "loss": 4.1744, + "loss/crossentropy": 1.979533076286316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20994720607995987, + "step": 19258 + }, + { + "epoch": 0.3852, + "grad_norm": 1.984375, + "grad_norm_var": 0.014074452718098958, + "learning_rate": 0.0001, + "loss": 3.7774, + "loss/crossentropy": 2.125569224357605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21175500005483627, + "step": 19260 + }, + { + "epoch": 0.38524, + "grad_norm": 1.859375, + "grad_norm_var": 0.0124267578125, + "learning_rate": 0.0001, + "loss": 3.911, + "loss/crossentropy": 2.027758777141571, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20255477726459503, + "step": 19262 + }, + { + "epoch": 0.38528, + "grad_norm": 1.84375, + "grad_norm_var": 0.011058553059895834, + "learning_rate": 0.0001, + "loss": 3.4728, + "loss/crossentropy": 1.9937690496444702, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1836088001728058, + "step": 19264 + }, + { + "epoch": 0.38532, + "grad_norm": 1.921875, + "grad_norm_var": 0.00863037109375, + "learning_rate": 0.0001, + "loss": 4.1797, + "loss/crossentropy": 2.0846773386001587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18216369301080704, + "step": 19266 + }, + { + "epoch": 0.38536, + "grad_norm": 1.984375, + "grad_norm_var": 0.008429972330729167, + "learning_rate": 0.0001, + "loss": 3.8911, + "loss/crossentropy": 1.445238471031189, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17925872653722763, + "step": 19268 + }, + { + "epoch": 0.3854, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0061757405598958336, + "learning_rate": 0.0001, + "loss": 4.0615, + "loss/crossentropy": 2.0442845821380615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2024901956319809, + "step": 19270 + }, + { + "epoch": 0.38544, + "grad_norm": 1.7265625, + "grad_norm_var": 0.009144846598307292, + "learning_rate": 0.0001, + "loss": 4.0898, + "loss/crossentropy": 1.9336092472076416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17869817465543747, + "step": 19272 + }, + { + "epoch": 0.38548, + "grad_norm": 2.125, + "grad_norm_var": 0.010884348551432292, + "learning_rate": 0.0001, + "loss": 4.1939, + "loss/crossentropy": 1.815299928188324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.182998888194561, + "step": 19274 + }, + { + "epoch": 0.38552, + "grad_norm": 2.0, + "grad_norm_var": 0.008886464436848958, + "learning_rate": 0.0001, + "loss": 4.2121, + "loss/crossentropy": 2.091490864753723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19843900948762894, + "step": 19276 + }, + { + "epoch": 0.38556, + "grad_norm": 2.03125, + "grad_norm_var": 0.009608713785807292, + "learning_rate": 0.0001, + "loss": 4.098, + "loss/crossentropy": 1.9923794865608215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.186911903321743, + "step": 19278 + }, + { + "epoch": 0.3856, + "grad_norm": 1.828125, + "grad_norm_var": 0.009867350260416666, + "learning_rate": 0.0001, + "loss": 4.0658, + "loss/crossentropy": 1.9323501586914062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1931384950876236, + "step": 19280 + }, + { + "epoch": 0.38564, + "grad_norm": 1.875, + "grad_norm_var": 0.010262044270833333, + "learning_rate": 0.0001, + "loss": 3.8819, + "loss/crossentropy": 1.8469224572181702, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1835423856973648, + "step": 19282 + }, + { + "epoch": 0.38568, + "grad_norm": 1.859375, + "grad_norm_var": 0.010453287760416667, + "learning_rate": 0.0001, + "loss": 3.9027, + "loss/crossentropy": 1.9476045370101929, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17014781385660172, + "step": 19284 + }, + { + "epoch": 0.38572, + "grad_norm": 1.9296875, + "grad_norm_var": 0.010285441080729167, + "learning_rate": 0.0001, + "loss": 3.9943, + "loss/crossentropy": 1.7913609743118286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18661632388830185, + "step": 19286 + }, + { + "epoch": 0.38576, + "grad_norm": 1.8046875, + "grad_norm_var": 0.00760498046875, + "learning_rate": 0.0001, + "loss": 4.0283, + "loss/crossentropy": 1.9682837128639221, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1799641102552414, + "step": 19288 + }, + { + "epoch": 0.3858, + "grad_norm": 1.8984375, + "grad_norm_var": 0.006331125895182292, + "learning_rate": 0.0001, + "loss": 4.3075, + "loss/crossentropy": 2.117633819580078, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.208334282040596, + "step": 19290 + }, + { + "epoch": 0.38584, + "grad_norm": 1.890625, + "grad_norm_var": 0.005761464436848958, + "learning_rate": 0.0001, + "loss": 4.1215, + "loss/crossentropy": 2.1208746433258057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20928698778152466, + "step": 19292 + }, + { + "epoch": 0.38588, + "grad_norm": 1.8046875, + "grad_norm_var": 0.005549112955729167, + "learning_rate": 0.0001, + "loss": 4.0572, + "loss/crossentropy": 1.9424505829811096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18668201565742493, + "step": 19294 + }, + { + "epoch": 0.38592, + "grad_norm": 1.921875, + "grad_norm_var": 0.005277252197265625, + "learning_rate": 0.0001, + "loss": 4.1575, + "loss/crossentropy": 2.251755177974701, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21153685450553894, + "step": 19296 + }, + { + "epoch": 0.38596, + "grad_norm": 2.078125, + "grad_norm_var": 0.007114410400390625, + "learning_rate": 0.0001, + "loss": 4.1885, + "loss/crossentropy": 2.1629436016082764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.206063412129879, + "step": 19298 + }, + { + "epoch": 0.386, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0069732666015625, + "learning_rate": 0.0001, + "loss": 4.1267, + "loss/crossentropy": 2.0333832502365112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1872929260134697, + "step": 19300 + }, + { + "epoch": 0.38604, + "grad_norm": 1.890625, + "grad_norm_var": 0.007641347249348959, + "learning_rate": 0.0001, + "loss": 3.8433, + "loss/crossentropy": 1.8364217281341553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17408161610364914, + "step": 19302 + }, + { + "epoch": 0.38608, + "grad_norm": 1.890625, + "grad_norm_var": 0.005973307291666666, + "learning_rate": 0.0001, + "loss": 3.9175, + "loss/crossentropy": 1.975037932395935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18705998361110687, + "step": 19304 + }, + { + "epoch": 0.38612, + "grad_norm": 1.7890625, + "grad_norm_var": 0.006477864583333334, + "learning_rate": 0.0001, + "loss": 3.7868, + "loss/crossentropy": 1.6889175176620483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16312924772500992, + "step": 19306 + }, + { + "epoch": 0.38616, + "grad_norm": 1.9375, + "grad_norm_var": 0.006392161051432292, + "learning_rate": 0.0001, + "loss": 4.1299, + "loss/crossentropy": 2.2644211053848267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18390889465808868, + "step": 19308 + }, + { + "epoch": 0.3862, + "grad_norm": 2.1875, + "grad_norm_var": 0.009723917643229166, + "learning_rate": 0.0001, + "loss": 4.325, + "loss/crossentropy": 2.180909037590027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20263531804084778, + "step": 19310 + }, + { + "epoch": 0.38624, + "grad_norm": 2.015625, + "grad_norm_var": 0.010138956705729167, + "learning_rate": 0.0001, + "loss": 4.2001, + "loss/crossentropy": 2.4007444381713867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23251044005155563, + "step": 19312 + }, + { + "epoch": 0.38628, + "grad_norm": 2.0, + "grad_norm_var": 0.010534413655598958, + "learning_rate": 0.0001, + "loss": 3.9862, + "loss/crossentropy": 1.8122236728668213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18613354116678238, + "step": 19314 + }, + { + "epoch": 0.38632, + "grad_norm": 1.8828125, + "grad_norm_var": 0.010404459635416667, + "learning_rate": 0.0001, + "loss": 4.1379, + "loss/crossentropy": 2.159746825695038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21711497008800507, + "step": 19316 + }, + { + "epoch": 0.38636, + "grad_norm": 1.953125, + "grad_norm_var": 0.009627024332682291, + "learning_rate": 0.0001, + "loss": 3.9929, + "loss/crossentropy": 2.129119336605072, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19623322039842606, + "step": 19318 + }, + { + "epoch": 0.3864, + "grad_norm": 1.96875, + "grad_norm_var": 0.009934234619140624, + "learning_rate": 0.0001, + "loss": 4.1578, + "loss/crossentropy": 2.322758913040161, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20978064835071564, + "step": 19320 + }, + { + "epoch": 0.38644, + "grad_norm": 2.015625, + "grad_norm_var": 0.010322825113932291, + "learning_rate": 0.0001, + "loss": 4.0266, + "loss/crossentropy": 2.2643767595291138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21338346600532532, + "step": 19322 + }, + { + "epoch": 0.38648, + "grad_norm": 1.8828125, + "grad_norm_var": 0.010731760660807292, + "learning_rate": 0.0001, + "loss": 4.0614, + "loss/crossentropy": 1.8038535118103027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19042538851499557, + "step": 19324 + }, + { + "epoch": 0.38652, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0069620768229166664, + "learning_rate": 0.0001, + "loss": 3.9641, + "loss/crossentropy": 2.0847853422164917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1965423971414566, + "step": 19326 + }, + { + "epoch": 0.38656, + "grad_norm": 2.015625, + "grad_norm_var": 0.007692209879557292, + "learning_rate": 0.0001, + "loss": 4.3347, + "loss/crossentropy": 2.2121591567993164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21294504404067993, + "step": 19328 + }, + { + "epoch": 0.3866, + "grad_norm": 1.8671875, + "grad_norm_var": 0.005985260009765625, + "learning_rate": 0.0001, + "loss": 3.9869, + "loss/crossentropy": 2.100327968597412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20181559026241302, + "step": 19330 + }, + { + "epoch": 0.38664, + "grad_norm": 1.78125, + "grad_norm_var": 0.00750732421875, + "learning_rate": 0.0001, + "loss": 3.642, + "loss/crossentropy": 1.7100898623466492, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18045856803655624, + "step": 19332 + }, + { + "epoch": 0.38668, + "grad_norm": 1.953125, + "grad_norm_var": 0.008337148030598958, + "learning_rate": 0.0001, + "loss": 3.712, + "loss/crossentropy": 1.6435258388519287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1821775734424591, + "step": 19334 + }, + { + "epoch": 0.38672, + "grad_norm": 1.8828125, + "grad_norm_var": 0.008674875895182291, + "learning_rate": 0.0001, + "loss": 3.8334, + "loss/crossentropy": 2.338008165359497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2040715217590332, + "step": 19336 + }, + { + "epoch": 0.38676, + "grad_norm": 1.78125, + "grad_norm_var": 0.008454386393229167, + "learning_rate": 0.0001, + "loss": 3.8911, + "loss/crossentropy": 1.8373408913612366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17516936361789703, + "step": 19338 + }, + { + "epoch": 0.3868, + "grad_norm": 1.921875, + "grad_norm_var": 0.008422597249348959, + "learning_rate": 0.0001, + "loss": 3.9902, + "loss/crossentropy": 2.0346588492393494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20979416370391846, + "step": 19340 + }, + { + "epoch": 0.38684, + "grad_norm": 2.078125, + "grad_norm_var": 0.009399159749348959, + "learning_rate": 0.0001, + "loss": 4.1759, + "loss/crossentropy": 2.083326816558838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20702984929084778, + "step": 19342 + }, + { + "epoch": 0.38688, + "grad_norm": 2.0, + "grad_norm_var": 0.012473297119140626, + "learning_rate": 0.0001, + "loss": 4.611, + "loss/crossentropy": 2.4100319147109985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2617071568965912, + "step": 19344 + }, + { + "epoch": 0.38692, + "grad_norm": 1.96875, + "grad_norm_var": 0.012550608317057291, + "learning_rate": 0.0001, + "loss": 3.8864, + "loss/crossentropy": 1.7147992849349976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.174342080950737, + "step": 19346 + }, + { + "epoch": 0.38696, + "grad_norm": 2.046875, + "grad_norm_var": 0.013602701822916667, + "learning_rate": 0.0001, + "loss": 4.1508, + "loss/crossentropy": 2.041933536529541, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2024158239364624, + "step": 19348 + }, + { + "epoch": 0.387, + "grad_norm": 1.921875, + "grad_norm_var": 0.012434895833333333, + "learning_rate": 0.0001, + "loss": 4.0929, + "loss/crossentropy": 1.835128128528595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1861354559659958, + "step": 19350 + }, + { + "epoch": 0.38704, + "grad_norm": 2.15625, + "grad_norm_var": 0.013370768229166666, + "learning_rate": 0.0001, + "loss": 4.164, + "loss/crossentropy": 1.9472790360450745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19485513865947723, + "step": 19352 + }, + { + "epoch": 0.38708, + "grad_norm": 1.8359375, + "grad_norm_var": 0.012428538004557291, + "learning_rate": 0.0001, + "loss": 4.0217, + "loss/crossentropy": 2.0546197295188904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1883516013622284, + "step": 19354 + }, + { + "epoch": 0.38712, + "grad_norm": 1.8359375, + "grad_norm_var": 0.013169097900390624, + "learning_rate": 0.0001, + "loss": 3.6439, + "loss/crossentropy": 1.8285245299339294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1730484738945961, + "step": 19356 + }, + { + "epoch": 0.38716, + "grad_norm": 1.984375, + "grad_norm_var": 0.013890584309895834, + "learning_rate": 0.0001, + "loss": 4.2169, + "loss/crossentropy": 2.320949673652649, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21517502516508102, + "step": 19358 + }, + { + "epoch": 0.3872, + "grad_norm": 2.03125, + "grad_norm_var": 0.011289215087890625, + "learning_rate": 0.0001, + "loss": 4.0196, + "loss/crossentropy": 2.2944518327713013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20343206077814102, + "step": 19360 + }, + { + "epoch": 0.38724, + "grad_norm": 1.96875, + "grad_norm_var": 0.0112945556640625, + "learning_rate": 0.0001, + "loss": 4.2113, + "loss/crossentropy": 2.134889602661133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19354654103517532, + "step": 19362 + }, + { + "epoch": 0.38728, + "grad_norm": 1.984375, + "grad_norm_var": 0.009639231363932292, + "learning_rate": 0.0001, + "loss": 4.1244, + "loss/crossentropy": 2.206334412097931, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20670340210199356, + "step": 19364 + }, + { + "epoch": 0.38732, + "grad_norm": 2.0, + "grad_norm_var": 0.009769694010416666, + "learning_rate": 0.0001, + "loss": 3.8194, + "loss/crossentropy": 2.1286932229995728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19814791530370712, + "step": 19366 + }, + { + "epoch": 0.38736, + "grad_norm": 1.984375, + "grad_norm_var": 0.0066314697265625, + "learning_rate": 0.0001, + "loss": 4.1533, + "loss/crossentropy": 2.119162678718567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18044983595609665, + "step": 19368 + }, + { + "epoch": 0.3874, + "grad_norm": 1.9453125, + "grad_norm_var": 0.005060831705729167, + "learning_rate": 0.0001, + "loss": 4.0061, + "loss/crossentropy": 2.4316102266311646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22509240359067917, + "step": 19370 + }, + { + "epoch": 0.38744, + "grad_norm": 1.8046875, + "grad_norm_var": 0.0060117085774739586, + "learning_rate": 0.0001, + "loss": 3.9984, + "loss/crossentropy": 2.540325403213501, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22844604402780533, + "step": 19372 + }, + { + "epoch": 0.38748, + "grad_norm": 2.078125, + "grad_norm_var": 0.005782063802083333, + "learning_rate": 0.0001, + "loss": 4.1795, + "loss/crossentropy": 2.3002058267593384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21805168688297272, + "step": 19374 + }, + { + "epoch": 0.38752, + "grad_norm": 2.03125, + "grad_norm_var": 0.0058258056640625, + "learning_rate": 0.0001, + "loss": 4.274, + "loss/crossentropy": 2.2381847500801086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19565512239933014, + "step": 19376 + }, + { + "epoch": 0.38756, + "grad_norm": 2.046875, + "grad_norm_var": 0.0060302734375, + "learning_rate": 0.0001, + "loss": 4.2683, + "loss/crossentropy": 2.101936161518097, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.198813758790493, + "step": 19378 + }, + { + "epoch": 0.3876, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0060943603515625, + "learning_rate": 0.0001, + "loss": 3.8679, + "loss/crossentropy": 2.1088311672210693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21237140893936157, + "step": 19380 + }, + { + "epoch": 0.38764, + "grad_norm": 2.0, + "grad_norm_var": 0.0060943603515625, + "learning_rate": 0.0001, + "loss": 3.9588, + "loss/crossentropy": 2.0985374450683594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18979168683290482, + "step": 19382 + }, + { + "epoch": 0.38768, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005690256754557292, + "learning_rate": 0.0001, + "loss": 4.0798, + "loss/crossentropy": 2.3004449605941772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2053247168660164, + "step": 19384 + }, + { + "epoch": 0.38772, + "grad_norm": 1.9375, + "grad_norm_var": 0.005277252197265625, + "learning_rate": 0.0001, + "loss": 4.0285, + "loss/crossentropy": 2.1257707476615906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22222986072301865, + "step": 19386 + }, + { + "epoch": 0.38776, + "grad_norm": 1.9765625, + "grad_norm_var": 0.003733062744140625, + "learning_rate": 0.0001, + "loss": 4.1336, + "loss/crossentropy": 2.3034613132476807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2174137905240059, + "step": 19388 + }, + { + "epoch": 0.3878, + "grad_norm": 1.7578125, + "grad_norm_var": 0.006245930989583333, + "learning_rate": 0.0001, + "loss": 3.9854, + "loss/crossentropy": 2.1303011178970337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22152414172887802, + "step": 19390 + }, + { + "epoch": 0.38784, + "grad_norm": 2.0, + "grad_norm_var": 0.010412343343098958, + "learning_rate": 0.0001, + "loss": 4.1868, + "loss/crossentropy": 1.8575092554092407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18628983944654465, + "step": 19392 + }, + { + "epoch": 0.38788, + "grad_norm": 2.109375, + "grad_norm_var": 0.011694081624348958, + "learning_rate": 0.0001, + "loss": 4.1219, + "loss/crossentropy": 2.0332913994789124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2499927133321762, + "step": 19394 + }, + { + "epoch": 0.38792, + "grad_norm": 2.125, + "grad_norm_var": 0.013004302978515625, + "learning_rate": 0.0001, + "loss": 4.2275, + "loss/crossentropy": 2.0926660895347595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19264871627092361, + "step": 19396 + }, + { + "epoch": 0.38796, + "grad_norm": 2.21875, + "grad_norm_var": 0.0156005859375, + "learning_rate": 0.0001, + "loss": 4.2337, + "loss/crossentropy": 2.08814400434494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21137948334217072, + "step": 19398 + }, + { + "epoch": 0.388, + "grad_norm": 1.9921875, + "grad_norm_var": 0.019162750244140624, + "learning_rate": 0.0001, + "loss": 4.0263, + "loss/crossentropy": 2.074121594429016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20524942874908447, + "step": 19400 + }, + { + "epoch": 0.38804, + "grad_norm": 1.8671875, + "grad_norm_var": 0.020589192708333332, + "learning_rate": 0.0001, + "loss": 3.9437, + "loss/crossentropy": 1.8818755745887756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1794123500585556, + "step": 19402 + }, + { + "epoch": 0.38808, + "grad_norm": 1.890625, + "grad_norm_var": 0.020992024739583334, + "learning_rate": 0.0001, + "loss": 3.8638, + "loss/crossentropy": 2.5050524473190308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22337081283330917, + "step": 19404 + }, + { + "epoch": 0.38812, + "grad_norm": 2.03125, + "grad_norm_var": 0.0180572509765625, + "learning_rate": 0.0001, + "loss": 4.2158, + "loss/crossentropy": 2.2035861015319824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20753300189971924, + "step": 19406 + }, + { + "epoch": 0.38816, + "grad_norm": 1.90625, + "grad_norm_var": 0.014012654622395834, + "learning_rate": 0.0001, + "loss": 3.8975, + "loss/crossentropy": 2.2270091772079468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20527157932519913, + "step": 19408 + }, + { + "epoch": 0.3882, + "grad_norm": 1.8828125, + "grad_norm_var": 0.013504791259765624, + "learning_rate": 0.0001, + "loss": 3.951, + "loss/crossentropy": 1.6546313762664795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17243140190839767, + "step": 19410 + }, + { + "epoch": 0.38824, + "grad_norm": 1.9140625, + "grad_norm_var": 0.012756093343098959, + "learning_rate": 0.0001, + "loss": 4.0653, + "loss/crossentropy": 1.8858160376548767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18621040880680084, + "step": 19412 + }, + { + "epoch": 0.38828, + "grad_norm": 2.109375, + "grad_norm_var": 0.011875152587890625, + "learning_rate": 0.0001, + "loss": 4.4744, + "loss/crossentropy": 1.9876565337181091, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30138373374938965, + "step": 19414 + }, + { + "epoch": 0.38832, + "grad_norm": 2.1875, + "grad_norm_var": 0.013053131103515626, + "learning_rate": 0.0001, + "loss": 3.7802, + "loss/crossentropy": 2.1887502670288086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1910400092601776, + "step": 19416 + }, + { + "epoch": 0.38836, + "grad_norm": 1.9453125, + "grad_norm_var": 0.012511952718098959, + "learning_rate": 0.0001, + "loss": 3.9398, + "loss/crossentropy": 2.246508002281189, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2018478363752365, + "step": 19418 + }, + { + "epoch": 0.3884, + "grad_norm": 1.8828125, + "grad_norm_var": 0.011510976155598958, + "learning_rate": 0.0001, + "loss": 4.1703, + "loss/crossentropy": 2.041202425956726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1890542358160019, + "step": 19420 + }, + { + "epoch": 0.38844, + "grad_norm": 2.09375, + "grad_norm_var": 0.016904449462890624, + "learning_rate": 0.0001, + "loss": 4.4154, + "loss/crossentropy": 2.0470627546310425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19648576527833939, + "step": 19422 + }, + { + "epoch": 0.38848, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0155181884765625, + "learning_rate": 0.0001, + "loss": 4.1794, + "loss/crossentropy": 2.172736167907715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22087720036506653, + "step": 19424 + }, + { + "epoch": 0.38852, + "grad_norm": 1.8828125, + "grad_norm_var": 0.015811920166015625, + "learning_rate": 0.0001, + "loss": 4.0106, + "loss/crossentropy": 2.0988662242889404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18553303182125092, + "step": 19426 + }, + { + "epoch": 0.38856, + "grad_norm": 1.8359375, + "grad_norm_var": 0.017134348551432293, + "learning_rate": 0.0001, + "loss": 4.1368, + "loss/crossentropy": 1.821892261505127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19670143723487854, + "step": 19428 + }, + { + "epoch": 0.3886, + "grad_norm": 1.875, + "grad_norm_var": 0.016747792561848957, + "learning_rate": 0.0001, + "loss": 3.7878, + "loss/crossentropy": 1.8893300294876099, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18850861489772797, + "step": 19430 + }, + { + "epoch": 0.38864, + "grad_norm": 2.984375, + "grad_norm_var": 0.07888997395833333, + "learning_rate": 0.0001, + "loss": 4.243, + "loss/crossentropy": 2.210429847240448, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17908993363380432, + "step": 19432 + }, + { + "epoch": 0.38868, + "grad_norm": 2.015625, + "grad_norm_var": 0.07731526692708333, + "learning_rate": 0.0001, + "loss": 3.9939, + "loss/crossentropy": 2.164771556854248, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1919529214501381, + "step": 19434 + }, + { + "epoch": 0.38872, + "grad_norm": 1.9609375, + "grad_norm_var": 0.07656631469726563, + "learning_rate": 0.0001, + "loss": 4.1721, + "loss/crossentropy": 2.1335190534591675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2263372465968132, + "step": 19436 + }, + { + "epoch": 0.38876, + "grad_norm": 1.9609375, + "grad_norm_var": 0.073779296875, + "learning_rate": 0.0001, + "loss": 4.06, + "loss/crossentropy": 2.199760317802429, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20560061931610107, + "step": 19438 + }, + { + "epoch": 0.3888, + "grad_norm": 1.8125, + "grad_norm_var": 0.07665608723958334, + "learning_rate": 0.0001, + "loss": 3.9367, + "loss/crossentropy": 2.2948896884918213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2016046792268753, + "step": 19440 + }, + { + "epoch": 0.38884, + "grad_norm": 1.921875, + "grad_norm_var": 0.07588882446289062, + "learning_rate": 0.0001, + "loss": 4.1444, + "loss/crossentropy": 2.120785415172577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18029560148715973, + "step": 19442 + }, + { + "epoch": 0.38888, + "grad_norm": 1.875, + "grad_norm_var": 0.07493260701497396, + "learning_rate": 0.0001, + "loss": 3.8872, + "loss/crossentropy": 2.049036145210266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19056373089551926, + "step": 19444 + }, + { + "epoch": 0.38892, + "grad_norm": 2.078125, + "grad_norm_var": 0.07226130167643229, + "learning_rate": 0.0001, + "loss": 4.2652, + "loss/crossentropy": 2.0076504945755005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20951516926288605, + "step": 19446 + }, + { + "epoch": 0.38896, + "grad_norm": 1.9296875, + "grad_norm_var": 0.006740061442057291, + "learning_rate": 0.0001, + "loss": 3.8178, + "loss/crossentropy": 1.9398083090782166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21217594295740128, + "step": 19448 + }, + { + "epoch": 0.389, + "grad_norm": 1.875, + "grad_norm_var": 0.00546875, + "learning_rate": 0.0001, + "loss": 4.0245, + "loss/crossentropy": 2.0976104736328125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19852425903081894, + "step": 19450 + }, + { + "epoch": 0.38904, + "grad_norm": 7.625, + "grad_norm_var": 2.023509724934896, + "learning_rate": 0.0001, + "loss": 3.9741, + "loss/crossentropy": 1.8561761379241943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17980807274580002, + "step": 19452 + }, + { + "epoch": 0.38908, + "grad_norm": 1.859375, + "grad_norm_var": 2.0183570861816404, + "learning_rate": 0.0001, + "loss": 3.913, + "loss/crossentropy": 2.1107255816459656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20205769687891006, + "step": 19454 + }, + { + "epoch": 0.38912, + "grad_norm": 1.8203125, + "grad_norm_var": 2.009368642171224, + "learning_rate": 0.0001, + "loss": 3.9216, + "loss/crossentropy": 2.018397331237793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18721628934144974, + "step": 19456 + }, + { + "epoch": 0.38916, + "grad_norm": 1.84375, + "grad_norm_var": 2.010087076822917, + "learning_rate": 0.0001, + "loss": 3.8738, + "loss/crossentropy": 2.016912341117859, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20780544728040695, + "step": 19458 + }, + { + "epoch": 0.3892, + "grad_norm": 1.8671875, + "grad_norm_var": 2.0083984375, + "learning_rate": 0.0001, + "loss": 3.9523, + "loss/crossentropy": 1.945086121559143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19544267654418945, + "step": 19460 + }, + { + "epoch": 0.38924, + "grad_norm": 1.9375, + "grad_norm_var": 2.01300048828125, + "learning_rate": 0.0001, + "loss": 4.1854, + "loss/crossentropy": 2.059622883796692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21501117944717407, + "step": 19462 + }, + { + "epoch": 0.38928, + "grad_norm": 1.96875, + "grad_norm_var": 2.024466705322266, + "learning_rate": 0.0001, + "loss": 3.8336, + "loss/crossentropy": 1.8915096521377563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1852220892906189, + "step": 19464 + }, + { + "epoch": 0.38932, + "grad_norm": 2.015625, + "grad_norm_var": 2.0162534077962238, + "learning_rate": 0.0001, + "loss": 4.0612, + "loss/crossentropy": 1.5394552946090698, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16873380541801453, + "step": 19466 + }, + { + "epoch": 0.38936, + "grad_norm": 1.921875, + "grad_norm_var": 0.03472468058268229, + "learning_rate": 0.0001, + "loss": 4.2426, + "loss/crossentropy": 2.0290380716323853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19913609325885773, + "step": 19468 + }, + { + "epoch": 0.3894, + "grad_norm": 2.03125, + "grad_norm_var": 0.033841705322265624, + "learning_rate": 0.0001, + "loss": 4.1607, + "loss/crossentropy": 2.208653211593628, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.190569207072258, + "step": 19470 + }, + { + "epoch": 0.38944, + "grad_norm": 1.9375, + "grad_norm_var": 0.0092041015625, + "learning_rate": 0.0001, + "loss": 4.2646, + "loss/crossentropy": 2.2873799800872803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20589639246463776, + "step": 19472 + }, + { + "epoch": 0.38948, + "grad_norm": 1.953125, + "grad_norm_var": 0.008014933268229166, + "learning_rate": 0.0001, + "loss": 4.1662, + "loss/crossentropy": 2.152024030685425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20272599160671234, + "step": 19474 + }, + { + "epoch": 0.38952, + "grad_norm": 1.96875, + "grad_norm_var": 0.009374745686848958, + "learning_rate": 0.0001, + "loss": 3.9932, + "loss/crossentropy": 2.0079659819602966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19110675901174545, + "step": 19476 + }, + { + "epoch": 0.38956, + "grad_norm": 2.078125, + "grad_norm_var": 0.010081990559895834, + "learning_rate": 0.0001, + "loss": 4.1061, + "loss/crossentropy": 2.3006476163864136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2253967523574829, + "step": 19478 + }, + { + "epoch": 0.3896, + "grad_norm": 1.8359375, + "grad_norm_var": 0.010389963785807291, + "learning_rate": 0.0001, + "loss": 4.0678, + "loss/crossentropy": 2.1429306864738464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20268720388412476, + "step": 19480 + }, + { + "epoch": 0.38964, + "grad_norm": 2.03125, + "grad_norm_var": 0.011417643229166666, + "learning_rate": 0.0001, + "loss": 4.0581, + "loss/crossentropy": 2.260764956474304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2121199518442154, + "step": 19482 + }, + { + "epoch": 0.38968, + "grad_norm": 2.015625, + "grad_norm_var": 0.010636393229166667, + "learning_rate": 0.0001, + "loss": 4.1607, + "loss/crossentropy": 2.3874053955078125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2229148969054222, + "step": 19484 + }, + { + "epoch": 0.38972, + "grad_norm": 2.140625, + "grad_norm_var": 0.009227498372395834, + "learning_rate": 0.0001, + "loss": 4.1456, + "loss/crossentropy": 2.2740933895111084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2509172707796097, + "step": 19486 + }, + { + "epoch": 0.38976, + "grad_norm": 1.921875, + "grad_norm_var": 0.009679921468098958, + "learning_rate": 0.0001, + "loss": 4.1831, + "loss/crossentropy": 2.1949650049209595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19400373846292496, + "step": 19488 + }, + { + "epoch": 0.3898, + "grad_norm": 1.96875, + "grad_norm_var": 0.009549713134765625, + "learning_rate": 0.0001, + "loss": 3.9984, + "loss/crossentropy": 1.4600969552993774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15833207219839096, + "step": 19490 + }, + { + "epoch": 0.38984, + "grad_norm": 2.453125, + "grad_norm_var": 0.019913482666015624, + "learning_rate": 0.0001, + "loss": 4.1694, + "loss/crossentropy": 2.0005985498428345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20003372430801392, + "step": 19492 + }, + { + "epoch": 0.38988, + "grad_norm": 1.84375, + "grad_norm_var": 0.020304107666015626, + "learning_rate": 0.0001, + "loss": 3.8336, + "loss/crossentropy": 1.9609830379486084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19764738529920578, + "step": 19494 + }, + { + "epoch": 0.38992, + "grad_norm": 1.8984375, + "grad_norm_var": 0.01943359375, + "learning_rate": 0.0001, + "loss": 3.8708, + "loss/crossentropy": 1.7853738069534302, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18979863077402115, + "step": 19496 + }, + { + "epoch": 0.38996, + "grad_norm": 1.953125, + "grad_norm_var": 0.018961588541666668, + "learning_rate": 0.0001, + "loss": 4.0335, + "loss/crossentropy": 2.004499912261963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20765355974435806, + "step": 19498 + }, + { + "epoch": 0.39, + "grad_norm": 2.015625, + "grad_norm_var": 0.019870758056640625, + "learning_rate": 0.0001, + "loss": 4.1277, + "loss/crossentropy": 2.0148558020591736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18088079243898392, + "step": 19500 + }, + { + "epoch": 0.39004, + "grad_norm": 2.140625, + "grad_norm_var": 0.022965240478515624, + "learning_rate": 0.0001, + "loss": 4.1213, + "loss/crossentropy": 2.2602895498275757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22686412185430527, + "step": 19502 + }, + { + "epoch": 0.39008, + "grad_norm": 1.9921875, + "grad_norm_var": 0.024326324462890625, + "learning_rate": 0.0001, + "loss": 4.0541, + "loss/crossentropy": 2.1789051294326782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1989632546901703, + "step": 19504 + }, + { + "epoch": 0.39012, + "grad_norm": 2.03125, + "grad_norm_var": 0.024568684895833335, + "learning_rate": 0.0001, + "loss": 3.9234, + "loss/crossentropy": 1.93650484085083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1976509690284729, + "step": 19506 + }, + { + "epoch": 0.39016, + "grad_norm": 1.96875, + "grad_norm_var": 0.008351389567057292, + "learning_rate": 0.0001, + "loss": 3.9519, + "loss/crossentropy": 1.9904287457466125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17271699011325836, + "step": 19508 + }, + { + "epoch": 0.3902, + "grad_norm": 2.03125, + "grad_norm_var": 0.008194986979166667, + "learning_rate": 0.0001, + "loss": 3.7009, + "loss/crossentropy": 1.4440776705741882, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16108233481645584, + "step": 19510 + }, + { + "epoch": 0.39024, + "grad_norm": 1.984375, + "grad_norm_var": 0.008111317952473959, + "learning_rate": 0.0001, + "loss": 4.0218, + "loss/crossentropy": 2.143462061882019, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2024468332529068, + "step": 19512 + }, + { + "epoch": 0.39028, + "grad_norm": 2.03125, + "grad_norm_var": 0.008436838785807291, + "learning_rate": 0.0001, + "loss": 4.1267, + "loss/crossentropy": 2.085771322250366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22135765105485916, + "step": 19514 + }, + { + "epoch": 0.39032, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008337148030598958, + "learning_rate": 0.0001, + "loss": 4.1423, + "loss/crossentropy": 2.251617908477783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2026226669549942, + "step": 19516 + }, + { + "epoch": 0.39036, + "grad_norm": 2.03125, + "grad_norm_var": 0.003885650634765625, + "learning_rate": 0.0001, + "loss": 4.3156, + "loss/crossentropy": 2.3808701038360596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2263621687889099, + "step": 19518 + }, + { + "epoch": 0.3904, + "grad_norm": 1.8203125, + "grad_norm_var": 0.004019927978515625, + "learning_rate": 0.0001, + "loss": 4.0323, + "loss/crossentropy": 2.000342011451721, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20008830726146698, + "step": 19520 + }, + { + "epoch": 0.39044, + "grad_norm": 2.09375, + "grad_norm_var": 0.0044830322265625, + "learning_rate": 0.0001, + "loss": 4.3285, + "loss/crossentropy": 2.242555856704712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2443876415491104, + "step": 19522 + }, + { + "epoch": 0.39048, + "grad_norm": 2.0625, + "grad_norm_var": 0.005060831705729167, + "learning_rate": 0.0001, + "loss": 4.344, + "loss/crossentropy": 2.2867462635040283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21522662043571472, + "step": 19524 + }, + { + "epoch": 0.39052, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0053955078125, + "learning_rate": 0.0001, + "loss": 4.0117, + "loss/crossentropy": 2.1892699003219604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.205935537815094, + "step": 19526 + }, + { + "epoch": 0.39056, + "grad_norm": 1.890625, + "grad_norm_var": 0.007972971598307291, + "learning_rate": 0.0001, + "loss": 3.8673, + "loss/crossentropy": 2.182734966278076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1980958804488182, + "step": 19528 + }, + { + "epoch": 0.3906, + "grad_norm": 1.96875, + "grad_norm_var": 0.0075927734375, + "learning_rate": 0.0001, + "loss": 4.0193, + "loss/crossentropy": 2.3624355792999268, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22833774983882904, + "step": 19530 + }, + { + "epoch": 0.39064, + "grad_norm": 1.875, + "grad_norm_var": 0.007671864827473959, + "learning_rate": 0.0001, + "loss": 4.0497, + "loss/crossentropy": 2.037912607192993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2032359093427658, + "step": 19532 + }, + { + "epoch": 0.39068, + "grad_norm": 2.015625, + "grad_norm_var": 0.007513173421223958, + "learning_rate": 0.0001, + "loss": 3.9812, + "loss/crossentropy": 2.076040804386139, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21829531341791153, + "step": 19534 + }, + { + "epoch": 0.39072, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0058502197265625, + "learning_rate": 0.0001, + "loss": 4.0407, + "loss/crossentropy": 2.2695836424827576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2125643789768219, + "step": 19536 + }, + { + "epoch": 0.39076, + "grad_norm": 2.078125, + "grad_norm_var": 0.0061279296875, + "learning_rate": 0.0001, + "loss": 4.2664, + "loss/crossentropy": 1.9343088269233704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2045460194349289, + "step": 19538 + }, + { + "epoch": 0.3908, + "grad_norm": 1.9609375, + "grad_norm_var": 0.005272420247395834, + "learning_rate": 0.0001, + "loss": 4.0757, + "loss/crossentropy": 1.9968918561935425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17788998782634735, + "step": 19540 + }, + { + "epoch": 0.39084, + "grad_norm": 1.9609375, + "grad_norm_var": 0.005467732747395833, + "learning_rate": 0.0001, + "loss": 4.094, + "loss/crossentropy": 2.157910704612732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19516880810260773, + "step": 19542 + }, + { + "epoch": 0.39088, + "grad_norm": 2.0, + "grad_norm_var": 0.004042307535807292, + "learning_rate": 0.0001, + "loss": 4.1056, + "loss/crossentropy": 2.1784998178482056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20280858874320984, + "step": 19544 + }, + { + "epoch": 0.39092, + "grad_norm": 2.03125, + "grad_norm_var": 0.00438232421875, + "learning_rate": 0.0001, + "loss": 4.0796, + "loss/crossentropy": 2.3444844484329224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2418832629919052, + "step": 19546 + }, + { + "epoch": 0.39096, + "grad_norm": 1.96875, + "grad_norm_var": 0.0038083394368489585, + "learning_rate": 0.0001, + "loss": 3.9802, + "loss/crossentropy": 1.90557062625885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2010379433631897, + "step": 19548 + }, + { + "epoch": 0.391, + "grad_norm": 1.90625, + "grad_norm_var": 0.004107411702473958, + "learning_rate": 0.0001, + "loss": 3.7756, + "loss/crossentropy": 1.4939787983894348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1362278200685978, + "step": 19550 + }, + { + "epoch": 0.39104, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0063517252604166664, + "learning_rate": 0.0001, + "loss": 4.1738, + "loss/crossentropy": 2.037553310394287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2159510999917984, + "step": 19552 + }, + { + "epoch": 0.39108, + "grad_norm": 1.9375, + "grad_norm_var": 0.004816691080729167, + "learning_rate": 0.0001, + "loss": 3.887, + "loss/crossentropy": 1.9987242221832275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19919036328792572, + "step": 19554 + }, + { + "epoch": 0.39112, + "grad_norm": 1.90625, + "grad_norm_var": 0.0048906962076822914, + "learning_rate": 0.0001, + "loss": 4.0298, + "loss/crossentropy": 2.1593196392059326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21370293200016022, + "step": 19556 + }, + { + "epoch": 0.39116, + "grad_norm": 1.9375, + "grad_norm_var": 0.005631510416666667, + "learning_rate": 0.0001, + "loss": 3.8348, + "loss/crossentropy": 1.8323914408683777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20382094383239746, + "step": 19558 + }, + { + "epoch": 0.3912, + "grad_norm": 1.9375, + "grad_norm_var": 0.0053059895833333336, + "learning_rate": 0.0001, + "loss": 4.1138, + "loss/crossentropy": 2.0240999460220337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19374582171440125, + "step": 19560 + }, + { + "epoch": 0.39124, + "grad_norm": 2.0, + "grad_norm_var": 0.005304972330729167, + "learning_rate": 0.0001, + "loss": 4.4349, + "loss/crossentropy": 2.377955436706543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2213267982006073, + "step": 19562 + }, + { + "epoch": 0.39128, + "grad_norm": 2.015625, + "grad_norm_var": 0.006154123942057292, + "learning_rate": 0.0001, + "loss": 4.0878, + "loss/crossentropy": 1.724283754825592, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20119256526231766, + "step": 19564 + }, + { + "epoch": 0.39132, + "grad_norm": 1.953125, + "grad_norm_var": 0.005655670166015625, + "learning_rate": 0.0001, + "loss": 3.9066, + "loss/crossentropy": 2.0352718234062195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19105417281389236, + "step": 19566 + }, + { + "epoch": 0.39136, + "grad_norm": 1.875, + "grad_norm_var": 0.0037913004557291667, + "learning_rate": 0.0001, + "loss": 3.7868, + "loss/crossentropy": 2.078265905380249, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19992397725582123, + "step": 19568 + }, + { + "epoch": 0.3914, + "grad_norm": 1.859375, + "grad_norm_var": 0.004329172770182291, + "learning_rate": 0.0001, + "loss": 3.8276, + "loss/crossentropy": 1.9722577929496765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17697854340076447, + "step": 19570 + }, + { + "epoch": 0.39144, + "grad_norm": 1.9453125, + "grad_norm_var": 0.004915364583333333, + "learning_rate": 0.0001, + "loss": 4.0587, + "loss/crossentropy": 2.2238826751708984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20041973888874054, + "step": 19572 + }, + { + "epoch": 0.39148, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0037923177083333333, + "learning_rate": 0.0001, + "loss": 4.0226, + "loss/crossentropy": 1.8374757170677185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18086174875497818, + "step": 19574 + }, + { + "epoch": 0.39152, + "grad_norm": 2.140625, + "grad_norm_var": 0.00635986328125, + "learning_rate": 0.0001, + "loss": 4.2903, + "loss/crossentropy": 2.0642590522766113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19544780254364014, + "step": 19576 + }, + { + "epoch": 0.39156, + "grad_norm": 2.421875, + "grad_norm_var": 0.020493316650390624, + "learning_rate": 0.0001, + "loss": 4.1231, + "loss/crossentropy": 2.090702533721924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19859656691551208, + "step": 19578 + }, + { + "epoch": 0.3916, + "grad_norm": 2.1875, + "grad_norm_var": 0.023538970947265626, + "learning_rate": 0.0001, + "loss": 3.9075, + "loss/crossentropy": 1.8087154030799866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1918642893433571, + "step": 19580 + }, + { + "epoch": 0.39164, + "grad_norm": 1.9140625, + "grad_norm_var": 0.023527018229166665, + "learning_rate": 0.0001, + "loss": 3.8869, + "loss/crossentropy": 2.0180618166923523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19190336763858795, + "step": 19582 + }, + { + "epoch": 0.39168, + "grad_norm": 2.171875, + "grad_norm_var": 0.025886027018229167, + "learning_rate": 0.0001, + "loss": 4.2298, + "loss/crossentropy": 1.980249285697937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20591172575950623, + "step": 19584 + }, + { + "epoch": 0.39172, + "grad_norm": 1.8359375, + "grad_norm_var": 0.026374308268229167, + "learning_rate": 0.0001, + "loss": 3.9883, + "loss/crossentropy": 2.1208410263061523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20714347064495087, + "step": 19586 + }, + { + "epoch": 0.39176, + "grad_norm": 2.046875, + "grad_norm_var": 0.024179840087890626, + "learning_rate": 0.0001, + "loss": 4.3101, + "loss/crossentropy": 2.351140856742859, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26494763791561127, + "step": 19588 + }, + { + "epoch": 0.3918, + "grad_norm": 2.125, + "grad_norm_var": 0.024448394775390625, + "learning_rate": 0.0001, + "loss": 4.1259, + "loss/crossentropy": 1.9821222424507141, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2062402218580246, + "step": 19590 + }, + { + "epoch": 0.39184, + "grad_norm": 2.015625, + "grad_norm_var": 0.02462158203125, + "learning_rate": 0.0001, + "loss": 4.1373, + "loss/crossentropy": 2.363589644432068, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22435829043388367, + "step": 19592 + }, + { + "epoch": 0.39188, + "grad_norm": 1.984375, + "grad_norm_var": 0.013963826497395833, + "learning_rate": 0.0001, + "loss": 4.1845, + "loss/crossentropy": 1.925455391407013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17338209599256516, + "step": 19594 + }, + { + "epoch": 0.39192, + "grad_norm": 2.109375, + "grad_norm_var": 0.012254842122395833, + "learning_rate": 0.0001, + "loss": 3.9777, + "loss/crossentropy": 2.176175117492676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21482165157794952, + "step": 19596 + }, + { + "epoch": 0.39196, + "grad_norm": 1.9609375, + "grad_norm_var": 0.014330037434895833, + "learning_rate": 0.0001, + "loss": 3.9266, + "loss/crossentropy": 2.0013960003852844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20629072189331055, + "step": 19598 + }, + { + "epoch": 0.392, + "grad_norm": 1.859375, + "grad_norm_var": 0.011797841389973958, + "learning_rate": 0.0001, + "loss": 3.9402, + "loss/crossentropy": 2.066355049610138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22155777364969254, + "step": 19600 + }, + { + "epoch": 0.39204, + "grad_norm": 1.8359375, + "grad_norm_var": 0.010809071858723958, + "learning_rate": 0.0001, + "loss": 4.0319, + "loss/crossentropy": 2.0513144731521606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19442218542099, + "step": 19602 + }, + { + "epoch": 0.39208, + "grad_norm": 2.046875, + "grad_norm_var": 0.010896809895833333, + "learning_rate": 0.0001, + "loss": 4.2601, + "loss/crossentropy": 1.9635959267616272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20168906450271606, + "step": 19604 + }, + { + "epoch": 0.39212, + "grad_norm": 1.890625, + "grad_norm_var": 0.009025065104166667, + "learning_rate": 0.0001, + "loss": 3.9406, + "loss/crossentropy": 1.9395795464515686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17990338802337646, + "step": 19606 + }, + { + "epoch": 0.39216, + "grad_norm": 2.125, + "grad_norm_var": 0.011004384358723958, + "learning_rate": 0.0001, + "loss": 4.1936, + "loss/crossentropy": 2.111960232257843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20662778615951538, + "step": 19608 + }, + { + "epoch": 0.3922, + "grad_norm": 1.8125, + "grad_norm_var": 0.011466471354166667, + "learning_rate": 0.0001, + "loss": 3.9238, + "loss/crossentropy": 1.855182707309723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19665290415287018, + "step": 19610 + }, + { + "epoch": 0.39224, + "grad_norm": 2.0, + "grad_norm_var": 0.0156494140625, + "learning_rate": 0.0001, + "loss": 4.262, + "loss/crossentropy": 2.1587076783180237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20080996304750443, + "step": 19612 + }, + { + "epoch": 0.39228, + "grad_norm": 2.015625, + "grad_norm_var": 0.01597874959309896, + "learning_rate": 0.0001, + "loss": 4.1227, + "loss/crossentropy": 2.130104422569275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19484283030033112, + "step": 19614 + }, + { + "epoch": 0.39232, + "grad_norm": 1.75, + "grad_norm_var": 0.017992146809895835, + "learning_rate": 0.0001, + "loss": 3.9046, + "loss/crossentropy": 1.6983461380004883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18040503561496735, + "step": 19616 + }, + { + "epoch": 0.39236, + "grad_norm": 1.953125, + "grad_norm_var": 0.017756144205729168, + "learning_rate": 0.0001, + "loss": 4.0141, + "loss/crossentropy": 2.0672999024391174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20455461740493774, + "step": 19618 + }, + { + "epoch": 0.3924, + "grad_norm": 1.84375, + "grad_norm_var": 0.018202463785807293, + "learning_rate": 0.0001, + "loss": 3.83, + "loss/crossentropy": 2.0494508743286133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18307264149188995, + "step": 19620 + }, + { + "epoch": 0.39244, + "grad_norm": 1.875, + "grad_norm_var": 0.018277740478515624, + "learning_rate": 0.0001, + "loss": 3.9901, + "loss/crossentropy": 1.683717966079712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16951359808444977, + "step": 19622 + }, + { + "epoch": 0.39248, + "grad_norm": 1.859375, + "grad_norm_var": 0.015197499593098959, + "learning_rate": 0.0001, + "loss": 3.8383, + "loss/crossentropy": 2.62760066986084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22649705410003662, + "step": 19624 + }, + { + "epoch": 0.39252, + "grad_norm": 2.296875, + "grad_norm_var": 0.024072011311848957, + "learning_rate": 0.0001, + "loss": 4.19, + "loss/crossentropy": 1.857836663722992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1827384978532791, + "step": 19626 + }, + { + "epoch": 0.39256, + "grad_norm": 2.0, + "grad_norm_var": 0.016108957926432292, + "learning_rate": 0.0001, + "loss": 3.7635, + "loss/crossentropy": 2.1203905940055847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20033711194992065, + "step": 19628 + }, + { + "epoch": 0.3926, + "grad_norm": 2.125, + "grad_norm_var": 0.01721165974934896, + "learning_rate": 0.0001, + "loss": 4.4474, + "loss/crossentropy": 2.378189444541931, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20892338454723358, + "step": 19630 + }, + { + "epoch": 0.39264, + "grad_norm": 1.9140625, + "grad_norm_var": 0.015819295247395834, + "learning_rate": 0.0001, + "loss": 4.0897, + "loss/crossentropy": 1.896558940410614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18873175233602524, + "step": 19632 + }, + { + "epoch": 0.39268, + "grad_norm": 2.109375, + "grad_norm_var": 0.020798492431640624, + "learning_rate": 0.0001, + "loss": 4.3611, + "loss/crossentropy": 2.3503568172454834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23128806054592133, + "step": 19634 + }, + { + "epoch": 0.39272, + "grad_norm": 1.7578125, + "grad_norm_var": 0.021345011393229165, + "learning_rate": 0.0001, + "loss": 4.0515, + "loss/crossentropy": 2.081954002380371, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20059917122125626, + "step": 19636 + }, + { + "epoch": 0.39276, + "grad_norm": 2.015625, + "grad_norm_var": 0.021512858072916665, + "learning_rate": 0.0001, + "loss": 4.0986, + "loss/crossentropy": 1.9573910236358643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1903771460056305, + "step": 19638 + }, + { + "epoch": 0.3928, + "grad_norm": 1.859375, + "grad_norm_var": 0.022025299072265626, + "learning_rate": 0.0001, + "loss": 3.9333, + "loss/crossentropy": 2.098384141921997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19905343651771545, + "step": 19640 + }, + { + "epoch": 0.39284, + "grad_norm": 2.0, + "grad_norm_var": 0.015892537434895833, + "learning_rate": 0.0001, + "loss": 4.0854, + "loss/crossentropy": 1.8510947227478027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18955568969249725, + "step": 19642 + }, + { + "epoch": 0.39288, + "grad_norm": 1.9453125, + "grad_norm_var": 0.016440582275390626, + "learning_rate": 0.0001, + "loss": 4.2948, + "loss/crossentropy": 2.2839618921279907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23230211436748505, + "step": 19644 + }, + { + "epoch": 0.39292, + "grad_norm": 1.953125, + "grad_norm_var": 0.014778391520182291, + "learning_rate": 0.0001, + "loss": 4.0112, + "loss/crossentropy": 1.8055492639541626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20515839755535126, + "step": 19646 + }, + { + "epoch": 0.39296, + "grad_norm": 1.9765625, + "grad_norm_var": 0.013348134358723958, + "learning_rate": 0.0001, + "loss": 4.0556, + "loss/crossentropy": 1.6756115555763245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17216521501541138, + "step": 19648 + }, + { + "epoch": 0.393, + "grad_norm": 1.984375, + "grad_norm_var": 0.007287343343098958, + "learning_rate": 0.0001, + "loss": 4.4001, + "loss/crossentropy": 2.3247755765914917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21122244000434875, + "step": 19650 + }, + { + "epoch": 0.39304, + "grad_norm": 1.9296875, + "grad_norm_var": 0.004044596354166667, + "learning_rate": 0.0001, + "loss": 3.895, + "loss/crossentropy": 1.9654970169067383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20018797367811203, + "step": 19652 + }, + { + "epoch": 0.39308, + "grad_norm": 1.96875, + "grad_norm_var": 0.0029042561848958335, + "learning_rate": 0.0001, + "loss": 3.9018, + "loss/crossentropy": 1.9368168115615845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19478464126586914, + "step": 19654 + }, + { + "epoch": 0.39312, + "grad_norm": 2.03125, + "grad_norm_var": 0.0023671468098958332, + "learning_rate": 0.0001, + "loss": 3.7869, + "loss/crossentropy": 2.099206328392029, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20060646533966064, + "step": 19656 + }, + { + "epoch": 0.39316, + "grad_norm": 1.890625, + "grad_norm_var": 0.002561187744140625, + "learning_rate": 0.0001, + "loss": 4.3438, + "loss/crossentropy": 2.1585338711738586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20852376520633698, + "step": 19658 + }, + { + "epoch": 0.3932, + "grad_norm": 1.796875, + "grad_norm_var": 0.0044830322265625, + "learning_rate": 0.0001, + "loss": 4.0533, + "loss/crossentropy": 2.285220742225647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1983182057738304, + "step": 19660 + }, + { + "epoch": 0.39324, + "grad_norm": 2.03125, + "grad_norm_var": 0.0047190348307291664, + "learning_rate": 0.0001, + "loss": 4.0046, + "loss/crossentropy": 1.955579936504364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21165720373392105, + "step": 19662 + }, + { + "epoch": 0.39328, + "grad_norm": 1.8046875, + "grad_norm_var": 0.00645751953125, + "learning_rate": 0.0001, + "loss": 3.823, + "loss/crossentropy": 1.9925233721733093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18789846450090408, + "step": 19664 + }, + { + "epoch": 0.39332, + "grad_norm": 1.984375, + "grad_norm_var": 0.006490071614583333, + "learning_rate": 0.0001, + "loss": 4.0123, + "loss/crossentropy": 1.9267281293869019, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19545376300811768, + "step": 19666 + }, + { + "epoch": 0.39336, + "grad_norm": 1.9453125, + "grad_norm_var": 0.006493123372395834, + "learning_rate": 0.0001, + "loss": 4.0401, + "loss/crossentropy": 2.179477632045746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19430068880319595, + "step": 19668 + }, + { + "epoch": 0.3934, + "grad_norm": 2.046875, + "grad_norm_var": 0.007228342692057291, + "learning_rate": 0.0001, + "loss": 4.1508, + "loss/crossentropy": 2.1411179900169373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2276124805212021, + "step": 19670 + }, + { + "epoch": 0.39344, + "grad_norm": 2.3125, + "grad_norm_var": 0.014438629150390625, + "learning_rate": 0.0001, + "loss": 4.2689, + "loss/crossentropy": 2.085427463054657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21856296062469482, + "step": 19672 + }, + { + "epoch": 0.39348, + "grad_norm": 2.09375, + "grad_norm_var": 0.014495595296223959, + "learning_rate": 0.0001, + "loss": 4.0302, + "loss/crossentropy": 1.7399682402610779, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16918571293354034, + "step": 19674 + }, + { + "epoch": 0.39352, + "grad_norm": 1.9296875, + "grad_norm_var": 0.011356608072916666, + "learning_rate": 0.0001, + "loss": 4.0404, + "loss/crossentropy": 1.8636209964752197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17654836922883987, + "step": 19676 + }, + { + "epoch": 0.39356, + "grad_norm": 1.7421875, + "grad_norm_var": 0.015290323893229167, + "learning_rate": 0.0001, + "loss": 4.0157, + "loss/crossentropy": 2.143825590610504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2125091552734375, + "step": 19678 + }, + { + "epoch": 0.3936, + "grad_norm": 1.6953125, + "grad_norm_var": 0.01873753865559896, + "learning_rate": 0.0001, + "loss": 3.7692, + "loss/crossentropy": 1.8222747445106506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17713025212287903, + "step": 19680 + }, + { + "epoch": 0.39364, + "grad_norm": 1.8515625, + "grad_norm_var": 0.01969172159830729, + "learning_rate": 0.0001, + "loss": 3.8254, + "loss/crossentropy": 1.7044820189476013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17669418454170227, + "step": 19682 + }, + { + "epoch": 0.39368, + "grad_norm": 1.953125, + "grad_norm_var": 0.020411936442057292, + "learning_rate": 0.0001, + "loss": 4.0987, + "loss/crossentropy": 2.144485831260681, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20580272376537323, + "step": 19684 + }, + { + "epoch": 0.39372, + "grad_norm": 1.984375, + "grad_norm_var": 0.019774373372395834, + "learning_rate": 0.0001, + "loss": 4.0994, + "loss/crossentropy": 2.17536997795105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21704821288585663, + "step": 19686 + }, + { + "epoch": 0.39376, + "grad_norm": 1.9453125, + "grad_norm_var": 0.012963612874348959, + "learning_rate": 0.0001, + "loss": 4.2592, + "loss/crossentropy": 2.1875303983688354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1996757537126541, + "step": 19688 + }, + { + "epoch": 0.3938, + "grad_norm": 2.125, + "grad_norm_var": 0.014121246337890626, + "learning_rate": 0.0001, + "loss": 4.1943, + "loss/crossentropy": 2.270558476448059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23406681418418884, + "step": 19690 + }, + { + "epoch": 0.39384, + "grad_norm": 1.8671875, + "grad_norm_var": 0.015860748291015626, + "learning_rate": 0.0001, + "loss": 4.2097, + "loss/crossentropy": 2.0614060163497925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21256985515356064, + "step": 19692 + }, + { + "epoch": 0.39388, + "grad_norm": 1.9765625, + "grad_norm_var": 0.015091705322265624, + "learning_rate": 0.0001, + "loss": 4.042, + "loss/crossentropy": 2.1036725640296936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19555094093084335, + "step": 19694 + }, + { + "epoch": 0.39392, + "grad_norm": 1.8828125, + "grad_norm_var": 0.010396321614583334, + "learning_rate": 0.0001, + "loss": 3.7702, + "loss/crossentropy": 1.8713775277137756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1973343789577484, + "step": 19696 + }, + { + "epoch": 0.39396, + "grad_norm": 1.921875, + "grad_norm_var": 0.009411366780598958, + "learning_rate": 0.0001, + "loss": 3.9581, + "loss/crossentropy": 1.9986143708229065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1925104334950447, + "step": 19698 + }, + { + "epoch": 0.394, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0090484619140625, + "learning_rate": 0.0001, + "loss": 3.8134, + "loss/crossentropy": 1.8336694836616516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18453969806432724, + "step": 19700 + }, + { + "epoch": 0.39404, + "grad_norm": 1.9375, + "grad_norm_var": 0.01065673828125, + "learning_rate": 0.0001, + "loss": 3.9279, + "loss/crossentropy": 1.8948233723640442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17115242034196854, + "step": 19702 + }, + { + "epoch": 0.39408, + "grad_norm": 1.984375, + "grad_norm_var": 0.008958943684895833, + "learning_rate": 0.0001, + "loss": 4.244, + "loss/crossentropy": 2.5506935119628906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22056522965431213, + "step": 19704 + }, + { + "epoch": 0.39412, + "grad_norm": 1.984375, + "grad_norm_var": 0.02890625, + "learning_rate": 0.0001, + "loss": 4.1832, + "loss/crossentropy": 2.127421021461487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20175430178642273, + "step": 19706 + }, + { + "epoch": 0.39416, + "grad_norm": 1.9765625, + "grad_norm_var": 0.027197265625, + "learning_rate": 0.0001, + "loss": 4.0251, + "loss/crossentropy": 2.246693968772888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22178302705287933, + "step": 19708 + }, + { + "epoch": 0.3942, + "grad_norm": 1.6953125, + "grad_norm_var": 0.03328221638997396, + "learning_rate": 0.0001, + "loss": 3.9336, + "loss/crossentropy": 1.970005750656128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18967190384864807, + "step": 19710 + }, + { + "epoch": 0.39424, + "grad_norm": 1.921875, + "grad_norm_var": 0.032956695556640624, + "learning_rate": 0.0001, + "loss": 4.1035, + "loss/crossentropy": 1.944337785243988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1952769234776497, + "step": 19712 + }, + { + "epoch": 0.39428, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0328277587890625, + "learning_rate": 0.0001, + "loss": 4.2289, + "loss/crossentropy": 2.291213870048523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2077646553516388, + "step": 19714 + }, + { + "epoch": 0.39432, + "grad_norm": 2.015625, + "grad_norm_var": 0.03361790974934896, + "learning_rate": 0.0001, + "loss": 4.0433, + "loss/crossentropy": 2.404030203819275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2200116366147995, + "step": 19716 + }, + { + "epoch": 0.39436, + "grad_norm": 2.109375, + "grad_norm_var": 0.032364908854166666, + "learning_rate": 0.0001, + "loss": 3.8609, + "loss/crossentropy": 1.9242625832557678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19318503141403198, + "step": 19718 + }, + { + "epoch": 0.3944, + "grad_norm": 1.890625, + "grad_norm_var": 0.03240966796875, + "learning_rate": 0.0001, + "loss": 3.9789, + "loss/crossentropy": 2.319575071334839, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21668671071529388, + "step": 19720 + }, + { + "epoch": 0.39444, + "grad_norm": 1.9921875, + "grad_norm_var": 0.012287394205729166, + "learning_rate": 0.0001, + "loss": 4.2219, + "loss/crossentropy": 2.0032835006713867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19786667823791504, + "step": 19722 + }, + { + "epoch": 0.39448, + "grad_norm": 1.7734375, + "grad_norm_var": 0.0146636962890625, + "learning_rate": 0.0001, + "loss": 3.9509, + "loss/crossentropy": 1.9524562358856201, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18965402245521545, + "step": 19724 + }, + { + "epoch": 0.39452, + "grad_norm": 1.890625, + "grad_norm_var": 0.007521311442057292, + "learning_rate": 0.0001, + "loss": 3.5677, + "loss/crossentropy": 1.5887231826782227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1512351706624031, + "step": 19726 + }, + { + "epoch": 0.39456, + "grad_norm": 2.046875, + "grad_norm_var": 0.008576456705729167, + "learning_rate": 0.0001, + "loss": 4.1379, + "loss/crossentropy": 2.107685923576355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18645642697811127, + "step": 19728 + }, + { + "epoch": 0.3946, + "grad_norm": 1.8828125, + "grad_norm_var": 0.009028879801432292, + "learning_rate": 0.0001, + "loss": 3.9594, + "loss/crossentropy": 2.3551766872406006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2207053080201149, + "step": 19730 + }, + { + "epoch": 0.39464, + "grad_norm": 2.0625, + "grad_norm_var": 0.010847727457682291, + "learning_rate": 0.0001, + "loss": 4.3542, + "loss/crossentropy": 2.1983554363250732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23001667857170105, + "step": 19732 + }, + { + "epoch": 0.39468, + "grad_norm": 2.03125, + "grad_norm_var": 0.009639485677083334, + "learning_rate": 0.0001, + "loss": 4.3158, + "loss/crossentropy": 2.0236815214157104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19230765849351883, + "step": 19734 + }, + { + "epoch": 0.39472, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010529581705729167, + "learning_rate": 0.0001, + "loss": 4.0511, + "loss/crossentropy": 2.2027645111083984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1864710971713066, + "step": 19736 + }, + { + "epoch": 0.39476, + "grad_norm": 2.15625, + "grad_norm_var": 0.013152821858723959, + "learning_rate": 0.0001, + "loss": 4.008, + "loss/crossentropy": 1.921549379825592, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21788031607866287, + "step": 19738 + }, + { + "epoch": 0.3948, + "grad_norm": 1.953125, + "grad_norm_var": 0.010416666666666666, + "learning_rate": 0.0001, + "loss": 3.8353, + "loss/crossentropy": 2.121790587902069, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21380367130041122, + "step": 19740 + }, + { + "epoch": 0.39484, + "grad_norm": 1.9140625, + "grad_norm_var": 0.008217112223307291, + "learning_rate": 0.0001, + "loss": 3.9975, + "loss/crossentropy": 1.9595564007759094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17919503152370453, + "step": 19742 + }, + { + "epoch": 0.39488, + "grad_norm": 2.140625, + "grad_norm_var": 0.010993448893229167, + "learning_rate": 0.0001, + "loss": 3.9935, + "loss/crossentropy": 2.155984342098236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2002984657883644, + "step": 19744 + }, + { + "epoch": 0.39492, + "grad_norm": 1.7890625, + "grad_norm_var": 0.012589518229166667, + "learning_rate": 0.0001, + "loss": 3.875, + "loss/crossentropy": 1.766005277633667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1717890352010727, + "step": 19746 + }, + { + "epoch": 0.39496, + "grad_norm": 2.015625, + "grad_norm_var": 0.010424550374348958, + "learning_rate": 0.0001, + "loss": 3.9178, + "loss/crossentropy": 1.8891428112983704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1965286061167717, + "step": 19748 + }, + { + "epoch": 0.395, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010343170166015625, + "learning_rate": 0.0001, + "loss": 4.3124, + "loss/crossentropy": 1.9521282315254211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21058151125907898, + "step": 19750 + }, + { + "epoch": 0.39504, + "grad_norm": 1.8515625, + "grad_norm_var": 0.014711252848307292, + "learning_rate": 0.0001, + "loss": 4.2242, + "loss/crossentropy": 2.3142699003219604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24145027250051498, + "step": 19752 + }, + { + "epoch": 0.39508, + "grad_norm": 1.9375, + "grad_norm_var": 0.011777496337890625, + "learning_rate": 0.0001, + "loss": 4.1059, + "loss/crossentropy": 2.0375224351882935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19370558112859726, + "step": 19754 + }, + { + "epoch": 0.39512, + "grad_norm": 1.9140625, + "grad_norm_var": 0.011962890625, + "learning_rate": 0.0001, + "loss": 4.2367, + "loss/crossentropy": 2.193490743637085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2012534961104393, + "step": 19756 + }, + { + "epoch": 0.39516, + "grad_norm": 2.015625, + "grad_norm_var": 0.011993153889973959, + "learning_rate": 0.0001, + "loss": 4.0781, + "loss/crossentropy": 2.2397992610931396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19738413393497467, + "step": 19758 + }, + { + "epoch": 0.3952, + "grad_norm": 1.90625, + "grad_norm_var": 0.010223134358723959, + "learning_rate": 0.0001, + "loss": 3.9153, + "loss/crossentropy": 1.7602161169052124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18599014729261398, + "step": 19760 + }, + { + "epoch": 0.39524, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008640289306640625, + "learning_rate": 0.0001, + "loss": 4.182, + "loss/crossentropy": 2.536779046058655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2301289290189743, + "step": 19762 + }, + { + "epoch": 0.39528, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008965810139973959, + "learning_rate": 0.0001, + "loss": 3.9999, + "loss/crossentropy": 2.3237764835357666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21476060152053833, + "step": 19764 + }, + { + "epoch": 0.39532, + "grad_norm": 2.0625, + "grad_norm_var": 0.011107381184895833, + "learning_rate": 0.0001, + "loss": 4.2102, + "loss/crossentropy": 2.370723605155945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24277979880571365, + "step": 19766 + }, + { + "epoch": 0.39536, + "grad_norm": 2.09375, + "grad_norm_var": 0.006917063395182292, + "learning_rate": 0.0001, + "loss": 4.0838, + "loss/crossentropy": 1.6399320363998413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1740519106388092, + "step": 19768 + }, + { + "epoch": 0.3954, + "grad_norm": 1.8828125, + "grad_norm_var": 0.02394383748372396, + "learning_rate": 0.0001, + "loss": 3.6633, + "loss/crossentropy": 1.8633801341056824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1810051053762436, + "step": 19770 + }, + { + "epoch": 0.39544, + "grad_norm": 1.953125, + "grad_norm_var": 0.023583984375, + "learning_rate": 0.0001, + "loss": 4.1102, + "loss/crossentropy": 2.318315625190735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21214767545461655, + "step": 19772 + }, + { + "epoch": 0.39548, + "grad_norm": 1.9921875, + "grad_norm_var": 0.026387532552083332, + "learning_rate": 0.0001, + "loss": 4.1232, + "loss/crossentropy": 2.2803520560264587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21136894822120667, + "step": 19774 + }, + { + "epoch": 0.39552, + "grad_norm": 1.8359375, + "grad_norm_var": 0.026387532552083332, + "learning_rate": 0.0001, + "loss": 3.9034, + "loss/crossentropy": 2.1529780626296997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18172463029623032, + "step": 19776 + }, + { + "epoch": 0.39556, + "grad_norm": 2.015625, + "grad_norm_var": 0.025585683186848958, + "learning_rate": 0.0001, + "loss": 4.2851, + "loss/crossentropy": 2.1246083974838257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2181987464427948, + "step": 19778 + }, + { + "epoch": 0.3956, + "grad_norm": 1.90625, + "grad_norm_var": 0.029515584309895832, + "learning_rate": 0.0001, + "loss": 4.0097, + "loss/crossentropy": 2.027298629283905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1877325028181076, + "step": 19780 + }, + { + "epoch": 0.39564, + "grad_norm": 2.0, + "grad_norm_var": 0.0288482666015625, + "learning_rate": 0.0001, + "loss": 4.2326, + "loss/crossentropy": 1.9338072538375854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1746418997645378, + "step": 19782 + }, + { + "epoch": 0.39568, + "grad_norm": 2.1875, + "grad_norm_var": 0.03104222615559896, + "learning_rate": 0.0001, + "loss": 4.2551, + "loss/crossentropy": 2.050130307674408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2011866271495819, + "step": 19784 + }, + { + "epoch": 0.39572, + "grad_norm": 1.9765625, + "grad_norm_var": 0.014720662434895834, + "learning_rate": 0.0001, + "loss": 4.3227, + "loss/crossentropy": 2.48315691947937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22591856867074966, + "step": 19786 + }, + { + "epoch": 0.39576, + "grad_norm": 2.046875, + "grad_norm_var": 0.021019490559895833, + "learning_rate": 0.0001, + "loss": 4.3671, + "loss/crossentropy": 2.578279137611389, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2216324657201767, + "step": 19788 + }, + { + "epoch": 0.3958, + "grad_norm": 1.8828125, + "grad_norm_var": 0.018192291259765625, + "learning_rate": 0.0001, + "loss": 4.259, + "loss/crossentropy": 1.7862395644187927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17009516060352325, + "step": 19790 + }, + { + "epoch": 0.39584, + "grad_norm": 1.8828125, + "grad_norm_var": 0.017032877604166666, + "learning_rate": 0.0001, + "loss": 3.8506, + "loss/crossentropy": 2.042613208293915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1844482272863388, + "step": 19792 + }, + { + "epoch": 0.39588, + "grad_norm": 1.7890625, + "grad_norm_var": 0.020926666259765626, + "learning_rate": 0.0001, + "loss": 4.1919, + "loss/crossentropy": 2.317251443862915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2135745733976364, + "step": 19794 + }, + { + "epoch": 0.39592, + "grad_norm": 2.109375, + "grad_norm_var": 0.01964111328125, + "learning_rate": 0.0001, + "loss": 4.0061, + "loss/crossentropy": 1.849199891090393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20549577474594116, + "step": 19796 + }, + { + "epoch": 0.39596, + "grad_norm": 2.015625, + "grad_norm_var": 0.0199859619140625, + "learning_rate": 0.0001, + "loss": 4.0551, + "loss/crossentropy": 2.1876507997512817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20098386704921722, + "step": 19798 + }, + { + "epoch": 0.396, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01718317667643229, + "learning_rate": 0.0001, + "loss": 4.154, + "loss/crossentropy": 1.9740530848503113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19494586437940598, + "step": 19800 + }, + { + "epoch": 0.39604, + "grad_norm": 1.9375, + "grad_norm_var": 0.017577107747395834, + "learning_rate": 0.0001, + "loss": 4.1405, + "loss/crossentropy": 2.208943724632263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22833283245563507, + "step": 19802 + }, + { + "epoch": 0.39608, + "grad_norm": 1.96875, + "grad_norm_var": 0.009488677978515625, + "learning_rate": 0.0001, + "loss": 3.9082, + "loss/crossentropy": 2.024399518966675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19786083698272705, + "step": 19804 + }, + { + "epoch": 0.39612, + "grad_norm": 2.03125, + "grad_norm_var": 0.009356435139973958, + "learning_rate": 0.0001, + "loss": 4.1475, + "loss/crossentropy": 2.1283441185951233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21942409127950668, + "step": 19806 + }, + { + "epoch": 0.39616, + "grad_norm": 1.953125, + "grad_norm_var": 0.01080322265625, + "learning_rate": 0.0001, + "loss": 4.0101, + "loss/crossentropy": 2.3675626516342163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22466859221458435, + "step": 19808 + }, + { + "epoch": 0.3962, + "grad_norm": 2.078125, + "grad_norm_var": 0.0082672119140625, + "learning_rate": 0.0001, + "loss": 3.8682, + "loss/crossentropy": 1.886322796344757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1904991939663887, + "step": 19810 + }, + { + "epoch": 0.39624, + "grad_norm": 2.03125, + "grad_norm_var": 0.005012003580729166, + "learning_rate": 0.0001, + "loss": 3.9071, + "loss/crossentropy": 2.098154127597809, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2065964937210083, + "step": 19812 + }, + { + "epoch": 0.39628, + "grad_norm": 1.8515625, + "grad_norm_var": 0.006788889567057292, + "learning_rate": 0.0001, + "loss": 3.8447, + "loss/crossentropy": 2.0527199506759644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19717100262641907, + "step": 19814 + }, + { + "epoch": 0.39632, + "grad_norm": 2.03125, + "grad_norm_var": 0.011244455973307291, + "learning_rate": 0.0001, + "loss": 4.2692, + "loss/crossentropy": 2.0650912523269653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2021411582827568, + "step": 19816 + }, + { + "epoch": 0.39636, + "grad_norm": 2.34375, + "grad_norm_var": 0.018700154622395833, + "learning_rate": 0.0001, + "loss": 4.3611, + "loss/crossentropy": 2.2239702939987183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19744005054235458, + "step": 19818 + }, + { + "epoch": 0.3964, + "grad_norm": 1.9375, + "grad_norm_var": 0.020369211832682293, + "learning_rate": 0.0001, + "loss": 3.9029, + "loss/crossentropy": 2.123336434364319, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18976306170225143, + "step": 19820 + }, + { + "epoch": 0.39644, + "grad_norm": 1.9296875, + "grad_norm_var": 0.02154515584309896, + "learning_rate": 0.0001, + "loss": 3.9966, + "loss/crossentropy": 2.121657133102417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2159118503332138, + "step": 19822 + }, + { + "epoch": 0.39648, + "grad_norm": 2.078125, + "grad_norm_var": 0.0185455322265625, + "learning_rate": 0.0001, + "loss": 3.7621, + "loss/crossentropy": 1.7323416471481323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18072299659252167, + "step": 19824 + }, + { + "epoch": 0.39652, + "grad_norm": 2.21875, + "grad_norm_var": 0.020243072509765626, + "learning_rate": 0.0001, + "loss": 4.0183, + "loss/crossentropy": 2.181841015815735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21452204138040543, + "step": 19826 + }, + { + "epoch": 0.39656, + "grad_norm": 1.9296875, + "grad_norm_var": 0.02104670206705729, + "learning_rate": 0.0001, + "loss": 4.1181, + "loss/crossentropy": 1.9022215008735657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18924792110919952, + "step": 19828 + }, + { + "epoch": 0.3966, + "grad_norm": 1.984375, + "grad_norm_var": 0.018317667643229167, + "learning_rate": 0.0001, + "loss": 4.2505, + "loss/crossentropy": 2.1240022778511047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18618234246969223, + "step": 19830 + }, + { + "epoch": 0.39664, + "grad_norm": 1.9921875, + "grad_norm_var": 0.015710194905598957, + "learning_rate": 0.0001, + "loss": 3.8611, + "loss/crossentropy": 1.5959683060646057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17317308485507965, + "step": 19832 + }, + { + "epoch": 0.39668, + "grad_norm": 1.9765625, + "grad_norm_var": 0.007682291666666666, + "learning_rate": 0.0001, + "loss": 4.1281, + "loss/crossentropy": 2.1812866926193237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.192174032330513, + "step": 19834 + }, + { + "epoch": 0.39672, + "grad_norm": 1.953125, + "grad_norm_var": 0.010651652018229167, + "learning_rate": 0.0001, + "loss": 4.188, + "loss/crossentropy": 1.8964568972587585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20922592282295227, + "step": 19836 + }, + { + "epoch": 0.39676, + "grad_norm": 1.8359375, + "grad_norm_var": 0.012013498942057292, + "learning_rate": 0.0001, + "loss": 4.1369, + "loss/crossentropy": 2.0822665691375732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18019618093967438, + "step": 19838 + }, + { + "epoch": 0.3968, + "grad_norm": 1.9453125, + "grad_norm_var": 0.010985310872395833, + "learning_rate": 0.0001, + "loss": 4.0271, + "loss/crossentropy": 2.116270899772644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21604043990373611, + "step": 19840 + }, + { + "epoch": 0.39684, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007759602864583334, + "learning_rate": 0.0001, + "loss": 3.9132, + "loss/crossentropy": 1.7761988639831543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20087965577840805, + "step": 19842 + }, + { + "epoch": 0.39688, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008649698893229167, + "learning_rate": 0.0001, + "loss": 4.0187, + "loss/crossentropy": 1.6091360449790955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18282026052474976, + "step": 19844 + }, + { + "epoch": 0.39692, + "grad_norm": 1.8828125, + "grad_norm_var": 0.009547678629557292, + "learning_rate": 0.0001, + "loss": 4.4441, + "loss/crossentropy": 2.6394678354263306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20579150319099426, + "step": 19846 + }, + { + "epoch": 0.39696, + "grad_norm": 2.0, + "grad_norm_var": 0.010011545817057292, + "learning_rate": 0.0001, + "loss": 4.1778, + "loss/crossentropy": 2.0911704897880554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20075388252735138, + "step": 19848 + }, + { + "epoch": 0.397, + "grad_norm": 1.9765625, + "grad_norm_var": 0.010139719645182291, + "learning_rate": 0.0001, + "loss": 4.3458, + "loss/crossentropy": 2.1299456357955933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2067631408572197, + "step": 19850 + }, + { + "epoch": 0.39704, + "grad_norm": 1.875, + "grad_norm_var": 0.004937489827473958, + "learning_rate": 0.0001, + "loss": 3.9037, + "loss/crossentropy": 1.9630563855171204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.187811940908432, + "step": 19852 + }, + { + "epoch": 0.39708, + "grad_norm": 1.90625, + "grad_norm_var": 0.004808553059895833, + "learning_rate": 0.0001, + "loss": 4.1681, + "loss/crossentropy": 2.1914453506469727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2064526304602623, + "step": 19854 + }, + { + "epoch": 0.39712, + "grad_norm": 2.046875, + "grad_norm_var": 0.012412261962890626, + "learning_rate": 0.0001, + "loss": 4.1404, + "loss/crossentropy": 2.3003474473953247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20841488242149353, + "step": 19856 + }, + { + "epoch": 0.39716, + "grad_norm": 1.84375, + "grad_norm_var": 0.01682306925455729, + "learning_rate": 0.0001, + "loss": 3.96, + "loss/crossentropy": 2.1625255346298218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20437289774417877, + "step": 19858 + }, + { + "epoch": 0.3972, + "grad_norm": 1.9765625, + "grad_norm_var": 0.015868123372395834, + "learning_rate": 0.0001, + "loss": 3.8594, + "loss/crossentropy": 1.6742416620254517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16787201166152954, + "step": 19860 + }, + { + "epoch": 0.39724, + "grad_norm": 1.921875, + "grad_norm_var": 0.01546630859375, + "learning_rate": 0.0001, + "loss": 3.8889, + "loss/crossentropy": 2.0682146549224854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1715894490480423, + "step": 19862 + }, + { + "epoch": 0.39728, + "grad_norm": 1.96875, + "grad_norm_var": 0.017071278889973958, + "learning_rate": 0.0001, + "loss": 4.0319, + "loss/crossentropy": 1.99592924118042, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19570383429527283, + "step": 19864 + }, + { + "epoch": 0.39732, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01967137654622396, + "learning_rate": 0.0001, + "loss": 4.0123, + "loss/crossentropy": 2.086844265460968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2123936414718628, + "step": 19866 + }, + { + "epoch": 0.39736, + "grad_norm": 1.9921875, + "grad_norm_var": 0.017256673177083334, + "learning_rate": 0.0001, + "loss": 4.2245, + "loss/crossentropy": 1.7985658645629883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18117651343345642, + "step": 19868 + }, + { + "epoch": 0.3974, + "grad_norm": 1.9140625, + "grad_norm_var": 0.018949381510416665, + "learning_rate": 0.0001, + "loss": 3.8449, + "loss/crossentropy": 1.9804646372795105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18682067096233368, + "step": 19870 + }, + { + "epoch": 0.39744, + "grad_norm": 1.796875, + "grad_norm_var": 0.01546630859375, + "learning_rate": 0.0001, + "loss": 4.0692, + "loss/crossentropy": 1.9027678966522217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18195898830890656, + "step": 19872 + }, + { + "epoch": 0.39748, + "grad_norm": 1.9296875, + "grad_norm_var": 0.01660944620768229, + "learning_rate": 0.0001, + "loss": 4.1191, + "loss/crossentropy": 2.3370686769485474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2376203015446663, + "step": 19874 + }, + { + "epoch": 0.39752, + "grad_norm": 1.890625, + "grad_norm_var": 0.018363189697265626, + "learning_rate": 0.0001, + "loss": 3.8472, + "loss/crossentropy": 1.7463279366493225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16485755145549774, + "step": 19876 + }, + { + "epoch": 0.39756, + "grad_norm": 2.015625, + "grad_norm_var": 0.017967732747395833, + "learning_rate": 0.0001, + "loss": 4.4039, + "loss/crossentropy": 2.1171544194221497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20631500333547592, + "step": 19878 + }, + { + "epoch": 0.3976, + "grad_norm": 1.84375, + "grad_norm_var": 0.016733551025390626, + "learning_rate": 0.0001, + "loss": 3.7242, + "loss/crossentropy": 2.1109176874160767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18512246757745743, + "step": 19880 + }, + { + "epoch": 0.39764, + "grad_norm": 1.9375, + "grad_norm_var": 0.013618977864583333, + "learning_rate": 0.0001, + "loss": 3.6843, + "loss/crossentropy": 1.927691638469696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19118069112300873, + "step": 19882 + }, + { + "epoch": 0.39768, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01337890625, + "learning_rate": 0.0001, + "loss": 3.9539, + "loss/crossentropy": 2.041896104812622, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21281379461288452, + "step": 19884 + }, + { + "epoch": 0.39772, + "grad_norm": 1.7890625, + "grad_norm_var": 0.013272857666015625, + "learning_rate": 0.0001, + "loss": 3.9146, + "loss/crossentropy": 1.8779407739639282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.180915005505085, + "step": 19886 + }, + { + "epoch": 0.39776, + "grad_norm": 1.9296875, + "grad_norm_var": 0.019001261393229166, + "learning_rate": 0.0001, + "loss": 4.2158, + "loss/crossentropy": 1.9899646639823914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19102467596530914, + "step": 19888 + }, + { + "epoch": 0.3978, + "grad_norm": 2.0, + "grad_norm_var": 0.012562815348307292, + "learning_rate": 0.0001, + "loss": 4.1822, + "loss/crossentropy": 2.3294299840927124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20739557594060898, + "step": 19890 + }, + { + "epoch": 0.39784, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011864980061848959, + "learning_rate": 0.0001, + "loss": 4.2829, + "loss/crossentropy": 2.2126539945602417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23066286742687225, + "step": 19892 + }, + { + "epoch": 0.39788, + "grad_norm": 2.1875, + "grad_norm_var": 0.016454060872395832, + "learning_rate": 0.0001, + "loss": 3.8665, + "loss/crossentropy": 1.9949330687522888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19172564148902893, + "step": 19894 + }, + { + "epoch": 0.39792, + "grad_norm": 2.0625, + "grad_norm_var": 0.02158177693684896, + "learning_rate": 0.0001, + "loss": 3.9567, + "loss/crossentropy": 2.0896310210227966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19692128896713257, + "step": 19896 + }, + { + "epoch": 0.39796, + "grad_norm": 1.921875, + "grad_norm_var": 0.020310211181640624, + "learning_rate": 0.0001, + "loss": 4.1856, + "loss/crossentropy": 2.191789746284485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2073511779308319, + "step": 19898 + }, + { + "epoch": 0.398, + "grad_norm": 1.921875, + "grad_norm_var": 0.022415924072265624, + "learning_rate": 0.0001, + "loss": 3.9965, + "loss/crossentropy": 1.9925153255462646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20123805850744247, + "step": 19900 + }, + { + "epoch": 0.39804, + "grad_norm": 1.8125, + "grad_norm_var": 0.021476236979166667, + "learning_rate": 0.0001, + "loss": 3.7917, + "loss/crossentropy": 1.8903232216835022, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18577788770198822, + "step": 19902 + }, + { + "epoch": 0.39808, + "grad_norm": 2.078125, + "grad_norm_var": 0.01817804972330729, + "learning_rate": 0.0001, + "loss": 4.2818, + "loss/crossentropy": 2.055288314819336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19745147973299026, + "step": 19904 + }, + { + "epoch": 0.39812, + "grad_norm": 2.0625, + "grad_norm_var": 0.018570709228515624, + "learning_rate": 0.0001, + "loss": 4.196, + "loss/crossentropy": 1.7709746360778809, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17147859930992126, + "step": 19906 + }, + { + "epoch": 0.39816, + "grad_norm": 2.0, + "grad_norm_var": 0.01843846638997396, + "learning_rate": 0.0001, + "loss": 4.182, + "loss/crossentropy": 2.3295364379882812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20680927485227585, + "step": 19908 + }, + { + "epoch": 0.3982, + "grad_norm": 1.9765625, + "grad_norm_var": 0.014414215087890625, + "learning_rate": 0.0001, + "loss": 4.1633, + "loss/crossentropy": 1.9828099608421326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.202943354845047, + "step": 19910 + }, + { + "epoch": 0.39824, + "grad_norm": 2.046875, + "grad_norm_var": 0.009352366129557291, + "learning_rate": 0.0001, + "loss": 4.2294, + "loss/crossentropy": 2.140046715736389, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1967788189649582, + "step": 19912 + }, + { + "epoch": 0.39828, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008736165364583333, + "learning_rate": 0.0001, + "loss": 4.1673, + "loss/crossentropy": 2.3365899324417114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2261614426970482, + "step": 19914 + }, + { + "epoch": 0.39832, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0075642903645833336, + "learning_rate": 0.0001, + "loss": 3.8716, + "loss/crossentropy": 2.101326584815979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19668899476528168, + "step": 19916 + }, + { + "epoch": 0.39836, + "grad_norm": 1.96875, + "grad_norm_var": 0.005037180582682292, + "learning_rate": 0.0001, + "loss": 4.1572, + "loss/crossentropy": 1.9804238080978394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2117360234260559, + "step": 19918 + }, + { + "epoch": 0.3984, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0037676493326822915, + "learning_rate": 0.0001, + "loss": 3.945, + "loss/crossentropy": 1.858399510383606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20013604313135147, + "step": 19920 + }, + { + "epoch": 0.39844, + "grad_norm": 1.90625, + "grad_norm_var": 0.005832672119140625, + "learning_rate": 0.0001, + "loss": 4.0302, + "loss/crossentropy": 2.087529957294464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19001878052949905, + "step": 19922 + }, + { + "epoch": 0.39848, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0068267822265625, + "learning_rate": 0.0001, + "loss": 3.9425, + "loss/crossentropy": 2.2521530389785767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20182852447032928, + "step": 19924 + }, + { + "epoch": 0.39852, + "grad_norm": 2.109375, + "grad_norm_var": 0.00784912109375, + "learning_rate": 0.0001, + "loss": 4.3362, + "loss/crossentropy": 2.370557188987732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27234383672475815, + "step": 19926 + }, + { + "epoch": 0.39856, + "grad_norm": 13.0625, + "grad_norm_var": 7.6871192932128904, + "learning_rate": 0.0001, + "loss": 4.0383, + "loss/crossentropy": 2.18235445022583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20694056898355484, + "step": 19928 + }, + { + "epoch": 0.3986, + "grad_norm": 2.125, + "grad_norm_var": 7.659780883789063, + "learning_rate": 0.0001, + "loss": 3.8946, + "loss/crossentropy": 1.9220272898674011, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21540776640176773, + "step": 19930 + }, + { + "epoch": 0.39864, + "grad_norm": 2.015625, + "grad_norm_var": 7.6551513671875, + "learning_rate": 0.0001, + "loss": 4.015, + "loss/crossentropy": 2.064575970172882, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20172829926013947, + "step": 19932 + }, + { + "epoch": 0.39868, + "grad_norm": 2.296875, + "grad_norm_var": 7.620402018229167, + "learning_rate": 0.0001, + "loss": 4.5782, + "loss/crossentropy": 2.1824090480804443, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23097260296344757, + "step": 19934 + }, + { + "epoch": 0.39872, + "grad_norm": 2.0, + "grad_norm_var": 7.594489542643229, + "learning_rate": 0.0001, + "loss": 3.9227, + "loss/crossentropy": 2.211961567401886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2150636538863182, + "step": 19936 + }, + { + "epoch": 0.39876, + "grad_norm": 2.4375, + "grad_norm_var": 7.586128743489583, + "learning_rate": 0.0001, + "loss": 4.1724, + "loss/crossentropy": 1.8789254426956177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18855369836091995, + "step": 19938 + }, + { + "epoch": 0.3988, + "grad_norm": 2.03125, + "grad_norm_var": 7.576968383789063, + "learning_rate": 0.0001, + "loss": 4.212, + "loss/crossentropy": 2.322842240333557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19432100653648376, + "step": 19940 + }, + { + "epoch": 0.39884, + "grad_norm": 1.9375, + "grad_norm_var": 7.590311686197917, + "learning_rate": 0.0001, + "loss": 4.2201, + "loss/crossentropy": 2.1524535417556763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19931253790855408, + "step": 19942 + }, + { + "epoch": 0.39888, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0280670166015625, + "learning_rate": 0.0001, + "loss": 4.0166, + "loss/crossentropy": 2.0835599303245544, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18674355000257492, + "step": 19944 + }, + { + "epoch": 0.39892, + "grad_norm": 1.96875, + "grad_norm_var": 0.026718902587890624, + "learning_rate": 0.0001, + "loss": 3.9398, + "loss/crossentropy": 1.8204763531684875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1970055252313614, + "step": 19946 + }, + { + "epoch": 0.39896, + "grad_norm": 1.9375, + "grad_norm_var": 0.02769953409830729, + "learning_rate": 0.0001, + "loss": 3.8403, + "loss/crossentropy": 2.1432100534439087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1960032731294632, + "step": 19948 + }, + { + "epoch": 0.399, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0227294921875, + "learning_rate": 0.0001, + "loss": 4.1183, + "loss/crossentropy": 1.9959533214569092, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18608924746513367, + "step": 19950 + }, + { + "epoch": 0.39904, + "grad_norm": 2.015625, + "grad_norm_var": 0.02399266560872396, + "learning_rate": 0.0001, + "loss": 4.0235, + "loss/crossentropy": 1.9874342679977417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18600600212812424, + "step": 19952 + }, + { + "epoch": 0.39908, + "grad_norm": 1.9296875, + "grad_norm_var": 0.00849609375, + "learning_rate": 0.0001, + "loss": 4.2266, + "loss/crossentropy": 2.5082361698150635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22386416047811508, + "step": 19954 + }, + { + "epoch": 0.39912, + "grad_norm": 1.90625, + "grad_norm_var": 0.008153279622395834, + "learning_rate": 0.0001, + "loss": 3.9735, + "loss/crossentropy": 2.1550523042678833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2169199213385582, + "step": 19956 + }, + { + "epoch": 0.39916, + "grad_norm": 1.9609375, + "grad_norm_var": 0.008172353108723959, + "learning_rate": 0.0001, + "loss": 3.8834, + "loss/crossentropy": 2.079386830329895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19080224633216858, + "step": 19958 + }, + { + "epoch": 0.3992, + "grad_norm": 1.8828125, + "grad_norm_var": 0.006030019124348958, + "learning_rate": 0.0001, + "loss": 3.973, + "loss/crossentropy": 1.8762348890304565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1813136711716652, + "step": 19960 + }, + { + "epoch": 0.39924, + "grad_norm": 1.84375, + "grad_norm_var": 0.006455230712890625, + "learning_rate": 0.0001, + "loss": 4.0965, + "loss/crossentropy": 1.789110004901886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17166541516780853, + "step": 19962 + }, + { + "epoch": 0.39928, + "grad_norm": 1.859375, + "grad_norm_var": 0.009056599934895833, + "learning_rate": 0.0001, + "loss": 4.0912, + "loss/crossentropy": 2.0196239948272705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20005135238170624, + "step": 19964 + }, + { + "epoch": 0.39932, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0080718994140625, + "learning_rate": 0.0001, + "loss": 4.2787, + "loss/crossentropy": 2.192206025123596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1979481726884842, + "step": 19966 + }, + { + "epoch": 0.39936, + "grad_norm": 2.0625, + "grad_norm_var": 0.007157135009765625, + "learning_rate": 0.0001, + "loss": 4.3155, + "loss/crossentropy": 2.263342499732971, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2187722995877266, + "step": 19968 + }, + { + "epoch": 0.3994, + "grad_norm": 2.109375, + "grad_norm_var": 0.008161417643229167, + "learning_rate": 0.0001, + "loss": 4.2727, + "loss/crossentropy": 2.3387625217437744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21375280618667603, + "step": 19970 + }, + { + "epoch": 0.39944, + "grad_norm": 1.96875, + "grad_norm_var": 0.008733876546223958, + "learning_rate": 0.0001, + "loss": 4.0442, + "loss/crossentropy": 1.5837730765342712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17303457856178284, + "step": 19972 + }, + { + "epoch": 0.39948, + "grad_norm": 1.984375, + "grad_norm_var": 0.008678944905598958, + "learning_rate": 0.0001, + "loss": 3.8398, + "loss/crossentropy": 1.9090858697891235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1838761642575264, + "step": 19974 + }, + { + "epoch": 0.39952, + "grad_norm": 1.875, + "grad_norm_var": 0.008780924479166667, + "learning_rate": 0.0001, + "loss": 3.9345, + "loss/crossentropy": 2.107620596885681, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19132380187511444, + "step": 19976 + }, + { + "epoch": 0.39956, + "grad_norm": 1.8671875, + "grad_norm_var": 0.009200032552083333, + "learning_rate": 0.0001, + "loss": 4.0709, + "loss/crossentropy": 2.1706109046936035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19275012612342834, + "step": 19978 + }, + { + "epoch": 0.3996, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007356516520182292, + "learning_rate": 0.0001, + "loss": 3.8486, + "loss/crossentropy": 2.0092907547950745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18047627061605453, + "step": 19980 + }, + { + "epoch": 0.39964, + "grad_norm": 2.046875, + "grad_norm_var": 0.025349934895833332, + "learning_rate": 0.0001, + "loss": 4.2226, + "loss/crossentropy": 2.1216511726379395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20591023564338684, + "step": 19982 + }, + { + "epoch": 0.39968, + "grad_norm": 2.046875, + "grad_norm_var": 0.02671076456705729, + "learning_rate": 0.0001, + "loss": 3.9895, + "loss/crossentropy": 2.3002817630767822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20734255760908127, + "step": 19984 + }, + { + "epoch": 0.39972, + "grad_norm": 2.09375, + "grad_norm_var": 0.02664972941080729, + "learning_rate": 0.0001, + "loss": 4.1494, + "loss/crossentropy": 2.3097801208496094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20194757729768753, + "step": 19986 + }, + { + "epoch": 0.39976, + "grad_norm": 2.15625, + "grad_norm_var": 0.029130045572916666, + "learning_rate": 0.0001, + "loss": 3.9706, + "loss/crossentropy": 2.2645580768585205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21866093575954437, + "step": 19988 + }, + { + "epoch": 0.3998, + "grad_norm": 1.875, + "grad_norm_var": 0.03004150390625, + "learning_rate": 0.0001, + "loss": 4.0634, + "loss/crossentropy": 2.224185347557068, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19011163711547852, + "step": 19990 + }, + { + "epoch": 0.39984, + "grad_norm": 1.859375, + "grad_norm_var": 0.030248006184895832, + "learning_rate": 0.0001, + "loss": 3.9552, + "loss/crossentropy": 1.958588182926178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18359722197055817, + "step": 19992 + }, + { + "epoch": 0.39988, + "grad_norm": 1.9375, + "grad_norm_var": 0.029361724853515625, + "learning_rate": 0.0001, + "loss": 4.0449, + "loss/crossentropy": 2.5081194639205933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22576630860567093, + "step": 19994 + }, + { + "epoch": 0.39992, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0306549072265625, + "learning_rate": 0.0001, + "loss": 3.8833, + "loss/crossentropy": 1.9870144724845886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19262682646512985, + "step": 19996 + }, + { + "epoch": 0.39996, + "grad_norm": 1.828125, + "grad_norm_var": 0.0118560791015625, + "learning_rate": 0.0001, + "loss": 4.1321, + "loss/crossentropy": 2.227096438407898, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2056111991405487, + "step": 19998 + }, + { + "epoch": 0.4, + "grad_norm": 1.796875, + "grad_norm_var": 0.012837473551432292, + "learning_rate": 0.0001, + "loss": 4.1659, + "loss/crossentropy": 1.8458907008171082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18857256323099136, + "step": 20000 + } + ], + "logging_steps": 2, + "max_steps": 50000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.08295310655488e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}