diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,9998 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 1424, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.08667389329222297, + "learning_rate": 5e-05, + "loss": 1.2996, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 0.09045727907955421, + "learning_rate": 5e-05, + "loss": 1.2954, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 0.09563085455184005, + "learning_rate": 5e-05, + "loss": 1.3904, + "step": 3 + }, + { + "epoch": 0.01, + "grad_norm": 0.09509304597101893, + "learning_rate": 5e-05, + "loss": 1.3129, + "step": 4 + }, + { + "epoch": 0.01, + "grad_norm": 0.11253528120530865, + "learning_rate": 5e-05, + "loss": 1.3848, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 0.10089982798628681, + "learning_rate": 5e-05, + "loss": 1.4346, + "step": 6 + }, + { + "epoch": 0.01, + "grad_norm": 0.10089306432221856, + "learning_rate": 5e-05, + "loss": 1.2888, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 0.10144502034709778, + "learning_rate": 5e-05, + "loss": 1.4566, + "step": 8 + }, + { + "epoch": 0.01, + "grad_norm": 0.0961164132224369, + "learning_rate": 5e-05, + "loss": 1.3295, + "step": 9 + }, + { + "epoch": 0.01, + "grad_norm": 0.09776728005057, + "learning_rate": 5e-05, + "loss": 1.3634, + "step": 10 + }, + { + "epoch": 0.02, + "grad_norm": 0.08499071966520562, + "learning_rate": 5e-05, + "loss": 1.2536, + "step": 11 + }, + { + "epoch": 0.02, + "grad_norm": 0.10825720796344761, + "learning_rate": 5e-05, + "loss": 1.3407, + "step": 12 + }, + { + "epoch": 0.02, + "grad_norm": 0.07801898249414914, + "learning_rate": 5e-05, + "loss": 1.3189, + "step": 13 + }, + { + "epoch": 0.02, + "grad_norm": 0.07574794344316574, + "learning_rate": 5e-05, + "loss": 1.211, + "step": 14 + }, + { + "epoch": 0.02, + "grad_norm": 0.24315419692813534, + "learning_rate": 5e-05, + "loss": 1.3227, + "step": 15 + }, + { + "epoch": 0.02, + "grad_norm": 0.08425261542212245, + "learning_rate": 5e-05, + "loss": 1.2089, + "step": 16 + }, + { + "epoch": 0.02, + "grad_norm": 0.08183851026470287, + "learning_rate": 5e-05, + "loss": 1.2836, + "step": 17 + }, + { + "epoch": 0.03, + "grad_norm": 0.09223487431428871, + "learning_rate": 5e-05, + "loss": 1.2208, + "step": 18 + }, + { + "epoch": 0.03, + "grad_norm": 0.11228483070641347, + "learning_rate": 5e-05, + "loss": 1.2418, + "step": 19 + }, + { + "epoch": 0.03, + "grad_norm": 0.08879742195116058, + "learning_rate": 5e-05, + "loss": 1.2513, + "step": 20 + }, + { + "epoch": 0.03, + "grad_norm": 0.09483580915314582, + "learning_rate": 5e-05, + "loss": 1.3948, + "step": 21 + }, + { + "epoch": 0.03, + "grad_norm": 0.08603858239218602, + "learning_rate": 5e-05, + "loss": 1.351, + "step": 22 + }, + { + "epoch": 0.03, + "grad_norm": 0.08992366856768315, + "learning_rate": 5e-05, + "loss": 1.3144, + "step": 23 + }, + { + "epoch": 0.03, + "grad_norm": 0.08406961509231002, + "learning_rate": 5e-05, + "loss": 1.3585, + "step": 24 + }, + { + "epoch": 0.04, + "grad_norm": 0.08228049509215797, + "learning_rate": 5e-05, + "loss": 1.326, + "step": 25 + }, + { + "epoch": 0.04, + "grad_norm": 0.07592322737164406, + "learning_rate": 5e-05, + "loss": 1.3401, + "step": 26 + }, + { + "epoch": 0.04, + "grad_norm": 0.07112576203109268, + "learning_rate": 5e-05, + "loss": 1.1088, + 
"step": 27 + }, + { + "epoch": 0.04, + "grad_norm": 0.07324863402169346, + "learning_rate": 5e-05, + "loss": 1.246, + "step": 28 + }, + { + "epoch": 0.04, + "grad_norm": 0.08175218998309366, + "learning_rate": 5e-05, + "loss": 1.2348, + "step": 29 + }, + { + "epoch": 0.04, + "grad_norm": 0.08583913360130707, + "learning_rate": 5e-05, + "loss": 1.3035, + "step": 30 + }, + { + "epoch": 0.04, + "grad_norm": 0.07502711247215474, + "learning_rate": 5e-05, + "loss": 1.3474, + "step": 31 + }, + { + "epoch": 0.04, + "grad_norm": 0.07696046839654648, + "learning_rate": 5e-05, + "loss": 1.1426, + "step": 32 + }, + { + "epoch": 0.05, + "grad_norm": 0.06820817315021704, + "learning_rate": 5e-05, + "loss": 1.2306, + "step": 33 + }, + { + "epoch": 0.05, + "grad_norm": 0.06819731681879962, + "learning_rate": 5e-05, + "loss": 1.3403, + "step": 34 + }, + { + "epoch": 0.05, + "grad_norm": 0.0709871911694856, + "learning_rate": 5e-05, + "loss": 1.3955, + "step": 35 + }, + { + "epoch": 0.05, + "grad_norm": 0.06763821790570135, + "learning_rate": 5e-05, + "loss": 1.2616, + "step": 36 + }, + { + "epoch": 0.05, + "grad_norm": 0.07062528242297304, + "learning_rate": 5e-05, + "loss": 1.325, + "step": 37 + }, + { + "epoch": 0.05, + "grad_norm": 0.06805561973834255, + "learning_rate": 5e-05, + "loss": 1.3175, + "step": 38 + }, + { + "epoch": 0.05, + "grad_norm": 0.08790825664947328, + "learning_rate": 5e-05, + "loss": 1.3043, + "step": 39 + }, + { + "epoch": 0.06, + "grad_norm": 0.07561069106434462, + "learning_rate": 5e-05, + "loss": 1.1732, + "step": 40 + }, + { + "epoch": 0.06, + "grad_norm": 0.07459745434540017, + "learning_rate": 5e-05, + "loss": 1.341, + "step": 41 + }, + { + "epoch": 0.06, + "grad_norm": 0.07394792035518542, + "learning_rate": 5e-05, + "loss": 1.2382, + "step": 42 + }, + { + "epoch": 0.06, + "grad_norm": 0.07712869041324996, + "learning_rate": 5e-05, + "loss": 1.3714, + "step": 43 + }, + { + "epoch": 0.06, + "grad_norm": 0.06940306526975415, + "learning_rate": 5e-05, + "loss": 1.309, + "step": 44 + }, + { + "epoch": 0.06, + "grad_norm": 0.06771778026975195, + "learning_rate": 5e-05, + "loss": 1.2311, + "step": 45 + }, + { + "epoch": 0.06, + "grad_norm": 0.0740395369124851, + "learning_rate": 5e-05, + "loss": 1.2489, + "step": 46 + }, + { + "epoch": 0.07, + "grad_norm": 0.07077413357484925, + "learning_rate": 5e-05, + "loss": 1.2386, + "step": 47 + }, + { + "epoch": 0.07, + "grad_norm": 0.0720737094586865, + "learning_rate": 5e-05, + "loss": 1.1696, + "step": 48 + }, + { + "epoch": 0.07, + "grad_norm": 0.07198070989104487, + "learning_rate": 5e-05, + "loss": 1.3194, + "step": 49 + }, + { + "epoch": 0.07, + "grad_norm": 0.08452050939944894, + "learning_rate": 5e-05, + "loss": 1.2328, + "step": 50 + }, + { + "epoch": 0.07, + "grad_norm": 0.08048845476567269, + "learning_rate": 5e-05, + "loss": 1.328, + "step": 51 + }, + { + "epoch": 0.07, + "grad_norm": 0.06994386093892287, + "learning_rate": 5e-05, + "loss": 1.2783, + "step": 52 + }, + { + "epoch": 0.07, + "grad_norm": 0.08769492336853892, + "learning_rate": 5e-05, + "loss": 1.2528, + "step": 53 + }, + { + "epoch": 0.08, + "grad_norm": 0.07783706354156507, + "learning_rate": 5e-05, + "loss": 1.2972, + "step": 54 + }, + { + "epoch": 0.08, + "grad_norm": 0.07037998648664866, + "learning_rate": 5e-05, + "loss": 1.2901, + "step": 55 + }, + { + "epoch": 0.08, + "grad_norm": 0.07248406059256986, + "learning_rate": 5e-05, + "loss": 1.2696, + "step": 56 + }, + { + "epoch": 0.08, + "grad_norm": 0.0730426997946999, + "learning_rate": 5e-05, + "loss": 
1.3199, + "step": 57 + }, + { + "epoch": 0.08, + "grad_norm": 0.07371245539543211, + "learning_rate": 5e-05, + "loss": 1.1871, + "step": 58 + }, + { + "epoch": 0.08, + "grad_norm": 0.06789304450900832, + "learning_rate": 5e-05, + "loss": 1.2348, + "step": 59 + }, + { + "epoch": 0.08, + "grad_norm": 0.1476525689556603, + "learning_rate": 5e-05, + "loss": 1.2089, + "step": 60 + }, + { + "epoch": 0.09, + "grad_norm": 0.08237346280480178, + "learning_rate": 5e-05, + "loss": 1.307, + "step": 61 + }, + { + "epoch": 0.09, + "grad_norm": 0.07725835838817337, + "learning_rate": 5e-05, + "loss": 1.2057, + "step": 62 + }, + { + "epoch": 0.09, + "grad_norm": 0.07711157991602562, + "learning_rate": 5e-05, + "loss": 1.2259, + "step": 63 + }, + { + "epoch": 0.09, + "grad_norm": 0.07470511046688118, + "learning_rate": 5e-05, + "loss": 1.201, + "step": 64 + }, + { + "epoch": 0.09, + "grad_norm": 0.07309944563226711, + "learning_rate": 5e-05, + "loss": 1.2396, + "step": 65 + }, + { + "epoch": 0.09, + "grad_norm": 0.07933719744464587, + "learning_rate": 5e-05, + "loss": 1.1584, + "step": 66 + }, + { + "epoch": 0.09, + "grad_norm": 0.07649496283235623, + "learning_rate": 5e-05, + "loss": 1.333, + "step": 67 + }, + { + "epoch": 0.1, + "grad_norm": 0.11695602061284746, + "learning_rate": 5e-05, + "loss": 1.3591, + "step": 68 + }, + { + "epoch": 0.1, + "grad_norm": 0.07811310800925421, + "learning_rate": 5e-05, + "loss": 1.3388, + "step": 69 + }, + { + "epoch": 0.1, + "grad_norm": 0.07584429948947581, + "learning_rate": 5e-05, + "loss": 1.295, + "step": 70 + }, + { + "epoch": 0.1, + "grad_norm": 0.08073403862737152, + "learning_rate": 5e-05, + "loss": 1.2076, + "step": 71 + }, + { + "epoch": 0.1, + "grad_norm": 0.07720820931346752, + "learning_rate": 5e-05, + "loss": 1.297, + "step": 72 + }, + { + "epoch": 0.1, + "grad_norm": 0.07036654399509103, + "learning_rate": 5e-05, + "loss": 1.2311, + "step": 73 + }, + { + "epoch": 0.1, + "grad_norm": 0.0752611690575668, + "learning_rate": 5e-05, + "loss": 1.2585, + "step": 74 + }, + { + "epoch": 0.11, + "grad_norm": 0.10117990124794089, + "learning_rate": 5e-05, + "loss": 1.2026, + "step": 75 + }, + { + "epoch": 0.11, + "grad_norm": 0.07538403647241076, + "learning_rate": 5e-05, + "loss": 1.2268, + "step": 76 + }, + { + "epoch": 0.11, + "grad_norm": 0.07331262441864544, + "learning_rate": 5e-05, + "loss": 1.2595, + "step": 77 + }, + { + "epoch": 0.11, + "grad_norm": 0.09199888694287987, + "learning_rate": 5e-05, + "loss": 1.2137, + "step": 78 + }, + { + "epoch": 0.11, + "grad_norm": 0.08374160867498517, + "learning_rate": 5e-05, + "loss": 1.3628, + "step": 79 + }, + { + "epoch": 0.11, + "grad_norm": 0.07717125781677839, + "learning_rate": 5e-05, + "loss": 1.2651, + "step": 80 + }, + { + "epoch": 0.11, + "grad_norm": 0.08093764355996728, + "learning_rate": 5e-05, + "loss": 1.3122, + "step": 81 + }, + { + "epoch": 0.12, + "grad_norm": 0.08556679449347177, + "learning_rate": 5e-05, + "loss": 1.3326, + "step": 82 + }, + { + "epoch": 0.12, + "grad_norm": 0.09227553466496595, + "learning_rate": 5e-05, + "loss": 1.249, + "step": 83 + }, + { + "epoch": 0.12, + "grad_norm": 0.08102953860671439, + "learning_rate": 5e-05, + "loss": 1.302, + "step": 84 + }, + { + "epoch": 0.12, + "grad_norm": 0.07754129003679357, + "learning_rate": 5e-05, + "loss": 1.2505, + "step": 85 + }, + { + "epoch": 0.12, + "grad_norm": 0.07108985280852988, + "learning_rate": 5e-05, + "loss": 1.2064, + "step": 86 + }, + { + "epoch": 0.12, + "grad_norm": 0.07158298922127657, + "learning_rate": 5e-05, + "loss": 
1.1823, + "step": 87 + }, + { + "epoch": 0.12, + "grad_norm": 0.08135521726556208, + "learning_rate": 5e-05, + "loss": 1.3405, + "step": 88 + }, + { + "epoch": 0.12, + "grad_norm": 0.08307657353416888, + "learning_rate": 5e-05, + "loss": 1.305, + "step": 89 + }, + { + "epoch": 0.13, + "grad_norm": 0.07889121728559233, + "learning_rate": 5e-05, + "loss": 1.2913, + "step": 90 + }, + { + "epoch": 0.13, + "grad_norm": 0.08268613381268089, + "learning_rate": 5e-05, + "loss": 1.2941, + "step": 91 + }, + { + "epoch": 0.13, + "grad_norm": 0.07894491012006483, + "learning_rate": 5e-05, + "loss": 1.3321, + "step": 92 + }, + { + "epoch": 0.13, + "grad_norm": 0.08455122305124131, + "learning_rate": 5e-05, + "loss": 1.192, + "step": 93 + }, + { + "epoch": 0.13, + "grad_norm": 0.1677625516738703, + "learning_rate": 5e-05, + "loss": 1.2806, + "step": 94 + }, + { + "epoch": 0.13, + "grad_norm": 0.25154398832533, + "learning_rate": 5e-05, + "loss": 1.3535, + "step": 95 + }, + { + "epoch": 0.13, + "grad_norm": 0.07611303051014763, + "learning_rate": 5e-05, + "loss": 1.2624, + "step": 96 + }, + { + "epoch": 0.14, + "grad_norm": 0.11344652547119116, + "learning_rate": 5e-05, + "loss": 1.1627, + "step": 97 + }, + { + "epoch": 0.14, + "grad_norm": 0.14434115169168604, + "learning_rate": 5e-05, + "loss": 1.1876, + "step": 98 + }, + { + "epoch": 0.14, + "grad_norm": 0.07458715411644555, + "learning_rate": 5e-05, + "loss": 1.2946, + "step": 99 + }, + { + "epoch": 0.14, + "grad_norm": 0.08544912002632542, + "learning_rate": 5e-05, + "loss": 1.2444, + "step": 100 + }, + { + "epoch": 0.14, + "grad_norm": 0.08171618538200655, + "learning_rate": 5e-05, + "loss": 1.2129, + "step": 101 + }, + { + "epoch": 0.14, + "grad_norm": 0.08255798880682443, + "learning_rate": 5e-05, + "loss": 1.3372, + "step": 102 + }, + { + "epoch": 0.14, + "grad_norm": 0.08947951396867447, + "learning_rate": 5e-05, + "loss": 1.332, + "step": 103 + }, + { + "epoch": 0.15, + "grad_norm": 0.0777659290508948, + "learning_rate": 5e-05, + "loss": 1.2014, + "step": 104 + }, + { + "epoch": 0.15, + "grad_norm": 0.08952849723664662, + "learning_rate": 5e-05, + "loss": 1.2483, + "step": 105 + }, + { + "epoch": 0.15, + "grad_norm": 0.09078615775230477, + "learning_rate": 5e-05, + "loss": 1.2853, + "step": 106 + }, + { + "epoch": 0.15, + "grad_norm": 0.0896534354898437, + "learning_rate": 5e-05, + "loss": 1.1306, + "step": 107 + }, + { + "epoch": 0.15, + "grad_norm": 0.08636961732695114, + "learning_rate": 5e-05, + "loss": 1.1913, + "step": 108 + }, + { + "epoch": 0.15, + "grad_norm": 0.08402924358963194, + "learning_rate": 5e-05, + "loss": 1.2316, + "step": 109 + }, + { + "epoch": 0.15, + "grad_norm": 0.08614677250811724, + "learning_rate": 5e-05, + "loss": 1.1496, + "step": 110 + }, + { + "epoch": 0.16, + "grad_norm": 0.09758783646953327, + "learning_rate": 5e-05, + "loss": 1.2526, + "step": 111 + }, + { + "epoch": 0.16, + "grad_norm": 0.11123906322423639, + "learning_rate": 5e-05, + "loss": 1.3143, + "step": 112 + }, + { + "epoch": 0.16, + "grad_norm": 0.07953490903350872, + "learning_rate": 5e-05, + "loss": 1.2608, + "step": 113 + }, + { + "epoch": 0.16, + "grad_norm": 0.08096533975103022, + "learning_rate": 5e-05, + "loss": 1.2899, + "step": 114 + }, + { + "epoch": 0.16, + "grad_norm": 0.09569417661442663, + "learning_rate": 5e-05, + "loss": 1.2204, + "step": 115 + }, + { + "epoch": 0.16, + "grad_norm": 0.07797798808759553, + "learning_rate": 5e-05, + "loss": 1.3607, + "step": 116 + }, + { + "epoch": 0.16, + "grad_norm": 0.09212254796343204, + 
"learning_rate": 5e-05, + "loss": 1.2617, + "step": 117 + }, + { + "epoch": 0.17, + "grad_norm": 0.08699394943161823, + "learning_rate": 5e-05, + "loss": 1.2948, + "step": 118 + }, + { + "epoch": 0.17, + "grad_norm": 0.08381286973096141, + "learning_rate": 5e-05, + "loss": 1.1897, + "step": 119 + }, + { + "epoch": 0.17, + "grad_norm": 0.08132336656290666, + "learning_rate": 5e-05, + "loss": 1.281, + "step": 120 + }, + { + "epoch": 0.17, + "grad_norm": 0.08386795924919085, + "learning_rate": 5e-05, + "loss": 1.2912, + "step": 121 + }, + { + "epoch": 0.17, + "grad_norm": 0.07665657252389507, + "learning_rate": 5e-05, + "loss": 1.2528, + "step": 122 + }, + { + "epoch": 0.17, + "grad_norm": 0.09075112331474103, + "learning_rate": 5e-05, + "loss": 1.3052, + "step": 123 + }, + { + "epoch": 0.17, + "grad_norm": 0.0998634337377465, + "learning_rate": 5e-05, + "loss": 1.2649, + "step": 124 + }, + { + "epoch": 0.18, + "grad_norm": 0.08488815637357822, + "learning_rate": 5e-05, + "loss": 1.2227, + "step": 125 + }, + { + "epoch": 0.18, + "grad_norm": 0.10212173798210869, + "learning_rate": 5e-05, + "loss": 1.2753, + "step": 126 + }, + { + "epoch": 0.18, + "grad_norm": 0.08261475705898018, + "learning_rate": 5e-05, + "loss": 1.2333, + "step": 127 + }, + { + "epoch": 0.18, + "grad_norm": 0.08383736010274866, + "learning_rate": 5e-05, + "loss": 1.2227, + "step": 128 + }, + { + "epoch": 0.18, + "grad_norm": 0.08707820434533624, + "learning_rate": 5e-05, + "loss": 1.2376, + "step": 129 + }, + { + "epoch": 0.18, + "grad_norm": 0.08073196834288653, + "learning_rate": 5e-05, + "loss": 1.2409, + "step": 130 + }, + { + "epoch": 0.18, + "grad_norm": 0.09335661069349774, + "learning_rate": 5e-05, + "loss": 1.2679, + "step": 131 + }, + { + "epoch": 0.19, + "grad_norm": 0.07975046183728626, + "learning_rate": 5e-05, + "loss": 1.1778, + "step": 132 + }, + { + "epoch": 0.19, + "grad_norm": 0.10411241749696298, + "learning_rate": 5e-05, + "loss": 1.2002, + "step": 133 + }, + { + "epoch": 0.19, + "grad_norm": 0.09271681351921757, + "learning_rate": 5e-05, + "loss": 1.1609, + "step": 134 + }, + { + "epoch": 0.19, + "grad_norm": 0.08175749124919046, + "learning_rate": 5e-05, + "loss": 1.2824, + "step": 135 + }, + { + "epoch": 0.19, + "grad_norm": 0.08489842461646079, + "learning_rate": 5e-05, + "loss": 1.1674, + "step": 136 + }, + { + "epoch": 0.19, + "grad_norm": 0.09042100379932565, + "learning_rate": 5e-05, + "loss": 1.1701, + "step": 137 + }, + { + "epoch": 0.19, + "grad_norm": 0.09804655799498443, + "learning_rate": 5e-05, + "loss": 1.2956, + "step": 138 + }, + { + "epoch": 0.2, + "grad_norm": 0.07847442132135601, + "learning_rate": 5e-05, + "loss": 1.3093, + "step": 139 + }, + { + "epoch": 0.2, + "grad_norm": 0.08770750121763184, + "learning_rate": 5e-05, + "loss": 1.2946, + "step": 140 + }, + { + "epoch": 0.2, + "grad_norm": 0.10158480784213657, + "learning_rate": 5e-05, + "loss": 1.2356, + "step": 141 + }, + { + "epoch": 0.2, + "grad_norm": 0.10135335196013266, + "learning_rate": 5e-05, + "loss": 1.3034, + "step": 142 + }, + { + "epoch": 0.2, + "grad_norm": 0.08382812027073917, + "learning_rate": 5e-05, + "loss": 1.3398, + "step": 143 + }, + { + "epoch": 0.2, + "grad_norm": 0.09584092766963259, + "learning_rate": 5e-05, + "loss": 1.2887, + "step": 144 + }, + { + "epoch": 0.2, + "grad_norm": 0.08976747468288296, + "learning_rate": 5e-05, + "loss": 1.2122, + "step": 145 + }, + { + "epoch": 0.21, + "grad_norm": 0.0824040159135501, + "learning_rate": 5e-05, + "loss": 1.2985, + "step": 146 + }, + { + "epoch": 0.21, + 
"grad_norm": 0.09428935899731097, + "learning_rate": 5e-05, + "loss": 1.2339, + "step": 147 + }, + { + "epoch": 0.21, + "grad_norm": 0.07975866363163563, + "learning_rate": 5e-05, + "loss": 1.2496, + "step": 148 + }, + { + "epoch": 0.21, + "grad_norm": 0.08671985922920832, + "learning_rate": 5e-05, + "loss": 1.3155, + "step": 149 + }, + { + "epoch": 0.21, + "grad_norm": 0.08501817899150449, + "learning_rate": 5e-05, + "loss": 1.216, + "step": 150 + }, + { + "epoch": 0.21, + "grad_norm": 0.09083889660830718, + "learning_rate": 5e-05, + "loss": 1.1607, + "step": 151 + }, + { + "epoch": 0.21, + "grad_norm": 0.08645635978804701, + "learning_rate": 5e-05, + "loss": 1.2367, + "step": 152 + }, + { + "epoch": 0.21, + "grad_norm": 0.09008336643659964, + "learning_rate": 5e-05, + "loss": 1.2224, + "step": 153 + }, + { + "epoch": 0.22, + "grad_norm": 0.08930348517009218, + "learning_rate": 5e-05, + "loss": 1.2816, + "step": 154 + }, + { + "epoch": 0.22, + "grad_norm": 0.07800593774723825, + "learning_rate": 5e-05, + "loss": 1.2283, + "step": 155 + }, + { + "epoch": 0.22, + "grad_norm": 0.07870429263081696, + "learning_rate": 5e-05, + "loss": 1.2229, + "step": 156 + }, + { + "epoch": 0.22, + "grad_norm": 0.08841870671121418, + "learning_rate": 5e-05, + "loss": 1.1674, + "step": 157 + }, + { + "epoch": 0.22, + "grad_norm": 0.15926423427086853, + "learning_rate": 5e-05, + "loss": 1.2827, + "step": 158 + }, + { + "epoch": 0.22, + "grad_norm": 0.09104370482645933, + "learning_rate": 5e-05, + "loss": 1.1922, + "step": 159 + }, + { + "epoch": 0.22, + "grad_norm": 0.08996245356427444, + "learning_rate": 5e-05, + "loss": 1.2393, + "step": 160 + }, + { + "epoch": 0.23, + "grad_norm": 0.08355593256579182, + "learning_rate": 5e-05, + "loss": 1.18, + "step": 161 + }, + { + "epoch": 0.23, + "grad_norm": 0.08613963433468404, + "learning_rate": 5e-05, + "loss": 1.2513, + "step": 162 + }, + { + "epoch": 0.23, + "grad_norm": 0.07824373219506703, + "learning_rate": 5e-05, + "loss": 1.2672, + "step": 163 + }, + { + "epoch": 0.23, + "grad_norm": 0.18032874105981436, + "learning_rate": 5e-05, + "loss": 1.2608, + "step": 164 + }, + { + "epoch": 0.23, + "grad_norm": 0.08775610122970826, + "learning_rate": 5e-05, + "loss": 1.2987, + "step": 165 + }, + { + "epoch": 0.23, + "grad_norm": 0.09364314363077682, + "learning_rate": 5e-05, + "loss": 1.3052, + "step": 166 + }, + { + "epoch": 0.23, + "grad_norm": 0.09100701495842761, + "learning_rate": 5e-05, + "loss": 1.2144, + "step": 167 + }, + { + "epoch": 0.24, + "grad_norm": 0.0847083094989299, + "learning_rate": 5e-05, + "loss": 1.2426, + "step": 168 + }, + { + "epoch": 0.24, + "grad_norm": 0.08867714842702075, + "learning_rate": 5e-05, + "loss": 1.1743, + "step": 169 + }, + { + "epoch": 0.24, + "grad_norm": 0.09247456282992296, + "learning_rate": 5e-05, + "loss": 1.2495, + "step": 170 + }, + { + "epoch": 0.24, + "grad_norm": 0.1060254132672116, + "learning_rate": 5e-05, + "loss": 1.2256, + "step": 171 + }, + { + "epoch": 0.24, + "grad_norm": 0.09214434586631197, + "learning_rate": 5e-05, + "loss": 1.2797, + "step": 172 + }, + { + "epoch": 0.24, + "grad_norm": 0.0818323059400522, + "learning_rate": 5e-05, + "loss": 1.1637, + "step": 173 + }, + { + "epoch": 0.24, + "grad_norm": 0.08731967992349708, + "learning_rate": 5e-05, + "loss": 1.2443, + "step": 174 + }, + { + "epoch": 0.25, + "grad_norm": 0.09012749151558586, + "learning_rate": 5e-05, + "loss": 1.2174, + "step": 175 + }, + { + "epoch": 0.25, + "grad_norm": 0.08863238488247215, + "learning_rate": 5e-05, + "loss": 1.296, + 
"step": 176 + }, + { + "epoch": 0.25, + "grad_norm": 0.09286714960933078, + "learning_rate": 5e-05, + "loss": 1.2342, + "step": 177 + }, + { + "epoch": 0.25, + "grad_norm": 0.09316153696727396, + "learning_rate": 5e-05, + "loss": 1.1848, + "step": 178 + }, + { + "epoch": 0.25, + "grad_norm": 0.08252849611179802, + "learning_rate": 5e-05, + "loss": 1.2113, + "step": 179 + }, + { + "epoch": 0.25, + "grad_norm": 0.09483532321449605, + "learning_rate": 5e-05, + "loss": 1.3477, + "step": 180 + }, + { + "epoch": 0.25, + "grad_norm": 0.08529609510104497, + "learning_rate": 5e-05, + "loss": 1.2457, + "step": 181 + }, + { + "epoch": 0.26, + "grad_norm": 0.08694739448894302, + "learning_rate": 5e-05, + "loss": 1.3046, + "step": 182 + }, + { + "epoch": 0.26, + "grad_norm": 0.09279919556230308, + "learning_rate": 5e-05, + "loss": 1.3031, + "step": 183 + }, + { + "epoch": 0.26, + "grad_norm": 0.08702565087109819, + "learning_rate": 5e-05, + "loss": 1.2848, + "step": 184 + }, + { + "epoch": 0.26, + "grad_norm": 0.09380380233696196, + "learning_rate": 5e-05, + "loss": 1.1916, + "step": 185 + }, + { + "epoch": 0.26, + "grad_norm": 0.08688165004745885, + "learning_rate": 5e-05, + "loss": 1.2123, + "step": 186 + }, + { + "epoch": 0.26, + "grad_norm": 0.09820840635443695, + "learning_rate": 5e-05, + "loss": 1.2032, + "step": 187 + }, + { + "epoch": 0.26, + "grad_norm": 0.09589255206537063, + "learning_rate": 5e-05, + "loss": 1.241, + "step": 188 + }, + { + "epoch": 0.27, + "grad_norm": 0.08807557974050134, + "learning_rate": 5e-05, + "loss": 1.3048, + "step": 189 + }, + { + "epoch": 0.27, + "grad_norm": 0.0851856228850121, + "learning_rate": 5e-05, + "loss": 1.2052, + "step": 190 + }, + { + "epoch": 0.27, + "grad_norm": 0.0934937983165827, + "learning_rate": 5e-05, + "loss": 1.2121, + "step": 191 + }, + { + "epoch": 0.27, + "grad_norm": 0.08374744566540891, + "learning_rate": 5e-05, + "loss": 1.2914, + "step": 192 + }, + { + "epoch": 0.27, + "grad_norm": 0.08750918563451558, + "learning_rate": 5e-05, + "loss": 1.2823, + "step": 193 + }, + { + "epoch": 0.27, + "grad_norm": 0.08869708414841139, + "learning_rate": 5e-05, + "loss": 1.2801, + "step": 194 + }, + { + "epoch": 0.27, + "grad_norm": 0.11960800847754802, + "learning_rate": 5e-05, + "loss": 1.3148, + "step": 195 + }, + { + "epoch": 0.28, + "grad_norm": 0.08976487676811867, + "learning_rate": 5e-05, + "loss": 1.2082, + "step": 196 + }, + { + "epoch": 0.28, + "grad_norm": 0.08453661418908805, + "learning_rate": 5e-05, + "loss": 1.3163, + "step": 197 + }, + { + "epoch": 0.28, + "grad_norm": 0.08962171718158313, + "learning_rate": 5e-05, + "loss": 1.1934, + "step": 198 + }, + { + "epoch": 0.28, + "grad_norm": 0.08257769819917804, + "learning_rate": 5e-05, + "loss": 1.2528, + "step": 199 + }, + { + "epoch": 0.28, + "grad_norm": 0.09925347017579209, + "learning_rate": 5e-05, + "loss": 1.2031, + "step": 200 + }, + { + "epoch": 0.28, + "grad_norm": 0.08906225826845884, + "learning_rate": 5e-05, + "loss": 1.2194, + "step": 201 + }, + { + "epoch": 0.28, + "grad_norm": 0.0942395349237732, + "learning_rate": 5e-05, + "loss": 1.178, + "step": 202 + }, + { + "epoch": 0.29, + "grad_norm": 0.09824155177800245, + "learning_rate": 5e-05, + "loss": 1.2475, + "step": 203 + }, + { + "epoch": 0.29, + "grad_norm": 0.10058855273546559, + "learning_rate": 5e-05, + "loss": 1.2937, + "step": 204 + }, + { + "epoch": 0.29, + "grad_norm": 0.09414156661731717, + "learning_rate": 5e-05, + "loss": 1.1772, + "step": 205 + }, + { + "epoch": 0.29, + "grad_norm": 0.09054559747969201, + 
"learning_rate": 5e-05, + "loss": 1.193, + "step": 206 + }, + { + "epoch": 0.29, + "grad_norm": 0.09536717414530912, + "learning_rate": 5e-05, + "loss": 1.307, + "step": 207 + }, + { + "epoch": 0.29, + "grad_norm": 0.08494148514851217, + "learning_rate": 5e-05, + "loss": 1.2225, + "step": 208 + }, + { + "epoch": 0.29, + "grad_norm": 0.08863261369887773, + "learning_rate": 5e-05, + "loss": 1.227, + "step": 209 + }, + { + "epoch": 0.29, + "grad_norm": 0.08582994520739551, + "learning_rate": 5e-05, + "loss": 1.2782, + "step": 210 + }, + { + "epoch": 0.3, + "grad_norm": 0.08790341158498588, + "learning_rate": 5e-05, + "loss": 1.2166, + "step": 211 + }, + { + "epoch": 0.3, + "grad_norm": 0.08519174171612676, + "learning_rate": 5e-05, + "loss": 1.1953, + "step": 212 + }, + { + "epoch": 0.3, + "grad_norm": 0.09248080147412965, + "learning_rate": 5e-05, + "loss": 1.3265, + "step": 213 + }, + { + "epoch": 0.3, + "grad_norm": 0.08567765786036889, + "learning_rate": 5e-05, + "loss": 1.312, + "step": 214 + }, + { + "epoch": 0.3, + "grad_norm": 0.0835142660273622, + "learning_rate": 5e-05, + "loss": 1.2368, + "step": 215 + }, + { + "epoch": 0.3, + "grad_norm": 0.08856004333772806, + "learning_rate": 5e-05, + "loss": 1.1784, + "step": 216 + }, + { + "epoch": 0.3, + "grad_norm": 0.08873223128935914, + "learning_rate": 5e-05, + "loss": 1.3025, + "step": 217 + }, + { + "epoch": 0.31, + "grad_norm": 0.08647493299244241, + "learning_rate": 5e-05, + "loss": 1.1954, + "step": 218 + }, + { + "epoch": 0.31, + "grad_norm": 0.09445285988148235, + "learning_rate": 5e-05, + "loss": 1.25, + "step": 219 + }, + { + "epoch": 0.31, + "grad_norm": 0.11092729662636047, + "learning_rate": 5e-05, + "loss": 1.3051, + "step": 220 + }, + { + "epoch": 0.31, + "grad_norm": 0.09774815964460999, + "learning_rate": 5e-05, + "loss": 1.2498, + "step": 221 + }, + { + "epoch": 0.31, + "grad_norm": 0.09827733571327216, + "learning_rate": 5e-05, + "loss": 1.2697, + "step": 222 + }, + { + "epoch": 0.31, + "grad_norm": 0.08569612523718341, + "learning_rate": 5e-05, + "loss": 1.2516, + "step": 223 + }, + { + "epoch": 0.31, + "grad_norm": 0.08163320323139941, + "learning_rate": 5e-05, + "loss": 1.2842, + "step": 224 + }, + { + "epoch": 0.32, + "grad_norm": 0.13696947151285582, + "learning_rate": 5e-05, + "loss": 1.1765, + "step": 225 + }, + { + "epoch": 0.32, + "grad_norm": 0.09525249130971891, + "learning_rate": 5e-05, + "loss": 1.2997, + "step": 226 + }, + { + "epoch": 0.32, + "grad_norm": 0.10987658522507797, + "learning_rate": 5e-05, + "loss": 1.2643, + "step": 227 + }, + { + "epoch": 0.32, + "grad_norm": 0.08843542171154735, + "learning_rate": 5e-05, + "loss": 1.2358, + "step": 228 + }, + { + "epoch": 0.32, + "grad_norm": 0.08739208171659757, + "learning_rate": 5e-05, + "loss": 1.2802, + "step": 229 + }, + { + "epoch": 0.32, + "grad_norm": 0.10011323980243902, + "learning_rate": 5e-05, + "loss": 1.1967, + "step": 230 + }, + { + "epoch": 0.32, + "grad_norm": 0.08597111030102798, + "learning_rate": 5e-05, + "loss": 1.2302, + "step": 231 + }, + { + "epoch": 0.33, + "grad_norm": 0.09179633281317666, + "learning_rate": 5e-05, + "loss": 1.2575, + "step": 232 + }, + { + "epoch": 0.33, + "grad_norm": 0.09528150968513077, + "learning_rate": 5e-05, + "loss": 1.2282, + "step": 233 + }, + { + "epoch": 0.33, + "grad_norm": 0.08106734899347699, + "learning_rate": 5e-05, + "loss": 1.2606, + "step": 234 + }, + { + "epoch": 0.33, + "grad_norm": 0.0930618107576243, + "learning_rate": 5e-05, + "loss": 1.2677, + "step": 235 + }, + { + "epoch": 0.33, + 
"grad_norm": 0.08303705141327221, + "learning_rate": 5e-05, + "loss": 1.3289, + "step": 236 + }, + { + "epoch": 0.33, + "grad_norm": 0.09159494271145804, + "learning_rate": 5e-05, + "loss": 1.2534, + "step": 237 + }, + { + "epoch": 0.33, + "grad_norm": 0.09025281712028028, + "learning_rate": 5e-05, + "loss": 1.1577, + "step": 238 + }, + { + "epoch": 0.34, + "grad_norm": 0.0886485132543316, + "learning_rate": 5e-05, + "loss": 1.1677, + "step": 239 + }, + { + "epoch": 0.34, + "grad_norm": 0.09261389531754648, + "learning_rate": 5e-05, + "loss": 1.2752, + "step": 240 + }, + { + "epoch": 0.34, + "grad_norm": 0.08496260150385654, + "learning_rate": 5e-05, + "loss": 1.2353, + "step": 241 + }, + { + "epoch": 0.34, + "grad_norm": 0.08356974381967604, + "learning_rate": 5e-05, + "loss": 1.1633, + "step": 242 + }, + { + "epoch": 0.34, + "grad_norm": 0.13720586549759095, + "learning_rate": 5e-05, + "loss": 1.1981, + "step": 243 + }, + { + "epoch": 0.34, + "grad_norm": 0.08991492270952986, + "learning_rate": 5e-05, + "loss": 1.2277, + "step": 244 + }, + { + "epoch": 0.34, + "grad_norm": 0.08280429043868678, + "learning_rate": 5e-05, + "loss": 1.2415, + "step": 245 + }, + { + "epoch": 0.35, + "grad_norm": 0.0850877529579615, + "learning_rate": 5e-05, + "loss": 1.0818, + "step": 246 + }, + { + "epoch": 0.35, + "grad_norm": 0.09180702494444304, + "learning_rate": 5e-05, + "loss": 1.2495, + "step": 247 + }, + { + "epoch": 0.35, + "grad_norm": 0.08742281930183199, + "learning_rate": 5e-05, + "loss": 1.309, + "step": 248 + }, + { + "epoch": 0.35, + "grad_norm": 0.09480410539183527, + "learning_rate": 5e-05, + "loss": 1.2493, + "step": 249 + }, + { + "epoch": 0.35, + "grad_norm": 0.08864291334102126, + "learning_rate": 5e-05, + "loss": 1.1224, + "step": 250 + }, + { + "epoch": 0.35, + "grad_norm": 0.1331463933659882, + "learning_rate": 5e-05, + "loss": 1.1975, + "step": 251 + }, + { + "epoch": 0.35, + "grad_norm": 0.08559446666097127, + "learning_rate": 5e-05, + "loss": 1.2604, + "step": 252 + }, + { + "epoch": 0.36, + "grad_norm": 0.0873395718049463, + "learning_rate": 5e-05, + "loss": 1.1965, + "step": 253 + }, + { + "epoch": 0.36, + "grad_norm": 0.09091016884233578, + "learning_rate": 5e-05, + "loss": 1.2809, + "step": 254 + }, + { + "epoch": 0.36, + "grad_norm": 0.07786084061903843, + "learning_rate": 5e-05, + "loss": 1.209, + "step": 255 + }, + { + "epoch": 0.36, + "grad_norm": 0.10081388065490703, + "learning_rate": 5e-05, + "loss": 1.3263, + "step": 256 + }, + { + "epoch": 0.36, + "grad_norm": 0.10082059674988177, + "learning_rate": 5e-05, + "loss": 1.2148, + "step": 257 + }, + { + "epoch": 0.36, + "grad_norm": 0.08564770363821494, + "learning_rate": 5e-05, + "loss": 1.3174, + "step": 258 + }, + { + "epoch": 0.36, + "grad_norm": 0.0952758942019198, + "learning_rate": 5e-05, + "loss": 1.2204, + "step": 259 + }, + { + "epoch": 0.37, + "grad_norm": 0.09881073855543142, + "learning_rate": 5e-05, + "loss": 1.2996, + "step": 260 + }, + { + "epoch": 0.37, + "grad_norm": 0.08733032423974088, + "learning_rate": 5e-05, + "loss": 1.2173, + "step": 261 + }, + { + "epoch": 0.37, + "grad_norm": 0.11515922448818783, + "learning_rate": 5e-05, + "loss": 1.2242, + "step": 262 + }, + { + "epoch": 0.37, + "grad_norm": 0.09177119070282434, + "learning_rate": 5e-05, + "loss": 1.2302, + "step": 263 + }, + { + "epoch": 0.37, + "grad_norm": 0.10216888435274363, + "learning_rate": 5e-05, + "loss": 1.2541, + "step": 264 + }, + { + "epoch": 0.37, + "grad_norm": 0.0944990608652097, + "learning_rate": 5e-05, + "loss": 1.1954, + 
"step": 265 + }, + { + "epoch": 0.37, + "grad_norm": 0.09399366717246797, + "learning_rate": 5e-05, + "loss": 1.3233, + "step": 266 + }, + { + "epoch": 0.38, + "grad_norm": 0.08753966644581422, + "learning_rate": 5e-05, + "loss": 1.2376, + "step": 267 + }, + { + "epoch": 0.38, + "grad_norm": 0.08766716380581922, + "learning_rate": 5e-05, + "loss": 1.2428, + "step": 268 + }, + { + "epoch": 0.38, + "grad_norm": 0.09700285494711312, + "learning_rate": 5e-05, + "loss": 1.2304, + "step": 269 + }, + { + "epoch": 0.38, + "grad_norm": 0.10359687500650869, + "learning_rate": 5e-05, + "loss": 1.2355, + "step": 270 + }, + { + "epoch": 0.38, + "grad_norm": 0.09285186919480592, + "learning_rate": 5e-05, + "loss": 1.2518, + "step": 271 + }, + { + "epoch": 0.38, + "grad_norm": 0.08843497139723314, + "learning_rate": 5e-05, + "loss": 1.2243, + "step": 272 + }, + { + "epoch": 0.38, + "grad_norm": 0.09033544598942958, + "learning_rate": 5e-05, + "loss": 1.1521, + "step": 273 + }, + { + "epoch": 0.38, + "grad_norm": 0.08935014610866102, + "learning_rate": 5e-05, + "loss": 1.2891, + "step": 274 + }, + { + "epoch": 0.39, + "grad_norm": 0.08877209475664807, + "learning_rate": 5e-05, + "loss": 1.2338, + "step": 275 + }, + { + "epoch": 0.39, + "grad_norm": 0.08620882239728929, + "learning_rate": 5e-05, + "loss": 1.3411, + "step": 276 + }, + { + "epoch": 0.39, + "grad_norm": 0.09866190499148263, + "learning_rate": 5e-05, + "loss": 1.2749, + "step": 277 + }, + { + "epoch": 0.39, + "grad_norm": 0.08740411789029733, + "learning_rate": 5e-05, + "loss": 1.2063, + "step": 278 + }, + { + "epoch": 0.39, + "grad_norm": 0.08539683234832028, + "learning_rate": 5e-05, + "loss": 1.1568, + "step": 279 + }, + { + "epoch": 0.39, + "grad_norm": 0.10362328643593488, + "learning_rate": 5e-05, + "loss": 1.1842, + "step": 280 + }, + { + "epoch": 0.39, + "grad_norm": 0.08632925887920927, + "learning_rate": 5e-05, + "loss": 1.2825, + "step": 281 + }, + { + "epoch": 0.4, + "grad_norm": 0.1474505092935928, + "learning_rate": 5e-05, + "loss": 1.1881, + "step": 282 + }, + { + "epoch": 0.4, + "grad_norm": 0.09000910585705812, + "learning_rate": 5e-05, + "loss": 1.2739, + "step": 283 + }, + { + "epoch": 0.4, + "grad_norm": 0.09833682566759666, + "learning_rate": 5e-05, + "loss": 1.1984, + "step": 284 + }, + { + "epoch": 0.4, + "grad_norm": 0.08079206912854661, + "learning_rate": 5e-05, + "loss": 1.2278, + "step": 285 + }, + { + "epoch": 0.4, + "grad_norm": 0.09010068492829859, + "learning_rate": 5e-05, + "loss": 1.2837, + "step": 286 + }, + { + "epoch": 0.4, + "grad_norm": 0.09983988726064333, + "learning_rate": 5e-05, + "loss": 1.2364, + "step": 287 + }, + { + "epoch": 0.4, + "grad_norm": 0.1949319967540023, + "learning_rate": 5e-05, + "loss": 1.2683, + "step": 288 + }, + { + "epoch": 0.41, + "grad_norm": 0.08987585052214384, + "learning_rate": 5e-05, + "loss": 1.1786, + "step": 289 + }, + { + "epoch": 0.41, + "grad_norm": 0.08979447515949317, + "learning_rate": 5e-05, + "loss": 1.2406, + "step": 290 + }, + { + "epoch": 0.41, + "grad_norm": 0.0868775798788883, + "learning_rate": 5e-05, + "loss": 1.1654, + "step": 291 + }, + { + "epoch": 0.41, + "grad_norm": 0.08790675919121381, + "learning_rate": 5e-05, + "loss": 1.3515, + "step": 292 + }, + { + "epoch": 0.41, + "grad_norm": 0.094346400666408, + "learning_rate": 5e-05, + "loss": 1.2509, + "step": 293 + }, + { + "epoch": 0.41, + "grad_norm": 0.18108569024807403, + "learning_rate": 5e-05, + "loss": 1.1853, + "step": 294 + }, + { + "epoch": 0.41, + "grad_norm": 0.08669665678623893, + 
"learning_rate": 5e-05, + "loss": 1.2177, + "step": 295 + }, + { + "epoch": 0.42, + "grad_norm": 0.09400313374318035, + "learning_rate": 5e-05, + "loss": 1.2741, + "step": 296 + }, + { + "epoch": 0.42, + "grad_norm": 0.08596356200590288, + "learning_rate": 5e-05, + "loss": 1.3281, + "step": 297 + }, + { + "epoch": 0.42, + "grad_norm": 0.0897491130014772, + "learning_rate": 5e-05, + "loss": 1.2531, + "step": 298 + }, + { + "epoch": 0.42, + "grad_norm": 0.08733436125874126, + "learning_rate": 5e-05, + "loss": 1.3443, + "step": 299 + }, + { + "epoch": 0.42, + "grad_norm": 0.08573312367040001, + "learning_rate": 5e-05, + "loss": 1.224, + "step": 300 + }, + { + "epoch": 0.42, + "grad_norm": 0.09557952530027969, + "learning_rate": 5e-05, + "loss": 1.1734, + "step": 301 + }, + { + "epoch": 0.42, + "grad_norm": 0.09774252440125493, + "learning_rate": 5e-05, + "loss": 1.3189, + "step": 302 + }, + { + "epoch": 0.43, + "grad_norm": 0.09209862644609934, + "learning_rate": 5e-05, + "loss": 1.1251, + "step": 303 + }, + { + "epoch": 0.43, + "grad_norm": 0.5862160971018138, + "learning_rate": 5e-05, + "loss": 1.2312, + "step": 304 + }, + { + "epoch": 0.43, + "grad_norm": 0.09270307269906179, + "learning_rate": 5e-05, + "loss": 1.2261, + "step": 305 + }, + { + "epoch": 0.43, + "grad_norm": 0.09279760165281023, + "learning_rate": 5e-05, + "loss": 1.2397, + "step": 306 + }, + { + "epoch": 0.43, + "grad_norm": 0.09814505877494192, + "learning_rate": 5e-05, + "loss": 1.2174, + "step": 307 + }, + { + "epoch": 0.43, + "grad_norm": 0.0990096436993457, + "learning_rate": 5e-05, + "loss": 1.2312, + "step": 308 + }, + { + "epoch": 0.43, + "grad_norm": 0.0829754840596578, + "learning_rate": 5e-05, + "loss": 1.1931, + "step": 309 + }, + { + "epoch": 0.44, + "grad_norm": 0.39215539694720825, + "learning_rate": 5e-05, + "loss": 1.2053, + "step": 310 + }, + { + "epoch": 0.44, + "grad_norm": 0.09334861604699533, + "learning_rate": 5e-05, + "loss": 1.2118, + "step": 311 + }, + { + "epoch": 0.44, + "grad_norm": 0.09386393833378456, + "learning_rate": 5e-05, + "loss": 1.1864, + "step": 312 + }, + { + "epoch": 0.44, + "grad_norm": 0.09608657479012335, + "learning_rate": 5e-05, + "loss": 1.21, + "step": 313 + }, + { + "epoch": 0.44, + "grad_norm": 0.0889092637084613, + "learning_rate": 5e-05, + "loss": 1.2773, + "step": 314 + }, + { + "epoch": 0.44, + "grad_norm": 0.08852000900017744, + "learning_rate": 5e-05, + "loss": 1.2122, + "step": 315 + }, + { + "epoch": 0.44, + "grad_norm": 0.0906176855815127, + "learning_rate": 5e-05, + "loss": 1.2912, + "step": 316 + }, + { + "epoch": 0.45, + "grad_norm": 0.13550666694900487, + "learning_rate": 5e-05, + "loss": 1.1409, + "step": 317 + }, + { + "epoch": 0.45, + "grad_norm": 0.20633475665889736, + "learning_rate": 5e-05, + "loss": 1.234, + "step": 318 + }, + { + "epoch": 0.45, + "grad_norm": 0.09273369896241038, + "learning_rate": 5e-05, + "loss": 1.2523, + "step": 319 + }, + { + "epoch": 0.45, + "grad_norm": 0.09021179515999266, + "learning_rate": 5e-05, + "loss": 1.1997, + "step": 320 + }, + { + "epoch": 0.45, + "grad_norm": 0.11573990924013423, + "learning_rate": 5e-05, + "loss": 1.2277, + "step": 321 + }, + { + "epoch": 0.45, + "grad_norm": 0.11133888827055353, + "learning_rate": 5e-05, + "loss": 1.2349, + "step": 322 + }, + { + "epoch": 0.45, + "grad_norm": 0.09036884546944374, + "learning_rate": 5e-05, + "loss": 1.2692, + "step": 323 + }, + { + "epoch": 0.46, + "grad_norm": 0.09273460183617792, + "learning_rate": 5e-05, + "loss": 1.2262, + "step": 324 + }, + { + "epoch": 0.46, + 
"grad_norm": 0.0897203674321421, + "learning_rate": 5e-05, + "loss": 1.2686, + "step": 325 + }, + { + "epoch": 0.46, + "grad_norm": 0.08586164598598854, + "learning_rate": 5e-05, + "loss": 1.2992, + "step": 326 + }, + { + "epoch": 0.46, + "grad_norm": 0.0876429588090545, + "learning_rate": 5e-05, + "loss": 1.1948, + "step": 327 + }, + { + "epoch": 0.46, + "grad_norm": 0.08454254473801266, + "learning_rate": 5e-05, + "loss": 1.2082, + "step": 328 + }, + { + "epoch": 0.46, + "grad_norm": 0.10205281527688667, + "learning_rate": 5e-05, + "loss": 1.247, + "step": 329 + }, + { + "epoch": 0.46, + "grad_norm": 0.1241707532303333, + "learning_rate": 5e-05, + "loss": 1.2656, + "step": 330 + }, + { + "epoch": 0.46, + "grad_norm": 0.09014618495881649, + "learning_rate": 5e-05, + "loss": 1.3096, + "step": 331 + }, + { + "epoch": 0.47, + "grad_norm": 0.09124011462605543, + "learning_rate": 5e-05, + "loss": 1.2806, + "step": 332 + }, + { + "epoch": 0.47, + "grad_norm": 0.09168935573514018, + "learning_rate": 5e-05, + "loss": 1.2553, + "step": 333 + }, + { + "epoch": 0.47, + "grad_norm": 0.08620339663683575, + "learning_rate": 5e-05, + "loss": 1.1868, + "step": 334 + }, + { + "epoch": 0.47, + "grad_norm": 0.08924777077687464, + "learning_rate": 5e-05, + "loss": 1.2245, + "step": 335 + }, + { + "epoch": 0.47, + "grad_norm": 0.10178521535423679, + "learning_rate": 5e-05, + "loss": 1.3737, + "step": 336 + }, + { + "epoch": 0.47, + "grad_norm": 0.09398893654170977, + "learning_rate": 5e-05, + "loss": 1.2885, + "step": 337 + }, + { + "epoch": 0.47, + "grad_norm": 0.08729984124548136, + "learning_rate": 5e-05, + "loss": 1.2033, + "step": 338 + }, + { + "epoch": 0.48, + "grad_norm": 0.08649677383098985, + "learning_rate": 5e-05, + "loss": 1.3026, + "step": 339 + }, + { + "epoch": 0.48, + "grad_norm": 0.09231926812728508, + "learning_rate": 5e-05, + "loss": 1.2129, + "step": 340 + }, + { + "epoch": 0.48, + "grad_norm": 0.09491184898321346, + "learning_rate": 5e-05, + "loss": 1.2036, + "step": 341 + }, + { + "epoch": 0.48, + "grad_norm": 0.0878659924898569, + "learning_rate": 5e-05, + "loss": 1.3329, + "step": 342 + }, + { + "epoch": 0.48, + "grad_norm": 0.09791214775489751, + "learning_rate": 5e-05, + "loss": 1.2281, + "step": 343 + }, + { + "epoch": 0.48, + "grad_norm": 0.08689911635107521, + "learning_rate": 5e-05, + "loss": 1.1929, + "step": 344 + }, + { + "epoch": 0.48, + "grad_norm": 0.1525002969052707, + "learning_rate": 5e-05, + "loss": 1.181, + "step": 345 + }, + { + "epoch": 0.49, + "grad_norm": 0.08497046240149246, + "learning_rate": 5e-05, + "loss": 1.1695, + "step": 346 + }, + { + "epoch": 0.49, + "grad_norm": 0.09489748293122932, + "learning_rate": 5e-05, + "loss": 1.239, + "step": 347 + }, + { + "epoch": 0.49, + "grad_norm": 0.08258793688853842, + "learning_rate": 5e-05, + "loss": 1.2295, + "step": 348 + }, + { + "epoch": 0.49, + "grad_norm": 0.08796622630175002, + "learning_rate": 5e-05, + "loss": 1.2632, + "step": 349 + }, + { + "epoch": 0.49, + "grad_norm": 0.08831850516260249, + "learning_rate": 5e-05, + "loss": 1.2755, + "step": 350 + }, + { + "epoch": 0.49, + "grad_norm": 0.0910930028581069, + "learning_rate": 5e-05, + "loss": 1.292, + "step": 351 + }, + { + "epoch": 0.49, + "grad_norm": 0.11500036507032467, + "learning_rate": 5e-05, + "loss": 1.3629, + "step": 352 + }, + { + "epoch": 0.5, + "grad_norm": 0.11617820145471644, + "learning_rate": 5e-05, + "loss": 1.2841, + "step": 353 + }, + { + "epoch": 0.5, + "grad_norm": 0.08536448061894295, + "learning_rate": 5e-05, + "loss": 1.2707, + 
"step": 354 + }, + { + "epoch": 0.5, + "grad_norm": 0.08905101614762531, + "learning_rate": 5e-05, + "loss": 1.2845, + "step": 355 + }, + { + "epoch": 0.5, + "grad_norm": 0.09715136566493858, + "learning_rate": 5e-05, + "loss": 1.1882, + "step": 356 + }, + { + "epoch": 0.5, + "grad_norm": 0.08700238865941963, + "learning_rate": 5e-05, + "loss": 1.218, + "step": 357 + }, + { + "epoch": 0.5, + "grad_norm": 0.08471677447492601, + "learning_rate": 5e-05, + "loss": 1.267, + "step": 358 + }, + { + "epoch": 0.5, + "grad_norm": 0.10297351790993844, + "learning_rate": 5e-05, + "loss": 1.2394, + "step": 359 + }, + { + "epoch": 0.51, + "grad_norm": 0.09972387244753816, + "learning_rate": 5e-05, + "loss": 1.2333, + "step": 360 + }, + { + "epoch": 0.51, + "grad_norm": 0.09585641462137544, + "learning_rate": 5e-05, + "loss": 1.1838, + "step": 361 + }, + { + "epoch": 0.51, + "grad_norm": 0.09222648513487403, + "learning_rate": 5e-05, + "loss": 1.1864, + "step": 362 + }, + { + "epoch": 0.51, + "grad_norm": 0.10905085748487696, + "learning_rate": 5e-05, + "loss": 1.2761, + "step": 363 + }, + { + "epoch": 0.51, + "grad_norm": 0.0870480307132673, + "learning_rate": 5e-05, + "loss": 1.1832, + "step": 364 + }, + { + "epoch": 0.51, + "grad_norm": 0.09611971365786509, + "learning_rate": 5e-05, + "loss": 1.2055, + "step": 365 + }, + { + "epoch": 0.51, + "grad_norm": 0.08708476499588201, + "learning_rate": 5e-05, + "loss": 1.2615, + "step": 366 + }, + { + "epoch": 0.52, + "grad_norm": 0.08627345445970362, + "learning_rate": 5e-05, + "loss": 1.2535, + "step": 367 + }, + { + "epoch": 0.52, + "grad_norm": 0.15348060051465828, + "learning_rate": 5e-05, + "loss": 1.2564, + "step": 368 + }, + { + "epoch": 0.52, + "grad_norm": 0.08297376468665693, + "learning_rate": 5e-05, + "loss": 1.2254, + "step": 369 + }, + { + "epoch": 0.52, + "grad_norm": 0.09508719239018505, + "learning_rate": 5e-05, + "loss": 1.2934, + "step": 370 + }, + { + "epoch": 0.52, + "grad_norm": 0.12194038273230035, + "learning_rate": 5e-05, + "loss": 1.2128, + "step": 371 + }, + { + "epoch": 0.52, + "grad_norm": 0.11423944805152593, + "learning_rate": 5e-05, + "loss": 1.2313, + "step": 372 + }, + { + "epoch": 0.52, + "grad_norm": 0.09659870039819161, + "learning_rate": 5e-05, + "loss": 1.3193, + "step": 373 + }, + { + "epoch": 0.53, + "grad_norm": 0.10532303440785686, + "learning_rate": 5e-05, + "loss": 1.1658, + "step": 374 + }, + { + "epoch": 0.53, + "grad_norm": 0.09396037558006054, + "learning_rate": 5e-05, + "loss": 1.3109, + "step": 375 + }, + { + "epoch": 0.53, + "grad_norm": 0.08646485393289115, + "learning_rate": 5e-05, + "loss": 1.234, + "step": 376 + }, + { + "epoch": 0.53, + "grad_norm": 0.6808951630234666, + "learning_rate": 5e-05, + "loss": 1.2478, + "step": 377 + }, + { + "epoch": 0.53, + "grad_norm": 0.09923824159858374, + "learning_rate": 5e-05, + "loss": 1.2106, + "step": 378 + }, + { + "epoch": 0.53, + "grad_norm": 0.09179741428225503, + "learning_rate": 5e-05, + "loss": 1.2459, + "step": 379 + }, + { + "epoch": 0.53, + "grad_norm": 0.08445702707658428, + "learning_rate": 5e-05, + "loss": 1.2344, + "step": 380 + }, + { + "epoch": 0.54, + "grad_norm": 0.09014028580273542, + "learning_rate": 5e-05, + "loss": 1.2846, + "step": 381 + }, + { + "epoch": 0.54, + "grad_norm": 0.09160934500793338, + "learning_rate": 5e-05, + "loss": 1.2917, + "step": 382 + }, + { + "epoch": 0.54, + "grad_norm": 0.08807606894331109, + "learning_rate": 5e-05, + "loss": 1.2065, + "step": 383 + }, + { + "epoch": 0.54, + "grad_norm": 0.274390352758179, + 
"learning_rate": 5e-05, + "loss": 1.2391, + "step": 384 + }, + { + "epoch": 0.54, + "grad_norm": 0.09427427568172006, + "learning_rate": 5e-05, + "loss": 1.2742, + "step": 385 + }, + { + "epoch": 0.54, + "grad_norm": 0.08959931932453016, + "learning_rate": 5e-05, + "loss": 1.156, + "step": 386 + }, + { + "epoch": 0.54, + "grad_norm": 0.09111988378067222, + "learning_rate": 5e-05, + "loss": 1.263, + "step": 387 + }, + { + "epoch": 0.54, + "grad_norm": 0.09200529039851428, + "learning_rate": 5e-05, + "loss": 1.262, + "step": 388 + }, + { + "epoch": 0.55, + "grad_norm": 0.10735809551883739, + "learning_rate": 5e-05, + "loss": 1.1889, + "step": 389 + }, + { + "epoch": 0.55, + "grad_norm": 0.0949718786848268, + "learning_rate": 5e-05, + "loss": 1.2839, + "step": 390 + }, + { + "epoch": 0.55, + "grad_norm": 0.0961142978717631, + "learning_rate": 5e-05, + "loss": 1.1699, + "step": 391 + }, + { + "epoch": 0.55, + "grad_norm": 0.09605304671582335, + "learning_rate": 5e-05, + "loss": 1.2185, + "step": 392 + }, + { + "epoch": 0.55, + "grad_norm": 0.09513429121879642, + "learning_rate": 5e-05, + "loss": 1.145, + "step": 393 + }, + { + "epoch": 0.55, + "grad_norm": 0.12298501280152162, + "learning_rate": 5e-05, + "loss": 1.2845, + "step": 394 + }, + { + "epoch": 0.55, + "grad_norm": 0.09036302179963046, + "learning_rate": 5e-05, + "loss": 1.2965, + "step": 395 + }, + { + "epoch": 0.56, + "grad_norm": 0.09264349991284292, + "learning_rate": 5e-05, + "loss": 1.2258, + "step": 396 + }, + { + "epoch": 0.56, + "grad_norm": 0.08976172362633462, + "learning_rate": 5e-05, + "loss": 1.1609, + "step": 397 + }, + { + "epoch": 0.56, + "grad_norm": 0.08993401888120962, + "learning_rate": 5e-05, + "loss": 1.132, + "step": 398 + }, + { + "epoch": 0.56, + "grad_norm": 0.1062719885812111, + "learning_rate": 5e-05, + "loss": 1.2533, + "step": 399 + }, + { + "epoch": 0.56, + "grad_norm": 0.09830204964453033, + "learning_rate": 5e-05, + "loss": 1.2583, + "step": 400 + }, + { + "epoch": 0.56, + "grad_norm": 0.23416064668117736, + "learning_rate": 5e-05, + "loss": 1.2447, + "step": 401 + }, + { + "epoch": 0.56, + "grad_norm": 0.12406687473928911, + "learning_rate": 5e-05, + "loss": 1.175, + "step": 402 + }, + { + "epoch": 0.57, + "grad_norm": 0.09451579111057025, + "learning_rate": 5e-05, + "loss": 1.2168, + "step": 403 + }, + { + "epoch": 0.57, + "grad_norm": 0.10898561049412687, + "learning_rate": 5e-05, + "loss": 1.1535, + "step": 404 + }, + { + "epoch": 0.57, + "grad_norm": 0.0925113621388591, + "learning_rate": 5e-05, + "loss": 1.1484, + "step": 405 + }, + { + "epoch": 0.57, + "grad_norm": 0.15223288687881376, + "learning_rate": 5e-05, + "loss": 1.1294, + "step": 406 + }, + { + "epoch": 0.57, + "grad_norm": 0.09556984444055779, + "learning_rate": 5e-05, + "loss": 1.2794, + "step": 407 + }, + { + "epoch": 0.57, + "grad_norm": 0.09785022213423893, + "learning_rate": 5e-05, + "loss": 1.176, + "step": 408 + }, + { + "epoch": 0.57, + "grad_norm": 0.10533900637109996, + "learning_rate": 5e-05, + "loss": 1.2352, + "step": 409 + }, + { + "epoch": 0.58, + "grad_norm": 0.10272929322025146, + "learning_rate": 5e-05, + "loss": 1.2611, + "step": 410 + }, + { + "epoch": 0.58, + "grad_norm": 0.10061997728558097, + "learning_rate": 5e-05, + "loss": 1.1915, + "step": 411 + }, + { + "epoch": 0.58, + "grad_norm": 0.10088267664927592, + "learning_rate": 5e-05, + "loss": 1.2349, + "step": 412 + }, + { + "epoch": 0.58, + "grad_norm": 0.09374930987913425, + "learning_rate": 5e-05, + "loss": 1.185, + "step": 413 + }, + { + "epoch": 0.58, + 
"grad_norm": 0.11521303955050079, + "learning_rate": 5e-05, + "loss": 1.2758, + "step": 414 + }, + { + "epoch": 0.58, + "grad_norm": 0.10178700476680842, + "learning_rate": 5e-05, + "loss": 1.2672, + "step": 415 + }, + { + "epoch": 0.58, + "grad_norm": 0.09804258149927716, + "learning_rate": 5e-05, + "loss": 1.1785, + "step": 416 + }, + { + "epoch": 0.59, + "grad_norm": 0.1052279442753329, + "learning_rate": 5e-05, + "loss": 1.2223, + "step": 417 + }, + { + "epoch": 0.59, + "grad_norm": 0.18317490027472128, + "learning_rate": 5e-05, + "loss": 1.2159, + "step": 418 + }, + { + "epoch": 0.59, + "grad_norm": 0.08991206745450332, + "learning_rate": 5e-05, + "loss": 1.2249, + "step": 419 + }, + { + "epoch": 0.59, + "grad_norm": 0.10437348818281557, + "learning_rate": 5e-05, + "loss": 1.2469, + "step": 420 + }, + { + "epoch": 0.59, + "grad_norm": 0.12639935743830863, + "learning_rate": 5e-05, + "loss": 1.1964, + "step": 421 + }, + { + "epoch": 0.59, + "grad_norm": 0.17550772921542432, + "learning_rate": 5e-05, + "loss": 1.2546, + "step": 422 + }, + { + "epoch": 0.59, + "grad_norm": 0.10916765791086883, + "learning_rate": 5e-05, + "loss": 1.287, + "step": 423 + }, + { + "epoch": 0.6, + "grad_norm": 0.11195095125404325, + "learning_rate": 5e-05, + "loss": 1.1399, + "step": 424 + }, + { + "epoch": 0.6, + "grad_norm": 0.10464162033679592, + "learning_rate": 5e-05, + "loss": 1.252, + "step": 425 + }, + { + "epoch": 0.6, + "grad_norm": 0.1422007878111932, + "learning_rate": 5e-05, + "loss": 1.3093, + "step": 426 + }, + { + "epoch": 0.6, + "grad_norm": 0.09646644642726107, + "learning_rate": 5e-05, + "loss": 1.1038, + "step": 427 + }, + { + "epoch": 0.6, + "grad_norm": 0.11969252954059995, + "learning_rate": 5e-05, + "loss": 1.3556, + "step": 428 + }, + { + "epoch": 0.6, + "grad_norm": 0.14302305454050254, + "learning_rate": 5e-05, + "loss": 1.253, + "step": 429 + }, + { + "epoch": 0.6, + "grad_norm": 0.1016643997546001, + "learning_rate": 5e-05, + "loss": 1.2539, + "step": 430 + }, + { + "epoch": 0.61, + "grad_norm": 0.09740846997917411, + "learning_rate": 5e-05, + "loss": 1.1299, + "step": 431 + }, + { + "epoch": 0.61, + "grad_norm": 0.11960995656060532, + "learning_rate": 5e-05, + "loss": 1.2244, + "step": 432 + }, + { + "epoch": 0.61, + "grad_norm": 0.13189794138210517, + "learning_rate": 5e-05, + "loss": 1.1457, + "step": 433 + }, + { + "epoch": 0.61, + "grad_norm": 0.10424668133969454, + "learning_rate": 5e-05, + "loss": 1.1982, + "step": 434 + }, + { + "epoch": 0.61, + "grad_norm": 0.0981321081963098, + "learning_rate": 5e-05, + "loss": 1.2044, + "step": 435 + }, + { + "epoch": 0.61, + "grad_norm": 0.09649032177723192, + "learning_rate": 5e-05, + "loss": 1.3012, + "step": 436 + }, + { + "epoch": 0.61, + "grad_norm": 0.10150706324371488, + "learning_rate": 5e-05, + "loss": 1.3317, + "step": 437 + }, + { + "epoch": 0.62, + "grad_norm": 0.10160318429416466, + "learning_rate": 5e-05, + "loss": 1.2627, + "step": 438 + }, + { + "epoch": 0.62, + "grad_norm": 0.0890625144677584, + "learning_rate": 5e-05, + "loss": 1.1692, + "step": 439 + }, + { + "epoch": 0.62, + "grad_norm": 0.11171587166184754, + "learning_rate": 5e-05, + "loss": 1.2245, + "step": 440 + }, + { + "epoch": 0.62, + "grad_norm": 0.13847591569097098, + "learning_rate": 5e-05, + "loss": 1.2836, + "step": 441 + }, + { + "epoch": 0.62, + "grad_norm": 0.09529199853803245, + "learning_rate": 5e-05, + "loss": 1.1555, + "step": 442 + }, + { + "epoch": 0.62, + "grad_norm": 0.09678894591057594, + "learning_rate": 5e-05, + "loss": 1.1751, + "step": 
443 + }, + { + "epoch": 0.62, + "grad_norm": 0.11432300921186853, + "learning_rate": 5e-05, + "loss": 1.1568, + "step": 444 + }, + { + "epoch": 0.62, + "grad_norm": 0.08495476342246677, + "learning_rate": 5e-05, + "loss": 1.3804, + "step": 445 + }, + { + "epoch": 0.63, + "grad_norm": 0.09547703516468427, + "learning_rate": 5e-05, + "loss": 1.2004, + "step": 446 + }, + { + "epoch": 0.63, + "grad_norm": 0.10241446863877374, + "learning_rate": 5e-05, + "loss": 1.2716, + "step": 447 + }, + { + "epoch": 0.63, + "grad_norm": 0.10552735100687363, + "learning_rate": 5e-05, + "loss": 1.2046, + "step": 448 + }, + { + "epoch": 0.63, + "grad_norm": 0.08432451153889137, + "learning_rate": 5e-05, + "loss": 1.2623, + "step": 449 + }, + { + "epoch": 0.63, + "grad_norm": 0.09588923581765316, + "learning_rate": 5e-05, + "loss": 1.2843, + "step": 450 + }, + { + "epoch": 0.63, + "grad_norm": 0.09654932304570306, + "learning_rate": 5e-05, + "loss": 1.2058, + "step": 451 + }, + { + "epoch": 0.63, + "grad_norm": 0.09891455377934905, + "learning_rate": 5e-05, + "loss": 1.3826, + "step": 452 + }, + { + "epoch": 0.64, + "grad_norm": 0.0839148273966323, + "learning_rate": 5e-05, + "loss": 1.1561, + "step": 453 + }, + { + "epoch": 0.64, + "grad_norm": 0.09134099897491745, + "learning_rate": 5e-05, + "loss": 1.2411, + "step": 454 + }, + { + "epoch": 0.64, + "grad_norm": 0.11136514392856785, + "learning_rate": 5e-05, + "loss": 1.3289, + "step": 455 + }, + { + "epoch": 0.64, + "grad_norm": 0.09395579557584285, + "learning_rate": 5e-05, + "loss": 1.242, + "step": 456 + }, + { + "epoch": 0.64, + "grad_norm": 0.08882218477068878, + "learning_rate": 5e-05, + "loss": 1.2912, + "step": 457 + }, + { + "epoch": 0.64, + "grad_norm": 0.09720463330678311, + "learning_rate": 5e-05, + "loss": 1.164, + "step": 458 + }, + { + "epoch": 0.64, + "grad_norm": 0.09536872864789071, + "learning_rate": 5e-05, + "loss": 1.3001, + "step": 459 + }, + { + "epoch": 0.65, + "grad_norm": 0.10461146139466083, + "learning_rate": 5e-05, + "loss": 1.2665, + "step": 460 + }, + { + "epoch": 0.65, + "grad_norm": 0.09655182371714942, + "learning_rate": 5e-05, + "loss": 1.3421, + "step": 461 + }, + { + "epoch": 0.65, + "grad_norm": 0.08707150535037736, + "learning_rate": 5e-05, + "loss": 1.1887, + "step": 462 + }, + { + "epoch": 0.65, + "grad_norm": 0.08649148684083302, + "learning_rate": 5e-05, + "loss": 1.2537, + "step": 463 + }, + { + "epoch": 0.65, + "grad_norm": 0.08864329898350386, + "learning_rate": 5e-05, + "loss": 1.2289, + "step": 464 + }, + { + "epoch": 0.65, + "grad_norm": 0.08237439158374964, + "learning_rate": 5e-05, + "loss": 1.3317, + "step": 465 + }, + { + "epoch": 0.65, + "grad_norm": 0.08672733104514327, + "learning_rate": 5e-05, + "loss": 1.2911, + "step": 466 + }, + { + "epoch": 0.66, + "grad_norm": 0.0827910775243905, + "learning_rate": 5e-05, + "loss": 1.1791, + "step": 467 + }, + { + "epoch": 0.66, + "grad_norm": 0.1106203002142719, + "learning_rate": 5e-05, + "loss": 1.2744, + "step": 468 + }, + { + "epoch": 0.66, + "grad_norm": 0.0901945367612223, + "learning_rate": 5e-05, + "loss": 1.3298, + "step": 469 + }, + { + "epoch": 0.66, + "grad_norm": 0.09230318467875029, + "learning_rate": 5e-05, + "loss": 1.2271, + "step": 470 + }, + { + "epoch": 0.66, + "grad_norm": 0.09521443609898626, + "learning_rate": 5e-05, + "loss": 1.1468, + "step": 471 + }, + { + "epoch": 0.66, + "grad_norm": 0.0942535507748726, + "learning_rate": 5e-05, + "loss": 1.1731, + "step": 472 + }, + { + "epoch": 0.66, + "grad_norm": 0.09201271955453857, + 
"learning_rate": 5e-05, + "loss": 1.2212, + "step": 473 + }, + { + "epoch": 0.67, + "grad_norm": 0.08128850455351448, + "learning_rate": 5e-05, + "loss": 1.2136, + "step": 474 + }, + { + "epoch": 0.67, + "grad_norm": 0.10620042740932711, + "learning_rate": 5e-05, + "loss": 1.1852, + "step": 475 + }, + { + "epoch": 0.67, + "grad_norm": 0.0915936719583377, + "learning_rate": 5e-05, + "loss": 1.268, + "step": 476 + }, + { + "epoch": 0.67, + "grad_norm": 0.09057021272981364, + "learning_rate": 5e-05, + "loss": 1.1531, + "step": 477 + }, + { + "epoch": 0.67, + "grad_norm": 0.0883804485429764, + "learning_rate": 5e-05, + "loss": 1.3008, + "step": 478 + }, + { + "epoch": 0.67, + "grad_norm": 0.09701027219637508, + "learning_rate": 5e-05, + "loss": 1.3001, + "step": 479 + }, + { + "epoch": 0.67, + "grad_norm": 0.0992623444273281, + "learning_rate": 5e-05, + "loss": 1.3357, + "step": 480 + }, + { + "epoch": 0.68, + "grad_norm": 0.0928861673772843, + "learning_rate": 5e-05, + "loss": 1.1852, + "step": 481 + }, + { + "epoch": 0.68, + "grad_norm": 0.12092322563789983, + "learning_rate": 5e-05, + "loss": 1.2453, + "step": 482 + }, + { + "epoch": 0.68, + "grad_norm": 0.09600770042201742, + "learning_rate": 5e-05, + "loss": 1.255, + "step": 483 + }, + { + "epoch": 0.68, + "grad_norm": 0.11193934781672676, + "learning_rate": 5e-05, + "loss": 1.2766, + "step": 484 + }, + { + "epoch": 0.68, + "grad_norm": 0.09242666432521533, + "learning_rate": 5e-05, + "loss": 1.2101, + "step": 485 + }, + { + "epoch": 0.68, + "grad_norm": 0.09568248927731744, + "learning_rate": 5e-05, + "loss": 1.0978, + "step": 486 + }, + { + "epoch": 0.68, + "grad_norm": 0.08582432543883815, + "learning_rate": 5e-05, + "loss": 1.1554, + "step": 487 + }, + { + "epoch": 0.69, + "grad_norm": 0.09485167361850859, + "learning_rate": 5e-05, + "loss": 1.2416, + "step": 488 + }, + { + "epoch": 0.69, + "grad_norm": 0.09022911381360664, + "learning_rate": 5e-05, + "loss": 1.1792, + "step": 489 + }, + { + "epoch": 0.69, + "grad_norm": 0.10119753928947335, + "learning_rate": 5e-05, + "loss": 1.2419, + "step": 490 + }, + { + "epoch": 0.69, + "grad_norm": 0.09273871341221016, + "learning_rate": 5e-05, + "loss": 1.2451, + "step": 491 + }, + { + "epoch": 0.69, + "grad_norm": 0.12935815409522153, + "learning_rate": 5e-05, + "loss": 1.2029, + "step": 492 + }, + { + "epoch": 0.69, + "grad_norm": 0.08332668684798196, + "learning_rate": 5e-05, + "loss": 1.2665, + "step": 493 + }, + { + "epoch": 0.69, + "grad_norm": 0.09306656364478559, + "learning_rate": 5e-05, + "loss": 1.1738, + "step": 494 + }, + { + "epoch": 0.7, + "grad_norm": 0.0969375533472487, + "learning_rate": 5e-05, + "loss": 1.3214, + "step": 495 + }, + { + "epoch": 0.7, + "grad_norm": 0.08338803886851001, + "learning_rate": 5e-05, + "loss": 1.2015, + "step": 496 + }, + { + "epoch": 0.7, + "grad_norm": 0.09456316110967951, + "learning_rate": 5e-05, + "loss": 1.1432, + "step": 497 + }, + { + "epoch": 0.7, + "grad_norm": 0.08693678933618375, + "learning_rate": 5e-05, + "loss": 1.1972, + "step": 498 + }, + { + "epoch": 0.7, + "grad_norm": 0.09780253129721403, + "learning_rate": 5e-05, + "loss": 1.0717, + "step": 499 + }, + { + "epoch": 0.7, + "grad_norm": 0.10569441197996789, + "learning_rate": 5e-05, + "loss": 1.2729, + "step": 500 + }, + { + "epoch": 0.7, + "grad_norm": 0.10286492143515646, + "learning_rate": 5e-05, + "loss": 1.234, + "step": 501 + }, + { + "epoch": 0.71, + "grad_norm": 0.0910185735041866, + "learning_rate": 5e-05, + "loss": 1.1907, + "step": 502 + }, + { + "epoch": 0.71, + 
"grad_norm": 0.09275664617617299, + "learning_rate": 5e-05, + "loss": 1.2794, + "step": 503 + }, + { + "epoch": 0.71, + "grad_norm": 0.09128487009140039, + "learning_rate": 5e-05, + "loss": 1.2783, + "step": 504 + }, + { + "epoch": 0.71, + "grad_norm": 0.09904955332557805, + "learning_rate": 5e-05, + "loss": 1.1796, + "step": 505 + }, + { + "epoch": 0.71, + "grad_norm": 0.08925947605079272, + "learning_rate": 5e-05, + "loss": 1.2086, + "step": 506 + }, + { + "epoch": 0.71, + "grad_norm": 0.22704931044503435, + "learning_rate": 5e-05, + "loss": 1.3014, + "step": 507 + }, + { + "epoch": 0.71, + "grad_norm": 0.09220561130196987, + "learning_rate": 5e-05, + "loss": 1.1986, + "step": 508 + }, + { + "epoch": 0.71, + "grad_norm": 0.10748878494268495, + "learning_rate": 5e-05, + "loss": 1.2742, + "step": 509 + }, + { + "epoch": 0.72, + "grad_norm": 0.09099677912935496, + "learning_rate": 5e-05, + "loss": 1.1609, + "step": 510 + }, + { + "epoch": 0.72, + "grad_norm": 0.09169039515261505, + "learning_rate": 5e-05, + "loss": 1.2536, + "step": 511 + }, + { + "epoch": 0.72, + "grad_norm": 0.08524246790215907, + "learning_rate": 5e-05, + "loss": 1.206, + "step": 512 + }, + { + "epoch": 0.72, + "grad_norm": 0.17719467129480335, + "learning_rate": 5e-05, + "loss": 1.2534, + "step": 513 + }, + { + "epoch": 0.72, + "grad_norm": 0.09519313583555802, + "learning_rate": 5e-05, + "loss": 1.2505, + "step": 514 + }, + { + "epoch": 0.72, + "grad_norm": 0.09352280046231452, + "learning_rate": 5e-05, + "loss": 1.1092, + "step": 515 + }, + { + "epoch": 0.72, + "grad_norm": 0.0951263810551215, + "learning_rate": 5e-05, + "loss": 1.2591, + "step": 516 + }, + { + "epoch": 0.73, + "grad_norm": 0.09838751970931506, + "learning_rate": 5e-05, + "loss": 1.2617, + "step": 517 + }, + { + "epoch": 0.73, + "grad_norm": 0.0948229297871239, + "learning_rate": 5e-05, + "loss": 1.2743, + "step": 518 + }, + { + "epoch": 0.73, + "grad_norm": 0.0945440215643022, + "learning_rate": 5e-05, + "loss": 1.331, + "step": 519 + }, + { + "epoch": 0.73, + "grad_norm": 0.11637647970404298, + "learning_rate": 5e-05, + "loss": 1.2419, + "step": 520 + }, + { + "epoch": 0.73, + "grad_norm": 0.08932495455100235, + "learning_rate": 5e-05, + "loss": 1.2856, + "step": 521 + }, + { + "epoch": 0.73, + "grad_norm": 0.08908148234874731, + "learning_rate": 5e-05, + "loss": 1.2732, + "step": 522 + }, + { + "epoch": 0.73, + "grad_norm": 0.09684259375493993, + "learning_rate": 5e-05, + "loss": 1.2399, + "step": 523 + }, + { + "epoch": 0.74, + "grad_norm": 0.09624041473631541, + "learning_rate": 5e-05, + "loss": 1.2203, + "step": 524 + }, + { + "epoch": 0.74, + "grad_norm": 0.09789307170040781, + "learning_rate": 5e-05, + "loss": 1.2056, + "step": 525 + }, + { + "epoch": 0.74, + "grad_norm": 0.09270663219504559, + "learning_rate": 5e-05, + "loss": 1.2565, + "step": 526 + }, + { + "epoch": 0.74, + "grad_norm": 0.10274518440927816, + "learning_rate": 5e-05, + "loss": 1.2024, + "step": 527 + }, + { + "epoch": 0.74, + "grad_norm": 0.08963594413715927, + "learning_rate": 5e-05, + "loss": 1.3096, + "step": 528 + }, + { + "epoch": 0.74, + "grad_norm": 0.09075285786227485, + "learning_rate": 5e-05, + "loss": 1.13, + "step": 529 + }, + { + "epoch": 0.74, + "grad_norm": 0.0938384051794185, + "learning_rate": 5e-05, + "loss": 1.2874, + "step": 530 + }, + { + "epoch": 0.75, + "grad_norm": 0.08974315394544331, + "learning_rate": 5e-05, + "loss": 1.1766, + "step": 531 + }, + { + "epoch": 0.75, + "grad_norm": 0.10208489459703247, + "learning_rate": 5e-05, + "loss": 1.3835, + 
"step": 532 + }, + { + "epoch": 0.75, + "grad_norm": 0.11970068038104684, + "learning_rate": 5e-05, + "loss": 1.2526, + "step": 533 + }, + { + "epoch": 0.75, + "grad_norm": 0.09175875498268529, + "learning_rate": 5e-05, + "loss": 1.2134, + "step": 534 + }, + { + "epoch": 0.75, + "grad_norm": 0.16304765015201575, + "learning_rate": 5e-05, + "loss": 1.1869, + "step": 535 + }, + { + "epoch": 0.75, + "grad_norm": 0.0934152696898247, + "learning_rate": 5e-05, + "loss": 1.1683, + "step": 536 + }, + { + "epoch": 0.75, + "grad_norm": 0.0958139159703967, + "learning_rate": 5e-05, + "loss": 1.242, + "step": 537 + }, + { + "epoch": 0.76, + "grad_norm": 0.08766778926036489, + "learning_rate": 5e-05, + "loss": 1.2168, + "step": 538 + }, + { + "epoch": 0.76, + "grad_norm": 0.08617009411449289, + "learning_rate": 5e-05, + "loss": 1.1738, + "step": 539 + }, + { + "epoch": 0.76, + "grad_norm": 0.09322306508154198, + "learning_rate": 5e-05, + "loss": 1.2388, + "step": 540 + }, + { + "epoch": 0.76, + "grad_norm": 0.08797703950745679, + "learning_rate": 5e-05, + "loss": 1.2238, + "step": 541 + }, + { + "epoch": 0.76, + "grad_norm": 0.21576889587028528, + "learning_rate": 5e-05, + "loss": 1.3287, + "step": 542 + }, + { + "epoch": 0.76, + "grad_norm": 0.10736353355137965, + "learning_rate": 5e-05, + "loss": 1.1682, + "step": 543 + }, + { + "epoch": 0.76, + "grad_norm": 0.10113374100373243, + "learning_rate": 5e-05, + "loss": 1.3353, + "step": 544 + }, + { + "epoch": 0.77, + "grad_norm": 0.09712275336017097, + "learning_rate": 5e-05, + "loss": 1.2325, + "step": 545 + }, + { + "epoch": 0.77, + "grad_norm": 0.09476641064653353, + "learning_rate": 5e-05, + "loss": 1.2156, + "step": 546 + }, + { + "epoch": 0.77, + "grad_norm": 0.0965186360988305, + "learning_rate": 5e-05, + "loss": 1.2718, + "step": 547 + }, + { + "epoch": 0.77, + "grad_norm": 0.09150452088005474, + "learning_rate": 5e-05, + "loss": 1.2425, + "step": 548 + }, + { + "epoch": 0.77, + "grad_norm": 0.09151462508356896, + "learning_rate": 5e-05, + "loss": 1.2619, + "step": 549 + }, + { + "epoch": 0.77, + "grad_norm": 0.09072441382198769, + "learning_rate": 5e-05, + "loss": 1.2418, + "step": 550 + }, + { + "epoch": 0.77, + "grad_norm": 0.09455576398670734, + "learning_rate": 5e-05, + "loss": 1.2499, + "step": 551 + }, + { + "epoch": 0.78, + "grad_norm": 0.11715209715835109, + "learning_rate": 5e-05, + "loss": 1.2512, + "step": 552 + }, + { + "epoch": 0.78, + "grad_norm": 0.09015069313677429, + "learning_rate": 5e-05, + "loss": 1.2221, + "step": 553 + }, + { + "epoch": 0.78, + "grad_norm": 0.09099730119951754, + "learning_rate": 5e-05, + "loss": 1.3887, + "step": 554 + }, + { + "epoch": 0.78, + "grad_norm": 0.12008260193989902, + "learning_rate": 5e-05, + "loss": 1.2303, + "step": 555 + }, + { + "epoch": 0.78, + "grad_norm": 0.0954853601698297, + "learning_rate": 5e-05, + "loss": 1.3012, + "step": 556 + }, + { + "epoch": 0.78, + "grad_norm": 0.09033149982015323, + "learning_rate": 5e-05, + "loss": 1.198, + "step": 557 + }, + { + "epoch": 0.78, + "grad_norm": 0.08959245434326961, + "learning_rate": 5e-05, + "loss": 1.2328, + "step": 558 + }, + { + "epoch": 0.79, + "grad_norm": 0.09406315705504768, + "learning_rate": 5e-05, + "loss": 1.1844, + "step": 559 + }, + { + "epoch": 0.79, + "grad_norm": 0.09872861081261884, + "learning_rate": 5e-05, + "loss": 1.3282, + "step": 560 + }, + { + "epoch": 0.79, + "grad_norm": 0.09041718087545901, + "learning_rate": 5e-05, + "loss": 1.1816, + "step": 561 + }, + { + "epoch": 0.79, + "grad_norm": 0.09627804804863976, + 
"learning_rate": 5e-05, + "loss": 1.2859, + "step": 562 + }, + { + "epoch": 0.79, + "grad_norm": 0.09321543397429712, + "learning_rate": 5e-05, + "loss": 1.1685, + "step": 563 + }, + { + "epoch": 0.79, + "grad_norm": 0.10167814792024299, + "learning_rate": 5e-05, + "loss": 1.1443, + "step": 564 + }, + { + "epoch": 0.79, + "grad_norm": 0.08859079919809663, + "learning_rate": 5e-05, + "loss": 1.1557, + "step": 565 + }, + { + "epoch": 0.79, + "grad_norm": 0.12122065680973139, + "learning_rate": 5e-05, + "loss": 1.2822, + "step": 566 + }, + { + "epoch": 0.8, + "grad_norm": 0.1332255815827339, + "learning_rate": 5e-05, + "loss": 1.3126, + "step": 567 + }, + { + "epoch": 0.8, + "grad_norm": 0.09323771362490287, + "learning_rate": 5e-05, + "loss": 1.2537, + "step": 568 + }, + { + "epoch": 0.8, + "grad_norm": 0.09042911726751882, + "learning_rate": 5e-05, + "loss": 1.2238, + "step": 569 + }, + { + "epoch": 0.8, + "grad_norm": 0.09452288700761455, + "learning_rate": 5e-05, + "loss": 1.2769, + "step": 570 + }, + { + "epoch": 0.8, + "grad_norm": 0.11626814151913445, + "learning_rate": 5e-05, + "loss": 1.228, + "step": 571 + }, + { + "epoch": 0.8, + "grad_norm": 0.1002843815686134, + "learning_rate": 5e-05, + "loss": 1.1853, + "step": 572 + }, + { + "epoch": 0.8, + "grad_norm": 0.08672825325069114, + "learning_rate": 5e-05, + "loss": 1.2448, + "step": 573 + }, + { + "epoch": 0.81, + "grad_norm": 0.09913534884868135, + "learning_rate": 5e-05, + "loss": 1.1606, + "step": 574 + }, + { + "epoch": 0.81, + "grad_norm": 0.09593606249520897, + "learning_rate": 5e-05, + "loss": 1.2264, + "step": 575 + }, + { + "epoch": 0.81, + "grad_norm": 0.12973964748908556, + "learning_rate": 5e-05, + "loss": 1.2514, + "step": 576 + }, + { + "epoch": 0.81, + "grad_norm": 0.09657199392730052, + "learning_rate": 5e-05, + "loss": 1.2813, + "step": 577 + }, + { + "epoch": 0.81, + "grad_norm": 0.11761070678818837, + "learning_rate": 5e-05, + "loss": 1.1241, + "step": 578 + }, + { + "epoch": 0.81, + "grad_norm": 0.14490021925724572, + "learning_rate": 5e-05, + "loss": 1.1771, + "step": 579 + }, + { + "epoch": 0.81, + "grad_norm": 0.09146456504920918, + "learning_rate": 5e-05, + "loss": 1.2572, + "step": 580 + }, + { + "epoch": 0.82, + "grad_norm": 0.0932619127350976, + "learning_rate": 5e-05, + "loss": 1.2448, + "step": 581 + }, + { + "epoch": 0.82, + "grad_norm": 0.12824831041809026, + "learning_rate": 5e-05, + "loss": 1.2898, + "step": 582 + }, + { + "epoch": 0.82, + "grad_norm": 0.09178186690319479, + "learning_rate": 5e-05, + "loss": 1.2588, + "step": 583 + }, + { + "epoch": 0.82, + "grad_norm": 0.08677081631609211, + "learning_rate": 5e-05, + "loss": 1.228, + "step": 584 + }, + { + "epoch": 0.82, + "grad_norm": 0.09087600851766289, + "learning_rate": 5e-05, + "loss": 1.2739, + "step": 585 + }, + { + "epoch": 0.82, + "grad_norm": 0.08937232399362025, + "learning_rate": 5e-05, + "loss": 1.3397, + "step": 586 + }, + { + "epoch": 0.82, + "grad_norm": 0.09181498093360832, + "learning_rate": 5e-05, + "loss": 1.2337, + "step": 587 + }, + { + "epoch": 0.83, + "grad_norm": 0.08660421458268343, + "learning_rate": 5e-05, + "loss": 1.3066, + "step": 588 + }, + { + "epoch": 0.83, + "grad_norm": 0.09405547367950175, + "learning_rate": 5e-05, + "loss": 1.1865, + "step": 589 + }, + { + "epoch": 0.83, + "grad_norm": 0.10008203744489265, + "learning_rate": 5e-05, + "loss": 1.2958, + "step": 590 + }, + { + "epoch": 0.83, + "grad_norm": 0.08623872911953744, + "learning_rate": 5e-05, + "loss": 1.2459, + "step": 591 + }, + { + "epoch": 0.83, + 
"grad_norm": 0.08620542665467425, + "learning_rate": 5e-05, + "loss": 1.196, + "step": 592 + }, + { + "epoch": 0.83, + "grad_norm": 0.09154759789920437, + "learning_rate": 5e-05, + "loss": 1.2083, + "step": 593 + }, + { + "epoch": 0.83, + "grad_norm": 0.13411544381515045, + "learning_rate": 5e-05, + "loss": 1.2967, + "step": 594 + }, + { + "epoch": 0.84, + "grad_norm": 0.08207561526451417, + "learning_rate": 5e-05, + "loss": 1.3127, + "step": 595 + }, + { + "epoch": 0.84, + "grad_norm": 0.07989637752894571, + "learning_rate": 5e-05, + "loss": 1.2393, + "step": 596 + }, + { + "epoch": 0.84, + "grad_norm": 0.15303683065994178, + "learning_rate": 5e-05, + "loss": 1.2035, + "step": 597 + }, + { + "epoch": 0.84, + "grad_norm": 0.21126660740808512, + "learning_rate": 5e-05, + "loss": 1.2646, + "step": 598 + }, + { + "epoch": 0.84, + "grad_norm": 0.09235709112995891, + "learning_rate": 5e-05, + "loss": 1.1644, + "step": 599 + }, + { + "epoch": 0.84, + "grad_norm": 0.09107701199389144, + "learning_rate": 5e-05, + "loss": 1.2198, + "step": 600 + }, + { + "epoch": 0.84, + "grad_norm": 0.18076245165187252, + "learning_rate": 5e-05, + "loss": 1.2244, + "step": 601 + }, + { + "epoch": 0.85, + "grad_norm": 0.08540692042816288, + "learning_rate": 5e-05, + "loss": 1.2701, + "step": 602 + }, + { + "epoch": 0.85, + "grad_norm": 0.08928729191853955, + "learning_rate": 5e-05, + "loss": 1.182, + "step": 603 + }, + { + "epoch": 0.85, + "grad_norm": 0.1297085133397325, + "learning_rate": 5e-05, + "loss": 1.293, + "step": 604 + }, + { + "epoch": 0.85, + "grad_norm": 0.08886073686153889, + "learning_rate": 5e-05, + "loss": 1.2193, + "step": 605 + }, + { + "epoch": 0.85, + "grad_norm": 0.11184508387835462, + "learning_rate": 5e-05, + "loss": 1.3524, + "step": 606 + }, + { + "epoch": 0.85, + "grad_norm": 0.28617427730766865, + "learning_rate": 5e-05, + "loss": 1.2289, + "step": 607 + }, + { + "epoch": 0.85, + "grad_norm": 0.08913132440783965, + "learning_rate": 5e-05, + "loss": 1.276, + "step": 608 + }, + { + "epoch": 0.86, + "grad_norm": 0.09371699908947943, + "learning_rate": 5e-05, + "loss": 1.2011, + "step": 609 + }, + { + "epoch": 0.86, + "grad_norm": 0.08915675529087298, + "learning_rate": 5e-05, + "loss": 1.2712, + "step": 610 + }, + { + "epoch": 0.86, + "grad_norm": 0.09515513145486774, + "learning_rate": 5e-05, + "loss": 1.2654, + "step": 611 + }, + { + "epoch": 0.86, + "grad_norm": 0.09724889297080566, + "learning_rate": 5e-05, + "loss": 1.237, + "step": 612 + }, + { + "epoch": 0.86, + "grad_norm": 0.08500359571002307, + "learning_rate": 5e-05, + "loss": 1.2901, + "step": 613 + }, + { + "epoch": 0.86, + "grad_norm": 0.09873900572833598, + "learning_rate": 5e-05, + "loss": 1.2974, + "step": 614 + }, + { + "epoch": 0.86, + "grad_norm": 0.10190340329060169, + "learning_rate": 5e-05, + "loss": 1.2078, + "step": 615 + }, + { + "epoch": 0.87, + "grad_norm": 0.09278816113127145, + "learning_rate": 5e-05, + "loss": 1.2899, + "step": 616 + }, + { + "epoch": 0.87, + "grad_norm": 0.09368617495170674, + "learning_rate": 5e-05, + "loss": 1.1452, + "step": 617 + }, + { + "epoch": 0.87, + "grad_norm": 0.09206700755880094, + "learning_rate": 5e-05, + "loss": 1.2304, + "step": 618 + }, + { + "epoch": 0.87, + "grad_norm": 0.09539147296869752, + "learning_rate": 5e-05, + "loss": 1.1477, + "step": 619 + }, + { + "epoch": 0.87, + "grad_norm": 0.0943090483093122, + "learning_rate": 5e-05, + "loss": 1.3099, + "step": 620 + }, + { + "epoch": 0.87, + "grad_norm": 0.09840131619711731, + "learning_rate": 5e-05, + "loss": 1.2777, + 
"step": 621 + }, + { + "epoch": 0.87, + "grad_norm": 0.1017436445672807, + "learning_rate": 5e-05, + "loss": 1.3135, + "step": 622 + }, + { + "epoch": 0.88, + "grad_norm": 0.09353027935319465, + "learning_rate": 5e-05, + "loss": 1.2939, + "step": 623 + }, + { + "epoch": 0.88, + "grad_norm": 0.0992666673551347, + "learning_rate": 5e-05, + "loss": 1.1588, + "step": 624 + }, + { + "epoch": 0.88, + "grad_norm": 0.09621536648270135, + "learning_rate": 5e-05, + "loss": 1.1561, + "step": 625 + }, + { + "epoch": 0.88, + "grad_norm": 0.1803848705385315, + "learning_rate": 5e-05, + "loss": 1.2266, + "step": 626 + }, + { + "epoch": 0.88, + "grad_norm": 0.09940497512751059, + "learning_rate": 5e-05, + "loss": 1.1527, + "step": 627 + }, + { + "epoch": 0.88, + "grad_norm": 0.22301331748075162, + "learning_rate": 5e-05, + "loss": 1.1786, + "step": 628 + }, + { + "epoch": 0.88, + "grad_norm": 0.092299402565119, + "learning_rate": 5e-05, + "loss": 1.2165, + "step": 629 + }, + { + "epoch": 0.88, + "grad_norm": 0.08987088544034491, + "learning_rate": 5e-05, + "loss": 1.2376, + "step": 630 + }, + { + "epoch": 0.89, + "grad_norm": 0.09467737714738493, + "learning_rate": 5e-05, + "loss": 1.0809, + "step": 631 + }, + { + "epoch": 0.89, + "grad_norm": 0.09016867287270872, + "learning_rate": 5e-05, + "loss": 1.2832, + "step": 632 + }, + { + "epoch": 0.89, + "grad_norm": 0.09263823621699638, + "learning_rate": 5e-05, + "loss": 1.2468, + "step": 633 + }, + { + "epoch": 0.89, + "grad_norm": 0.10845579464729647, + "learning_rate": 5e-05, + "loss": 1.1989, + "step": 634 + }, + { + "epoch": 0.89, + "grad_norm": 0.0915070201812124, + "learning_rate": 5e-05, + "loss": 1.1566, + "step": 635 + }, + { + "epoch": 0.89, + "grad_norm": 0.10727575446689461, + "learning_rate": 5e-05, + "loss": 1.1929, + "step": 636 + }, + { + "epoch": 0.89, + "grad_norm": 0.09880718654639797, + "learning_rate": 5e-05, + "loss": 1.2715, + "step": 637 + }, + { + "epoch": 0.9, + "grad_norm": 0.08695192845612108, + "learning_rate": 5e-05, + "loss": 1.2251, + "step": 638 + }, + { + "epoch": 0.9, + "grad_norm": 0.13295672913473008, + "learning_rate": 5e-05, + "loss": 1.265, + "step": 639 + }, + { + "epoch": 0.9, + "grad_norm": 0.09226160620278051, + "learning_rate": 5e-05, + "loss": 1.1655, + "step": 640 + }, + { + "epoch": 0.9, + "grad_norm": 0.08658185721218865, + "learning_rate": 5e-05, + "loss": 1.2737, + "step": 641 + }, + { + "epoch": 0.9, + "grad_norm": 0.09485911206032513, + "learning_rate": 5e-05, + "loss": 1.3013, + "step": 642 + }, + { + "epoch": 0.9, + "grad_norm": 0.10327747818516794, + "learning_rate": 5e-05, + "loss": 1.23, + "step": 643 + }, + { + "epoch": 0.9, + "grad_norm": 0.08875036083861511, + "learning_rate": 5e-05, + "loss": 1.3161, + "step": 644 + }, + { + "epoch": 0.91, + "grad_norm": 0.09802151021128591, + "learning_rate": 5e-05, + "loss": 1.2579, + "step": 645 + }, + { + "epoch": 0.91, + "grad_norm": 0.09787376381900295, + "learning_rate": 5e-05, + "loss": 1.2211, + "step": 646 + }, + { + "epoch": 0.91, + "grad_norm": 0.09293215004126011, + "learning_rate": 5e-05, + "loss": 1.2328, + "step": 647 + }, + { + "epoch": 0.91, + "grad_norm": 0.40838285000644514, + "learning_rate": 5e-05, + "loss": 1.2187, + "step": 648 + }, + { + "epoch": 0.91, + "grad_norm": 0.09207947832303415, + "learning_rate": 5e-05, + "loss": 1.1493, + "step": 649 + }, + { + "epoch": 0.91, + "grad_norm": 0.08581277394743936, + "learning_rate": 5e-05, + "loss": 1.2583, + "step": 650 + }, + { + "epoch": 0.91, + "grad_norm": 0.08972806336651389, + 
"learning_rate": 5e-05, + "loss": 1.237, + "step": 651 + }, + { + "epoch": 0.92, + "grad_norm": 0.11051487901082263, + "learning_rate": 5e-05, + "loss": 1.1829, + "step": 652 + }, + { + "epoch": 0.92, + "grad_norm": 0.08982255157991624, + "learning_rate": 5e-05, + "loss": 1.2128, + "step": 653 + }, + { + "epoch": 0.92, + "grad_norm": 0.14123425840531084, + "learning_rate": 5e-05, + "loss": 1.2416, + "step": 654 + }, + { + "epoch": 0.92, + "grad_norm": 0.12880759387579782, + "learning_rate": 5e-05, + "loss": 1.2207, + "step": 655 + }, + { + "epoch": 0.92, + "grad_norm": 0.10248050833816658, + "learning_rate": 5e-05, + "loss": 1.2631, + "step": 656 + }, + { + "epoch": 0.92, + "grad_norm": 0.08455841358023947, + "learning_rate": 5e-05, + "loss": 1.2567, + "step": 657 + }, + { + "epoch": 0.92, + "grad_norm": 0.12583544117745254, + "learning_rate": 5e-05, + "loss": 1.1778, + "step": 658 + }, + { + "epoch": 0.93, + "grad_norm": 0.11324965961424767, + "learning_rate": 5e-05, + "loss": 1.1386, + "step": 659 + }, + { + "epoch": 0.93, + "grad_norm": 0.09691490439817214, + "learning_rate": 5e-05, + "loss": 1.1911, + "step": 660 + }, + { + "epoch": 0.93, + "grad_norm": 0.1072333014220056, + "learning_rate": 5e-05, + "loss": 1.1999, + "step": 661 + }, + { + "epoch": 0.93, + "grad_norm": 0.0950265977451683, + "learning_rate": 5e-05, + "loss": 1.1978, + "step": 662 + }, + { + "epoch": 0.93, + "grad_norm": 0.0929892818223546, + "learning_rate": 5e-05, + "loss": 1.2169, + "step": 663 + }, + { + "epoch": 0.93, + "grad_norm": 0.09469469542770795, + "learning_rate": 5e-05, + "loss": 1.2761, + "step": 664 + }, + { + "epoch": 0.93, + "grad_norm": 0.21751987444270846, + "learning_rate": 5e-05, + "loss": 1.1807, + "step": 665 + }, + { + "epoch": 0.94, + "grad_norm": 0.08556131295751274, + "learning_rate": 5e-05, + "loss": 1.2711, + "step": 666 + }, + { + "epoch": 0.94, + "grad_norm": 0.09282570232212695, + "learning_rate": 5e-05, + "loss": 1.1027, + "step": 667 + }, + { + "epoch": 0.94, + "grad_norm": 0.08955806312088513, + "learning_rate": 5e-05, + "loss": 1.2454, + "step": 668 + }, + { + "epoch": 0.94, + "grad_norm": 0.0845899407602786, + "learning_rate": 5e-05, + "loss": 1.1794, + "step": 669 + }, + { + "epoch": 0.94, + "grad_norm": 0.09356526088253121, + "learning_rate": 5e-05, + "loss": 1.2057, + "step": 670 + }, + { + "epoch": 0.94, + "grad_norm": 0.10643714572041506, + "learning_rate": 5e-05, + "loss": 1.2645, + "step": 671 + }, + { + "epoch": 0.94, + "grad_norm": 0.09507456774404864, + "learning_rate": 5e-05, + "loss": 1.3194, + "step": 672 + }, + { + "epoch": 0.95, + "grad_norm": 0.2056777048175019, + "learning_rate": 5e-05, + "loss": 1.2427, + "step": 673 + }, + { + "epoch": 0.95, + "grad_norm": 0.10110036392248033, + "learning_rate": 5e-05, + "loss": 1.2327, + "step": 674 + }, + { + "epoch": 0.95, + "grad_norm": 0.13002768789325456, + "learning_rate": 5e-05, + "loss": 1.1952, + "step": 675 + }, + { + "epoch": 0.95, + "grad_norm": 0.10033679290166875, + "learning_rate": 5e-05, + "loss": 1.2775, + "step": 676 + }, + { + "epoch": 0.95, + "grad_norm": 0.09391553624024601, + "learning_rate": 5e-05, + "loss": 1.2047, + "step": 677 + }, + { + "epoch": 0.95, + "grad_norm": 0.09611763781707118, + "learning_rate": 5e-05, + "loss": 1.102, + "step": 678 + }, + { + "epoch": 0.95, + "grad_norm": 0.10562455047638551, + "learning_rate": 5e-05, + "loss": 1.29, + "step": 679 + }, + { + "epoch": 0.96, + "grad_norm": 0.09690049590453258, + "learning_rate": 5e-05, + "loss": 1.1889, + "step": 680 + }, + { + "epoch": 0.96, + 
"grad_norm": 0.09350339672467371, + "learning_rate": 5e-05, + "loss": 1.2132, + "step": 681 + }, + { + "epoch": 0.96, + "grad_norm": 0.09794819805073643, + "learning_rate": 5e-05, + "loss": 1.1108, + "step": 682 + }, + { + "epoch": 0.96, + "grad_norm": 0.09235058797869976, + "learning_rate": 5e-05, + "loss": 1.2115, + "step": 683 + }, + { + "epoch": 0.96, + "grad_norm": 0.09624917985554393, + "learning_rate": 5e-05, + "loss": 1.3051, + "step": 684 + }, + { + "epoch": 0.96, + "grad_norm": 0.31721839996806295, + "learning_rate": 5e-05, + "loss": 1.2784, + "step": 685 + }, + { + "epoch": 0.96, + "grad_norm": 0.09556114849876468, + "learning_rate": 5e-05, + "loss": 1.2236, + "step": 686 + }, + { + "epoch": 0.96, + "grad_norm": 0.09368485871183255, + "learning_rate": 5e-05, + "loss": 1.1814, + "step": 687 + }, + { + "epoch": 0.97, + "grad_norm": 0.09802703013849685, + "learning_rate": 5e-05, + "loss": 1.2432, + "step": 688 + }, + { + "epoch": 0.97, + "grad_norm": 0.10087628906498335, + "learning_rate": 5e-05, + "loss": 1.2858, + "step": 689 + }, + { + "epoch": 0.97, + "grad_norm": 0.09413510463018016, + "learning_rate": 5e-05, + "loss": 1.3659, + "step": 690 + }, + { + "epoch": 0.97, + "grad_norm": 0.10392111027868062, + "learning_rate": 5e-05, + "loss": 1.1542, + "step": 691 + }, + { + "epoch": 0.97, + "grad_norm": 0.0906566542543004, + "learning_rate": 5e-05, + "loss": 1.2487, + "step": 692 + }, + { + "epoch": 0.97, + "grad_norm": 0.08715045299668721, + "learning_rate": 5e-05, + "loss": 1.2352, + "step": 693 + }, + { + "epoch": 0.97, + "grad_norm": 0.12929957216629626, + "learning_rate": 5e-05, + "loss": 1.3195, + "step": 694 + }, + { + "epoch": 0.98, + "grad_norm": 0.10703264606883131, + "learning_rate": 5e-05, + "loss": 1.2241, + "step": 695 + }, + { + "epoch": 0.98, + "grad_norm": 0.10154138617521206, + "learning_rate": 5e-05, + "loss": 1.2146, + "step": 696 + }, + { + "epoch": 0.98, + "grad_norm": 0.1179671151272554, + "learning_rate": 5e-05, + "loss": 1.2687, + "step": 697 + }, + { + "epoch": 0.98, + "grad_norm": 0.14179304986899552, + "learning_rate": 5e-05, + "loss": 1.1992, + "step": 698 + }, + { + "epoch": 0.98, + "grad_norm": 0.10046483798948673, + "learning_rate": 5e-05, + "loss": 1.3317, + "step": 699 + }, + { + "epoch": 0.98, + "grad_norm": 0.10141024718873726, + "learning_rate": 5e-05, + "loss": 1.2044, + "step": 700 + }, + { + "epoch": 0.98, + "grad_norm": 0.09932798087483034, + "learning_rate": 5e-05, + "loss": 1.3029, + "step": 701 + }, + { + "epoch": 0.99, + "grad_norm": 0.17373024837647805, + "learning_rate": 5e-05, + "loss": 1.2626, + "step": 702 + }, + { + "epoch": 0.99, + "grad_norm": 0.10303981160118214, + "learning_rate": 5e-05, + "loss": 1.1873, + "step": 703 + }, + { + "epoch": 0.99, + "grad_norm": 0.09963180786967149, + "learning_rate": 5e-05, + "loss": 1.2595, + "step": 704 + }, + { + "epoch": 0.99, + "grad_norm": 0.12652797136155272, + "learning_rate": 5e-05, + "loss": 1.3829, + "step": 705 + }, + { + "epoch": 0.99, + "grad_norm": 0.1012165700917821, + "learning_rate": 5e-05, + "loss": 1.1874, + "step": 706 + }, + { + "epoch": 0.99, + "grad_norm": 0.10453598557896276, + "learning_rate": 5e-05, + "loss": 1.3104, + "step": 707 + }, + { + "epoch": 0.99, + "grad_norm": 0.09942899214105919, + "learning_rate": 5e-05, + "loss": 1.2672, + "step": 708 + }, + { + "epoch": 1.0, + "grad_norm": 0.15326373835553098, + "learning_rate": 5e-05, + "loss": 1.362, + "step": 709 + }, + { + "epoch": 1.0, + "grad_norm": 1.429975024253803, + "learning_rate": 5e-05, + "loss": 1.2764, + 
"step": 710 + }, + { + "epoch": 1.0, + "grad_norm": 0.11153291143807398, + "learning_rate": 5e-05, + "loss": 1.2682, + "step": 711 + }, + { + "epoch": 1.0, + "grad_norm": 0.15133533987538889, + "learning_rate": 5e-05, + "loss": 1.1378, + "step": 712 + }, + { + "epoch": 1.0, + "grad_norm": 0.25668416002800687, + "learning_rate": 5e-05, + "loss": 1.1951, + "step": 713 + }, + { + "epoch": 1.0, + "grad_norm": 0.0963067674586202, + "learning_rate": 5e-05, + "loss": 1.2414, + "step": 714 + }, + { + "epoch": 1.0, + "grad_norm": 0.15451942034952437, + "learning_rate": 5e-05, + "loss": 1.2113, + "step": 715 + }, + { + "epoch": 1.01, + "grad_norm": 0.09981210233391376, + "learning_rate": 5e-05, + "loss": 1.2978, + "step": 716 + }, + { + "epoch": 1.01, + "grad_norm": 0.10076216800241435, + "learning_rate": 5e-05, + "loss": 1.2255, + "step": 717 + }, + { + "epoch": 1.01, + "grad_norm": 0.14548778281671867, + "learning_rate": 5e-05, + "loss": 1.2835, + "step": 718 + }, + { + "epoch": 1.01, + "grad_norm": 0.1016741952863245, + "learning_rate": 5e-05, + "loss": 1.2912, + "step": 719 + }, + { + "epoch": 1.01, + "grad_norm": 0.09960716934644352, + "learning_rate": 5e-05, + "loss": 0.9963, + "step": 720 + }, + { + "epoch": 1.01, + "grad_norm": 0.10640237229489667, + "learning_rate": 5e-05, + "loss": 1.3254, + "step": 721 + }, + { + "epoch": 1.01, + "grad_norm": 0.10233325113756409, + "learning_rate": 5e-05, + "loss": 1.1384, + "step": 722 + }, + { + "epoch": 1.02, + "grad_norm": 0.10014095700336226, + "learning_rate": 5e-05, + "loss": 1.1717, + "step": 723 + }, + { + "epoch": 1.02, + "grad_norm": 0.24673393128935722, + "learning_rate": 5e-05, + "loss": 1.1833, + "step": 724 + }, + { + "epoch": 1.02, + "grad_norm": 0.1765370153980564, + "learning_rate": 5e-05, + "loss": 1.167, + "step": 725 + }, + { + "epoch": 1.02, + "grad_norm": 0.09069059043020448, + "learning_rate": 5e-05, + "loss": 1.2299, + "step": 726 + }, + { + "epoch": 1.02, + "grad_norm": 0.09919444575477596, + "learning_rate": 5e-05, + "loss": 1.2114, + "step": 727 + }, + { + "epoch": 1.02, + "grad_norm": 0.10624614036476415, + "learning_rate": 5e-05, + "loss": 1.2148, + "step": 728 + }, + { + "epoch": 1.02, + "grad_norm": 0.11414826824845992, + "learning_rate": 5e-05, + "loss": 1.2473, + "step": 729 + }, + { + "epoch": 1.03, + "grad_norm": 0.09746044971388516, + "learning_rate": 5e-05, + "loss": 1.2744, + "step": 730 + }, + { + "epoch": 1.03, + "grad_norm": 0.09264686915487945, + "learning_rate": 5e-05, + "loss": 1.2411, + "step": 731 + }, + { + "epoch": 1.03, + "grad_norm": 0.1285248529691951, + "learning_rate": 5e-05, + "loss": 1.2455, + "step": 732 + }, + { + "epoch": 1.03, + "grad_norm": 0.10819000040646083, + "learning_rate": 5e-05, + "loss": 1.3297, + "step": 733 + }, + { + "epoch": 1.03, + "grad_norm": 0.10741312350360947, + "learning_rate": 5e-05, + "loss": 1.2665, + "step": 734 + }, + { + "epoch": 1.03, + "grad_norm": 0.08981878384046382, + "learning_rate": 5e-05, + "loss": 1.1661, + "step": 735 + }, + { + "epoch": 1.03, + "grad_norm": 0.09710771553318064, + "learning_rate": 5e-05, + "loss": 1.1958, + "step": 736 + }, + { + "epoch": 1.04, + "grad_norm": 0.09834809704812088, + "learning_rate": 5e-05, + "loss": 1.1861, + "step": 737 + }, + { + "epoch": 1.04, + "grad_norm": 0.11873488763336909, + "learning_rate": 5e-05, + "loss": 1.2457, + "step": 738 + }, + { + "epoch": 1.04, + "grad_norm": 0.4534657268027245, + "learning_rate": 5e-05, + "loss": 1.2153, + "step": 739 + }, + { + "epoch": 1.04, + "grad_norm": 0.09197575469543723, + 
"learning_rate": 5e-05, + "loss": 1.233, + "step": 740 + }, + { + "epoch": 1.04, + "grad_norm": 0.09298135208306929, + "learning_rate": 5e-05, + "loss": 1.2487, + "step": 741 + }, + { + "epoch": 1.04, + "grad_norm": 0.1002334003010323, + "learning_rate": 5e-05, + "loss": 1.2212, + "step": 742 + }, + { + "epoch": 1.04, + "grad_norm": 0.09679819558535252, + "learning_rate": 5e-05, + "loss": 1.2832, + "step": 743 + }, + { + "epoch": 1.04, + "grad_norm": 0.09623395785492658, + "learning_rate": 5e-05, + "loss": 1.1877, + "step": 744 + }, + { + "epoch": 1.05, + "grad_norm": 0.09631927188132637, + "learning_rate": 5e-05, + "loss": 1.325, + "step": 745 + }, + { + "epoch": 1.05, + "grad_norm": 0.10247557565649974, + "learning_rate": 5e-05, + "loss": 1.1731, + "step": 746 + }, + { + "epoch": 1.05, + "grad_norm": 0.1252184897594414, + "learning_rate": 5e-05, + "loss": 1.2998, + "step": 747 + }, + { + "epoch": 1.05, + "grad_norm": 0.10370387262827341, + "learning_rate": 5e-05, + "loss": 1.3172, + "step": 748 + }, + { + "epoch": 1.05, + "grad_norm": 0.0974762587376287, + "learning_rate": 5e-05, + "loss": 1.2268, + "step": 749 + }, + { + "epoch": 1.05, + "grad_norm": 0.1300763773985203, + "learning_rate": 5e-05, + "loss": 1.28, + "step": 750 + }, + { + "epoch": 1.05, + "grad_norm": 0.10416697475393953, + "learning_rate": 5e-05, + "loss": 1.2415, + "step": 751 + }, + { + "epoch": 1.06, + "grad_norm": 0.0953165467431355, + "learning_rate": 5e-05, + "loss": 1.224, + "step": 752 + }, + { + "epoch": 1.06, + "grad_norm": 0.15906087999039972, + "learning_rate": 5e-05, + "loss": 1.2541, + "step": 753 + }, + { + "epoch": 1.06, + "grad_norm": 0.10705539272146689, + "learning_rate": 5e-05, + "loss": 1.2195, + "step": 754 + }, + { + "epoch": 1.06, + "grad_norm": 0.10064863761184746, + "learning_rate": 5e-05, + "loss": 1.1485, + "step": 755 + }, + { + "epoch": 1.06, + "grad_norm": 0.09754148334053919, + "learning_rate": 5e-05, + "loss": 1.2517, + "step": 756 + }, + { + "epoch": 1.06, + "grad_norm": 0.10763212145768898, + "learning_rate": 5e-05, + "loss": 1.1744, + "step": 757 + }, + { + "epoch": 1.06, + "grad_norm": 0.1445971539574533, + "learning_rate": 5e-05, + "loss": 1.2038, + "step": 758 + }, + { + "epoch": 1.07, + "grad_norm": 0.10008128712969973, + "learning_rate": 5e-05, + "loss": 1.2221, + "step": 759 + }, + { + "epoch": 1.07, + "grad_norm": 0.11696802050684159, + "learning_rate": 5e-05, + "loss": 1.2129, + "step": 760 + }, + { + "epoch": 1.07, + "grad_norm": 0.10167310507443662, + "learning_rate": 5e-05, + "loss": 1.2254, + "step": 761 + }, + { + "epoch": 1.07, + "grad_norm": 0.79581504789477, + "learning_rate": 5e-05, + "loss": 1.2186, + "step": 762 + }, + { + "epoch": 1.07, + "grad_norm": 0.09638098726069541, + "learning_rate": 5e-05, + "loss": 1.2259, + "step": 763 + }, + { + "epoch": 1.07, + "grad_norm": 0.10875965264550522, + "learning_rate": 5e-05, + "loss": 1.2789, + "step": 764 + }, + { + "epoch": 1.07, + "grad_norm": 0.109777838254128, + "learning_rate": 5e-05, + "loss": 1.277, + "step": 765 + }, + { + "epoch": 1.08, + "grad_norm": 0.10856749686495373, + "learning_rate": 5e-05, + "loss": 1.2077, + "step": 766 + }, + { + "epoch": 1.08, + "grad_norm": 0.09838759573209178, + "learning_rate": 5e-05, + "loss": 1.2296, + "step": 767 + }, + { + "epoch": 1.08, + "grad_norm": 0.09877121767531931, + "learning_rate": 5e-05, + "loss": 1.2447, + "step": 768 + }, + { + "epoch": 1.08, + "grad_norm": 0.097516506898902, + "learning_rate": 5e-05, + "loss": 1.1415, + "step": 769 + }, + { + "epoch": 1.08, + 
"grad_norm": 0.15640033301410253, + "learning_rate": 5e-05, + "loss": 1.2098, + "step": 770 + }, + { + "epoch": 1.08, + "grad_norm": 0.43103323678783306, + "learning_rate": 5e-05, + "loss": 1.2478, + "step": 771 + }, + { + "epoch": 1.08, + "grad_norm": 0.1263132691072992, + "learning_rate": 5e-05, + "loss": 1.1549, + "step": 772 + }, + { + "epoch": 1.09, + "grad_norm": 0.22807536100092707, + "learning_rate": 5e-05, + "loss": 1.2675, + "step": 773 + }, + { + "epoch": 1.09, + "grad_norm": 0.10972162409171408, + "learning_rate": 5e-05, + "loss": 1.1569, + "step": 774 + }, + { + "epoch": 1.09, + "grad_norm": 0.11038434717003007, + "learning_rate": 5e-05, + "loss": 1.3608, + "step": 775 + }, + { + "epoch": 1.09, + "grad_norm": 0.17568265985553402, + "learning_rate": 5e-05, + "loss": 1.2267, + "step": 776 + }, + { + "epoch": 1.09, + "grad_norm": 0.11545287877132578, + "learning_rate": 5e-05, + "loss": 1.1519, + "step": 777 + }, + { + "epoch": 1.09, + "grad_norm": 0.14936270776039093, + "learning_rate": 5e-05, + "loss": 1.1398, + "step": 778 + }, + { + "epoch": 1.09, + "grad_norm": 0.4545001978738308, + "learning_rate": 5e-05, + "loss": 1.2197, + "step": 779 + }, + { + "epoch": 1.1, + "grad_norm": 0.1196390269385417, + "learning_rate": 5e-05, + "loss": 1.2172, + "step": 780 + }, + { + "epoch": 1.1, + "grad_norm": 0.11043124850191112, + "learning_rate": 5e-05, + "loss": 1.2616, + "step": 781 + }, + { + "epoch": 1.1, + "grad_norm": 0.11378426834643288, + "learning_rate": 5e-05, + "loss": 1.2639, + "step": 782 + }, + { + "epoch": 1.1, + "grad_norm": 0.11296570989843585, + "learning_rate": 5e-05, + "loss": 1.2778, + "step": 783 + }, + { + "epoch": 1.1, + "grad_norm": 0.11792262542869875, + "learning_rate": 5e-05, + "loss": 1.2381, + "step": 784 + }, + { + "epoch": 1.1, + "grad_norm": 0.1040175773065076, + "learning_rate": 5e-05, + "loss": 1.1977, + "step": 785 + }, + { + "epoch": 1.1, + "grad_norm": 0.10117229239881861, + "learning_rate": 5e-05, + "loss": 1.1884, + "step": 786 + }, + { + "epoch": 1.11, + "grad_norm": 0.11948040578958839, + "learning_rate": 5e-05, + "loss": 1.1862, + "step": 787 + }, + { + "epoch": 1.11, + "grad_norm": 0.11263044877607341, + "learning_rate": 5e-05, + "loss": 1.0975, + "step": 788 + }, + { + "epoch": 1.11, + "grad_norm": 0.10162187237638322, + "learning_rate": 5e-05, + "loss": 1.2881, + "step": 789 + }, + { + "epoch": 1.11, + "grad_norm": 0.09761584113237752, + "learning_rate": 5e-05, + "loss": 1.1133, + "step": 790 + }, + { + "epoch": 1.11, + "grad_norm": 0.10179676445024048, + "learning_rate": 5e-05, + "loss": 1.1925, + "step": 791 + }, + { + "epoch": 1.11, + "grad_norm": 0.10048619299055972, + "learning_rate": 5e-05, + "loss": 1.1626, + "step": 792 + }, + { + "epoch": 1.11, + "grad_norm": 0.10540479457335808, + "learning_rate": 5e-05, + "loss": 1.1609, + "step": 793 + }, + { + "epoch": 1.12, + "grad_norm": 0.09576741807479447, + "learning_rate": 5e-05, + "loss": 1.2164, + "step": 794 + }, + { + "epoch": 1.12, + "grad_norm": 0.10211722164324015, + "learning_rate": 5e-05, + "loss": 1.2733, + "step": 795 + }, + { + "epoch": 1.12, + "grad_norm": 0.0994069528386415, + "learning_rate": 5e-05, + "loss": 1.277, + "step": 796 + }, + { + "epoch": 1.12, + "grad_norm": 0.11702861428225478, + "learning_rate": 5e-05, + "loss": 1.3852, + "step": 797 + }, + { + "epoch": 1.12, + "grad_norm": 0.11569287694449525, + "learning_rate": 5e-05, + "loss": 1.1918, + "step": 798 + }, + { + "epoch": 1.12, + "grad_norm": 0.10796848173468654, + "learning_rate": 5e-05, + "loss": 1.2132, + "step": 
799 + }, + { + "epoch": 1.12, + "grad_norm": 0.11356557024659053, + "learning_rate": 5e-05, + "loss": 1.1443, + "step": 800 + }, + { + "epoch": 1.12, + "grad_norm": 0.10035049988674814, + "learning_rate": 5e-05, + "loss": 1.2292, + "step": 801 + }, + { + "epoch": 1.13, + "grad_norm": 0.10133797526045685, + "learning_rate": 5e-05, + "loss": 1.2533, + "step": 802 + }, + { + "epoch": 1.13, + "grad_norm": 0.09720806286113348, + "learning_rate": 5e-05, + "loss": 1.2717, + "step": 803 + }, + { + "epoch": 1.13, + "grad_norm": 0.1031376804344581, + "learning_rate": 5e-05, + "loss": 1.1421, + "step": 804 + }, + { + "epoch": 1.13, + "grad_norm": 0.18830466286012526, + "learning_rate": 5e-05, + "loss": 1.1433, + "step": 805 + }, + { + "epoch": 1.13, + "grad_norm": 0.11603087901097664, + "learning_rate": 5e-05, + "loss": 1.2186, + "step": 806 + }, + { + "epoch": 1.13, + "grad_norm": 0.10478959523077212, + "learning_rate": 5e-05, + "loss": 1.2219, + "step": 807 + }, + { + "epoch": 1.13, + "grad_norm": 0.09553368510904443, + "learning_rate": 5e-05, + "loss": 1.2017, + "step": 808 + }, + { + "epoch": 1.14, + "grad_norm": 0.11042525638456162, + "learning_rate": 5e-05, + "loss": 1.2168, + "step": 809 + }, + { + "epoch": 1.14, + "grad_norm": 0.10169783065905559, + "learning_rate": 5e-05, + "loss": 1.2701, + "step": 810 + }, + { + "epoch": 1.14, + "grad_norm": 0.10292594868979561, + "learning_rate": 5e-05, + "loss": 1.2496, + "step": 811 + }, + { + "epoch": 1.14, + "grad_norm": 0.096388359121937, + "learning_rate": 5e-05, + "loss": 1.2348, + "step": 812 + }, + { + "epoch": 1.14, + "grad_norm": 0.09309208735472058, + "learning_rate": 5e-05, + "loss": 1.3132, + "step": 813 + }, + { + "epoch": 1.14, + "grad_norm": 0.09828706153194258, + "learning_rate": 5e-05, + "loss": 1.2129, + "step": 814 + }, + { + "epoch": 1.14, + "grad_norm": 0.10245628163747142, + "learning_rate": 5e-05, + "loss": 1.2323, + "step": 815 + }, + { + "epoch": 1.15, + "grad_norm": 0.09945742718624755, + "learning_rate": 5e-05, + "loss": 1.0269, + "step": 816 + }, + { + "epoch": 1.15, + "grad_norm": 0.10402570140552733, + "learning_rate": 5e-05, + "loss": 1.1454, + "step": 817 + }, + { + "epoch": 1.15, + "grad_norm": 0.10679095390597962, + "learning_rate": 5e-05, + "loss": 1.1737, + "step": 818 + }, + { + "epoch": 1.15, + "grad_norm": 0.11131020002635934, + "learning_rate": 5e-05, + "loss": 1.1798, + "step": 819 + }, + { + "epoch": 1.15, + "grad_norm": 0.19411752110726227, + "learning_rate": 5e-05, + "loss": 1.2349, + "step": 820 + }, + { + "epoch": 1.15, + "grad_norm": 0.12075632954232787, + "learning_rate": 5e-05, + "loss": 1.2102, + "step": 821 + }, + { + "epoch": 1.15, + "grad_norm": 0.14563663550644834, + "learning_rate": 5e-05, + "loss": 1.1971, + "step": 822 + }, + { + "epoch": 1.16, + "grad_norm": 0.10155568945220618, + "learning_rate": 5e-05, + "loss": 1.2767, + "step": 823 + }, + { + "epoch": 1.16, + "grad_norm": 0.14110329174351133, + "learning_rate": 5e-05, + "loss": 1.1161, + "step": 824 + }, + { + "epoch": 1.16, + "grad_norm": 0.1280627595012023, + "learning_rate": 5e-05, + "loss": 1.1679, + "step": 825 + }, + { + "epoch": 1.16, + "grad_norm": 0.10517376300859077, + "learning_rate": 5e-05, + "loss": 1.225, + "step": 826 + }, + { + "epoch": 1.16, + "grad_norm": 0.11688344813726845, + "learning_rate": 5e-05, + "loss": 1.1472, + "step": 827 + }, + { + "epoch": 1.16, + "grad_norm": 0.12875551575115213, + "learning_rate": 5e-05, + "loss": 1.3059, + "step": 828 + }, + { + "epoch": 1.16, + "grad_norm": 0.10331275620542076, + 
"learning_rate": 5e-05, + "loss": 1.1897, + "step": 829 + }, + { + "epoch": 1.17, + "grad_norm": 0.11814901122457233, + "learning_rate": 5e-05, + "loss": 1.2886, + "step": 830 + }, + { + "epoch": 1.17, + "grad_norm": 0.09471838984232255, + "learning_rate": 5e-05, + "loss": 1.2303, + "step": 831 + }, + { + "epoch": 1.17, + "grad_norm": 0.14860588114869513, + "learning_rate": 5e-05, + "loss": 1.1934, + "step": 832 + }, + { + "epoch": 1.17, + "grad_norm": 0.10584314236750857, + "learning_rate": 5e-05, + "loss": 1.2584, + "step": 833 + }, + { + "epoch": 1.17, + "grad_norm": 0.11520751891265056, + "learning_rate": 5e-05, + "loss": 1.1897, + "step": 834 + }, + { + "epoch": 1.17, + "grad_norm": 0.28001595194924067, + "learning_rate": 5e-05, + "loss": 1.3453, + "step": 835 + }, + { + "epoch": 1.17, + "grad_norm": 0.14806410459094513, + "learning_rate": 5e-05, + "loss": 1.2792, + "step": 836 + }, + { + "epoch": 1.18, + "grad_norm": 0.1297344607276581, + "learning_rate": 5e-05, + "loss": 1.1851, + "step": 837 + }, + { + "epoch": 1.18, + "grad_norm": 0.11593565387718859, + "learning_rate": 5e-05, + "loss": 1.2544, + "step": 838 + }, + { + "epoch": 1.18, + "grad_norm": 0.1436467271405091, + "learning_rate": 5e-05, + "loss": 1.1878, + "step": 839 + }, + { + "epoch": 1.18, + "grad_norm": 0.25345500969595197, + "learning_rate": 5e-05, + "loss": 1.28, + "step": 840 + }, + { + "epoch": 1.18, + "grad_norm": 0.2078365932479126, + "learning_rate": 5e-05, + "loss": 1.2371, + "step": 841 + }, + { + "epoch": 1.18, + "grad_norm": 0.1224506575042296, + "learning_rate": 5e-05, + "loss": 1.2283, + "step": 842 + }, + { + "epoch": 1.18, + "grad_norm": 0.10860704275988492, + "learning_rate": 5e-05, + "loss": 1.1925, + "step": 843 + }, + { + "epoch": 1.19, + "grad_norm": 0.13984158399146657, + "learning_rate": 5e-05, + "loss": 1.1854, + "step": 844 + }, + { + "epoch": 1.19, + "grad_norm": 0.1337763392877898, + "learning_rate": 5e-05, + "loss": 1.2313, + "step": 845 + }, + { + "epoch": 1.19, + "grad_norm": 0.10226803676816323, + "learning_rate": 5e-05, + "loss": 1.203, + "step": 846 + }, + { + "epoch": 1.19, + "grad_norm": 0.09320162293879738, + "learning_rate": 5e-05, + "loss": 1.2391, + "step": 847 + }, + { + "epoch": 1.19, + "grad_norm": 0.09409514812359136, + "learning_rate": 5e-05, + "loss": 1.2224, + "step": 848 + }, + { + "epoch": 1.19, + "grad_norm": 0.10052912222465865, + "learning_rate": 5e-05, + "loss": 1.2169, + "step": 849 + }, + { + "epoch": 1.19, + "grad_norm": 0.09906328558060296, + "learning_rate": 5e-05, + "loss": 1.1966, + "step": 850 + }, + { + "epoch": 1.2, + "grad_norm": 0.12102946778744055, + "learning_rate": 5e-05, + "loss": 1.2103, + "step": 851 + }, + { + "epoch": 1.2, + "grad_norm": 0.10113058133812584, + "learning_rate": 5e-05, + "loss": 1.268, + "step": 852 + }, + { + "epoch": 1.2, + "grad_norm": 0.10055161182595006, + "learning_rate": 5e-05, + "loss": 1.1344, + "step": 853 + }, + { + "epoch": 1.2, + "grad_norm": 0.09991313175496647, + "learning_rate": 5e-05, + "loss": 1.3187, + "step": 854 + }, + { + "epoch": 1.2, + "grad_norm": 0.12951627822859785, + "learning_rate": 5e-05, + "loss": 1.3176, + "step": 855 + }, + { + "epoch": 1.2, + "grad_norm": 0.09815170395943444, + "learning_rate": 5e-05, + "loss": 1.2497, + "step": 856 + }, + { + "epoch": 1.2, + "grad_norm": 0.12246225672826391, + "learning_rate": 5e-05, + "loss": 1.3025, + "step": 857 + }, + { + "epoch": 1.21, + "grad_norm": 0.0954537733224272, + "learning_rate": 5e-05, + "loss": 1.2217, + "step": 858 + }, + { + "epoch": 1.21, + 
"grad_norm": 0.09405780024619664, + "learning_rate": 5e-05, + "loss": 1.2256, + "step": 859 + }, + { + "epoch": 1.21, + "grad_norm": 0.10542056748781518, + "learning_rate": 5e-05, + "loss": 1.2219, + "step": 860 + }, + { + "epoch": 1.21, + "grad_norm": 0.09445376702773765, + "learning_rate": 5e-05, + "loss": 1.2511, + "step": 861 + }, + { + "epoch": 1.21, + "grad_norm": 0.10752474403917245, + "learning_rate": 5e-05, + "loss": 1.1942, + "step": 862 + }, + { + "epoch": 1.21, + "grad_norm": 0.0938429879577624, + "learning_rate": 5e-05, + "loss": 1.3777, + "step": 863 + }, + { + "epoch": 1.21, + "grad_norm": 0.10810466379307371, + "learning_rate": 5e-05, + "loss": 1.2441, + "step": 864 + }, + { + "epoch": 1.21, + "grad_norm": 0.10690895063667602, + "learning_rate": 5e-05, + "loss": 1.1865, + "step": 865 + }, + { + "epoch": 1.22, + "grad_norm": 0.12209367932915752, + "learning_rate": 5e-05, + "loss": 1.3176, + "step": 866 + }, + { + "epoch": 1.22, + "grad_norm": 0.102588124319327, + "learning_rate": 5e-05, + "loss": 1.1804, + "step": 867 + }, + { + "epoch": 1.22, + "grad_norm": 0.1060100472496696, + "learning_rate": 5e-05, + "loss": 1.2266, + "step": 868 + }, + { + "epoch": 1.22, + "grad_norm": 0.0983313694916884, + "learning_rate": 5e-05, + "loss": 1.2627, + "step": 869 + }, + { + "epoch": 1.22, + "grad_norm": 0.10303980986839666, + "learning_rate": 5e-05, + "loss": 1.2996, + "step": 870 + }, + { + "epoch": 1.22, + "grad_norm": 0.10680194565865087, + "learning_rate": 5e-05, + "loss": 1.2781, + "step": 871 + }, + { + "epoch": 1.22, + "grad_norm": 0.09979616568878538, + "learning_rate": 5e-05, + "loss": 1.3002, + "step": 872 + }, + { + "epoch": 1.23, + "grad_norm": 0.09267394233479663, + "learning_rate": 5e-05, + "loss": 1.2036, + "step": 873 + }, + { + "epoch": 1.23, + "grad_norm": 0.1149663518195294, + "learning_rate": 5e-05, + "loss": 1.1554, + "step": 874 + }, + { + "epoch": 1.23, + "grad_norm": 0.10237610394825647, + "learning_rate": 5e-05, + "loss": 1.3075, + "step": 875 + }, + { + "epoch": 1.23, + "grad_norm": 0.09245192145458844, + "learning_rate": 5e-05, + "loss": 1.2307, + "step": 876 + }, + { + "epoch": 1.23, + "grad_norm": 0.10589193198898804, + "learning_rate": 5e-05, + "loss": 1.2573, + "step": 877 + }, + { + "epoch": 1.23, + "grad_norm": 0.10126499777256248, + "learning_rate": 5e-05, + "loss": 1.1529, + "step": 878 + }, + { + "epoch": 1.23, + "grad_norm": 0.09660125908005594, + "learning_rate": 5e-05, + "loss": 1.2493, + "step": 879 + }, + { + "epoch": 1.24, + "grad_norm": 0.1247059817512293, + "learning_rate": 5e-05, + "loss": 1.229, + "step": 880 + }, + { + "epoch": 1.24, + "grad_norm": 0.09186685597728123, + "learning_rate": 5e-05, + "loss": 1.2894, + "step": 881 + }, + { + "epoch": 1.24, + "grad_norm": 0.10583196951125595, + "learning_rate": 5e-05, + "loss": 1.3119, + "step": 882 + }, + { + "epoch": 1.24, + "grad_norm": 0.09543741816461494, + "learning_rate": 5e-05, + "loss": 1.2534, + "step": 883 + }, + { + "epoch": 1.24, + "grad_norm": 0.10469972017765604, + "learning_rate": 5e-05, + "loss": 1.1627, + "step": 884 + }, + { + "epoch": 1.24, + "grad_norm": 0.10742474116808534, + "learning_rate": 5e-05, + "loss": 1.2241, + "step": 885 + }, + { + "epoch": 1.24, + "grad_norm": 0.10301119785274608, + "learning_rate": 5e-05, + "loss": 1.2369, + "step": 886 + }, + { + "epoch": 1.25, + "grad_norm": 0.11086391313945411, + "learning_rate": 5e-05, + "loss": 1.1584, + "step": 887 + }, + { + "epoch": 1.25, + "grad_norm": 0.09856150306496389, + "learning_rate": 5e-05, + "loss": 1.1314, + 
"step": 888 + }, + { + "epoch": 1.25, + "grad_norm": 0.09377525054933467, + "learning_rate": 5e-05, + "loss": 1.2545, + "step": 889 + }, + { + "epoch": 1.25, + "grad_norm": 0.09864051283641095, + "learning_rate": 5e-05, + "loss": 1.2924, + "step": 890 + }, + { + "epoch": 1.25, + "grad_norm": 0.09991766051946123, + "learning_rate": 5e-05, + "loss": 1.2165, + "step": 891 + }, + { + "epoch": 1.25, + "grad_norm": 0.0948527521839145, + "learning_rate": 5e-05, + "loss": 1.2812, + "step": 892 + }, + { + "epoch": 1.25, + "grad_norm": 0.09568181092038586, + "learning_rate": 5e-05, + "loss": 1.1717, + "step": 893 + }, + { + "epoch": 1.26, + "grad_norm": 0.0998176192734084, + "learning_rate": 5e-05, + "loss": 1.305, + "step": 894 + }, + { + "epoch": 1.26, + "grad_norm": 0.12039325363606615, + "learning_rate": 5e-05, + "loss": 1.2765, + "step": 895 + }, + { + "epoch": 1.26, + "grad_norm": 0.10924701771102203, + "learning_rate": 5e-05, + "loss": 1.2545, + "step": 896 + }, + { + "epoch": 1.26, + "grad_norm": 0.12035811393880431, + "learning_rate": 5e-05, + "loss": 1.1652, + "step": 897 + }, + { + "epoch": 1.26, + "grad_norm": 0.09142103086991352, + "learning_rate": 5e-05, + "loss": 1.2409, + "step": 898 + }, + { + "epoch": 1.26, + "grad_norm": 0.10267685463052927, + "learning_rate": 5e-05, + "loss": 1.1792, + "step": 899 + }, + { + "epoch": 1.26, + "grad_norm": 0.13250203781354886, + "learning_rate": 5e-05, + "loss": 1.2084, + "step": 900 + }, + { + "epoch": 1.27, + "grad_norm": 0.10834802537147774, + "learning_rate": 5e-05, + "loss": 1.1788, + "step": 901 + }, + { + "epoch": 1.27, + "grad_norm": 0.10561344797384954, + "learning_rate": 5e-05, + "loss": 1.2991, + "step": 902 + }, + { + "epoch": 1.27, + "grad_norm": 0.11364275040147816, + "learning_rate": 5e-05, + "loss": 1.2342, + "step": 903 + }, + { + "epoch": 1.27, + "grad_norm": 0.09882596512731058, + "learning_rate": 5e-05, + "loss": 1.2401, + "step": 904 + }, + { + "epoch": 1.27, + "grad_norm": 0.1598614608373347, + "learning_rate": 5e-05, + "loss": 1.1111, + "step": 905 + }, + { + "epoch": 1.27, + "grad_norm": 0.10379583645131088, + "learning_rate": 5e-05, + "loss": 1.242, + "step": 906 + }, + { + "epoch": 1.27, + "grad_norm": 0.10050971560567515, + "learning_rate": 5e-05, + "loss": 1.245, + "step": 907 + }, + { + "epoch": 1.28, + "grad_norm": 0.09733983912930413, + "learning_rate": 5e-05, + "loss": 1.2056, + "step": 908 + }, + { + "epoch": 1.28, + "grad_norm": 0.09728015607915742, + "learning_rate": 5e-05, + "loss": 1.259, + "step": 909 + }, + { + "epoch": 1.28, + "grad_norm": 0.09966988425633526, + "learning_rate": 5e-05, + "loss": 1.2024, + "step": 910 + }, + { + "epoch": 1.28, + "grad_norm": 0.10455978865642376, + "learning_rate": 5e-05, + "loss": 1.1231, + "step": 911 + }, + { + "epoch": 1.28, + "grad_norm": 0.09441860703901163, + "learning_rate": 5e-05, + "loss": 1.334, + "step": 912 + }, + { + "epoch": 1.28, + "grad_norm": 0.10730615783514179, + "learning_rate": 5e-05, + "loss": 1.2264, + "step": 913 + }, + { + "epoch": 1.28, + "grad_norm": 0.09947829609146572, + "learning_rate": 5e-05, + "loss": 1.0771, + "step": 914 + }, + { + "epoch": 1.29, + "grad_norm": 0.09963777107898805, + "learning_rate": 5e-05, + "loss": 1.186, + "step": 915 + }, + { + "epoch": 1.29, + "grad_norm": 0.09247240575157885, + "learning_rate": 5e-05, + "loss": 1.2108, + "step": 916 + }, + { + "epoch": 1.29, + "grad_norm": 0.10559239036702427, + "learning_rate": 5e-05, + "loss": 1.2395, + "step": 917 + }, + { + "epoch": 1.29, + "grad_norm": 0.09682297501448958, + 
"learning_rate": 5e-05, + "loss": 1.2698, + "step": 918 + }, + { + "epoch": 1.29, + "grad_norm": 0.16475050639429561, + "learning_rate": 5e-05, + "loss": 1.2197, + "step": 919 + }, + { + "epoch": 1.29, + "grad_norm": 0.11235995677493582, + "learning_rate": 5e-05, + "loss": 1.2784, + "step": 920 + }, + { + "epoch": 1.29, + "grad_norm": 0.47156079694933206, + "learning_rate": 5e-05, + "loss": 1.2786, + "step": 921 + }, + { + "epoch": 1.29, + "grad_norm": 0.11858338220188593, + "learning_rate": 5e-05, + "loss": 1.2193, + "step": 922 + }, + { + "epoch": 1.3, + "grad_norm": 0.23613792180343693, + "learning_rate": 5e-05, + "loss": 1.3102, + "step": 923 + }, + { + "epoch": 1.3, + "grad_norm": 0.0983111298091262, + "learning_rate": 5e-05, + "loss": 1.2385, + "step": 924 + }, + { + "epoch": 1.3, + "grad_norm": 0.11406810717062593, + "learning_rate": 5e-05, + "loss": 1.3788, + "step": 925 + }, + { + "epoch": 1.3, + "grad_norm": 0.11254524855462729, + "learning_rate": 5e-05, + "loss": 1.2784, + "step": 926 + }, + { + "epoch": 1.3, + "grad_norm": 0.15539104489315878, + "learning_rate": 5e-05, + "loss": 1.1682, + "step": 927 + }, + { + "epoch": 1.3, + "grad_norm": 0.11789469174067303, + "learning_rate": 5e-05, + "loss": 1.336, + "step": 928 + }, + { + "epoch": 1.3, + "grad_norm": 0.11349741317620189, + "learning_rate": 5e-05, + "loss": 1.2517, + "step": 929 + }, + { + "epoch": 1.31, + "grad_norm": 0.10919004596237217, + "learning_rate": 5e-05, + "loss": 1.216, + "step": 930 + }, + { + "epoch": 1.31, + "grad_norm": 0.1198831609531216, + "learning_rate": 5e-05, + "loss": 1.2071, + "step": 931 + }, + { + "epoch": 1.31, + "grad_norm": 0.10286766273135783, + "learning_rate": 5e-05, + "loss": 1.2043, + "step": 932 + }, + { + "epoch": 1.31, + "grad_norm": 0.09600011742192804, + "learning_rate": 5e-05, + "loss": 1.2604, + "step": 933 + }, + { + "epoch": 1.31, + "grad_norm": 0.22190196640417234, + "learning_rate": 5e-05, + "loss": 1.1831, + "step": 934 + }, + { + "epoch": 1.31, + "grad_norm": 0.10226010954520015, + "learning_rate": 5e-05, + "loss": 1.2089, + "step": 935 + }, + { + "epoch": 1.31, + "grad_norm": 0.11317533110892175, + "learning_rate": 5e-05, + "loss": 1.1593, + "step": 936 + }, + { + "epoch": 1.32, + "grad_norm": 0.11759030987224492, + "learning_rate": 5e-05, + "loss": 1.2899, + "step": 937 + }, + { + "epoch": 1.32, + "grad_norm": 0.1250834660376518, + "learning_rate": 5e-05, + "loss": 1.2589, + "step": 938 + }, + { + "epoch": 1.32, + "grad_norm": 0.14260058685971605, + "learning_rate": 5e-05, + "loss": 1.0978, + "step": 939 + }, + { + "epoch": 1.32, + "grad_norm": 0.09749870569187263, + "learning_rate": 5e-05, + "loss": 1.2144, + "step": 940 + }, + { + "epoch": 1.32, + "grad_norm": 0.10779307755110958, + "learning_rate": 5e-05, + "loss": 1.2084, + "step": 941 + }, + { + "epoch": 1.32, + "grad_norm": 0.14605707408138002, + "learning_rate": 5e-05, + "loss": 1.2735, + "step": 942 + }, + { + "epoch": 1.32, + "grad_norm": 0.11628990548806367, + "learning_rate": 5e-05, + "loss": 1.266, + "step": 943 + }, + { + "epoch": 1.33, + "grad_norm": 0.10230427023192927, + "learning_rate": 5e-05, + "loss": 1.2164, + "step": 944 + }, + { + "epoch": 1.33, + "grad_norm": 0.16410793851226396, + "learning_rate": 5e-05, + "loss": 1.2743, + "step": 945 + }, + { + "epoch": 1.33, + "grad_norm": 0.10765862205008636, + "learning_rate": 5e-05, + "loss": 1.2685, + "step": 946 + }, + { + "epoch": 1.33, + "grad_norm": 0.09907993502415698, + "learning_rate": 5e-05, + "loss": 1.2559, + "step": 947 + }, + { + "epoch": 1.33, + 
"grad_norm": 0.11889063526263191, + "learning_rate": 5e-05, + "loss": 1.234, + "step": 948 + }, + { + "epoch": 1.33, + "grad_norm": 0.10767936872705047, + "learning_rate": 5e-05, + "loss": 1.1664, + "step": 949 + }, + { + "epoch": 1.33, + "grad_norm": 0.09749757641604646, + "learning_rate": 5e-05, + "loss": 1.3048, + "step": 950 + }, + { + "epoch": 1.34, + "grad_norm": 0.23069423873119332, + "learning_rate": 5e-05, + "loss": 1.1651, + "step": 951 + }, + { + "epoch": 1.34, + "grad_norm": 0.10700583434758547, + "learning_rate": 5e-05, + "loss": 1.1825, + "step": 952 + }, + { + "epoch": 1.34, + "grad_norm": 0.10396836363765337, + "learning_rate": 5e-05, + "loss": 1.1863, + "step": 953 + }, + { + "epoch": 1.34, + "grad_norm": 0.10052794694013932, + "learning_rate": 5e-05, + "loss": 1.1817, + "step": 954 + }, + { + "epoch": 1.34, + "grad_norm": 0.09993466906494232, + "learning_rate": 5e-05, + "loss": 1.234, + "step": 955 + }, + { + "epoch": 1.34, + "grad_norm": 0.11687322712115825, + "learning_rate": 5e-05, + "loss": 1.2533, + "step": 956 + }, + { + "epoch": 1.34, + "grad_norm": 0.10225631433831812, + "learning_rate": 5e-05, + "loss": 1.3213, + "step": 957 + }, + { + "epoch": 1.35, + "grad_norm": 0.20386069596611658, + "learning_rate": 5e-05, + "loss": 1.182, + "step": 958 + }, + { + "epoch": 1.35, + "grad_norm": 0.10453133927250215, + "learning_rate": 5e-05, + "loss": 1.1637, + "step": 959 + }, + { + "epoch": 1.35, + "grad_norm": 0.14175585587210737, + "learning_rate": 5e-05, + "loss": 1.2288, + "step": 960 + }, + { + "epoch": 1.35, + "grad_norm": 0.10504359148798725, + "learning_rate": 5e-05, + "loss": 1.1047, + "step": 961 + }, + { + "epoch": 1.35, + "grad_norm": 0.1372710003561213, + "learning_rate": 5e-05, + "loss": 1.2389, + "step": 962 + }, + { + "epoch": 1.35, + "grad_norm": 0.09652492703220977, + "learning_rate": 5e-05, + "loss": 1.244, + "step": 963 + }, + { + "epoch": 1.35, + "grad_norm": 0.12853818698223418, + "learning_rate": 5e-05, + "loss": 1.2166, + "step": 964 + }, + { + "epoch": 1.36, + "grad_norm": 0.09514313666309156, + "learning_rate": 5e-05, + "loss": 1.2089, + "step": 965 + }, + { + "epoch": 1.36, + "grad_norm": 0.10694618628695444, + "learning_rate": 5e-05, + "loss": 1.2235, + "step": 966 + }, + { + "epoch": 1.36, + "grad_norm": 0.12224798699692742, + "learning_rate": 5e-05, + "loss": 1.1813, + "step": 967 + }, + { + "epoch": 1.36, + "grad_norm": 0.26234747554071813, + "learning_rate": 5e-05, + "loss": 1.2717, + "step": 968 + }, + { + "epoch": 1.36, + "grad_norm": 0.2223576763234531, + "learning_rate": 5e-05, + "loss": 1.1696, + "step": 969 + }, + { + "epoch": 1.36, + "grad_norm": 0.09541227160678786, + "learning_rate": 5e-05, + "loss": 1.2806, + "step": 970 + }, + { + "epoch": 1.36, + "grad_norm": 0.190855658030098, + "learning_rate": 5e-05, + "loss": 1.1567, + "step": 971 + }, + { + "epoch": 1.37, + "grad_norm": 0.09497325579586648, + "learning_rate": 5e-05, + "loss": 1.1967, + "step": 972 + }, + { + "epoch": 1.37, + "grad_norm": 0.11088288919053314, + "learning_rate": 5e-05, + "loss": 1.2721, + "step": 973 + }, + { + "epoch": 1.37, + "grad_norm": 0.10796216347531845, + "learning_rate": 5e-05, + "loss": 1.189, + "step": 974 + }, + { + "epoch": 1.37, + "grad_norm": 0.1480195977338028, + "learning_rate": 5e-05, + "loss": 1.2218, + "step": 975 + }, + { + "epoch": 1.37, + "grad_norm": 0.09927193222114575, + "learning_rate": 5e-05, + "loss": 1.1921, + "step": 976 + }, + { + "epoch": 1.37, + "grad_norm": 0.12996014889963342, + "learning_rate": 5e-05, + "loss": 1.1949, + 
"step": 977 + }, + { + "epoch": 1.37, + "grad_norm": 0.10121985656769747, + "learning_rate": 5e-05, + "loss": 1.2615, + "step": 978 + }, + { + "epoch": 1.38, + "grad_norm": 0.12529184737097637, + "learning_rate": 5e-05, + "loss": 1.242, + "step": 979 + }, + { + "epoch": 1.38, + "grad_norm": 0.09263455602695328, + "learning_rate": 5e-05, + "loss": 1.114, + "step": 980 + }, + { + "epoch": 1.38, + "grad_norm": 0.10771968620871182, + "learning_rate": 5e-05, + "loss": 1.2078, + "step": 981 + }, + { + "epoch": 1.38, + "grad_norm": 0.10216253778570993, + "learning_rate": 5e-05, + "loss": 1.1327, + "step": 982 + }, + { + "epoch": 1.38, + "grad_norm": 0.10818819544058149, + "learning_rate": 5e-05, + "loss": 1.1586, + "step": 983 + }, + { + "epoch": 1.38, + "grad_norm": 0.12823555056526956, + "learning_rate": 5e-05, + "loss": 1.2688, + "step": 984 + }, + { + "epoch": 1.38, + "grad_norm": 0.09924598713576864, + "learning_rate": 5e-05, + "loss": 1.2897, + "step": 985 + }, + { + "epoch": 1.38, + "grad_norm": 0.10450140329683655, + "learning_rate": 5e-05, + "loss": 1.2864, + "step": 986 + }, + { + "epoch": 1.39, + "grad_norm": 0.10811779229962425, + "learning_rate": 5e-05, + "loss": 1.2154, + "step": 987 + }, + { + "epoch": 1.39, + "grad_norm": 0.12094047182333562, + "learning_rate": 5e-05, + "loss": 1.2081, + "step": 988 + }, + { + "epoch": 1.39, + "grad_norm": 0.4600497792934725, + "learning_rate": 5e-05, + "loss": 1.3033, + "step": 989 + }, + { + "epoch": 1.39, + "grad_norm": 0.10852237288651734, + "learning_rate": 5e-05, + "loss": 1.2158, + "step": 990 + }, + { + "epoch": 1.39, + "grad_norm": 0.10195887833984932, + "learning_rate": 5e-05, + "loss": 1.1503, + "step": 991 + }, + { + "epoch": 1.39, + "grad_norm": 0.10399482935464686, + "learning_rate": 5e-05, + "loss": 1.2045, + "step": 992 + }, + { + "epoch": 1.39, + "grad_norm": 0.11118589790644104, + "learning_rate": 5e-05, + "loss": 1.2563, + "step": 993 + }, + { + "epoch": 1.4, + "grad_norm": 0.10465906043886859, + "learning_rate": 5e-05, + "loss": 1.2064, + "step": 994 + }, + { + "epoch": 1.4, + "grad_norm": 0.10013573706817067, + "learning_rate": 5e-05, + "loss": 1.209, + "step": 995 + }, + { + "epoch": 1.4, + "grad_norm": 0.1071570526430657, + "learning_rate": 5e-05, + "loss": 1.275, + "step": 996 + }, + { + "epoch": 1.4, + "grad_norm": 0.10919713500303668, + "learning_rate": 5e-05, + "loss": 1.288, + "step": 997 + }, + { + "epoch": 1.4, + "grad_norm": 0.11456205050457402, + "learning_rate": 5e-05, + "loss": 1.2305, + "step": 998 + }, + { + "epoch": 1.4, + "grad_norm": 0.12399882106372813, + "learning_rate": 5e-05, + "loss": 1.2837, + "step": 999 + }, + { + "epoch": 1.4, + "grad_norm": 0.10832642295721785, + "learning_rate": 5e-05, + "loss": 1.29, + "step": 1000 + }, + { + "epoch": 1.41, + "grad_norm": 0.10366791152273643, + "learning_rate": 5e-05, + "loss": 1.2974, + "step": 1001 + }, + { + "epoch": 1.41, + "grad_norm": 0.1003360742671259, + "learning_rate": 5e-05, + "loss": 1.1462, + "step": 1002 + }, + { + "epoch": 1.41, + "grad_norm": 0.1287165700820769, + "learning_rate": 5e-05, + "loss": 1.1544, + "step": 1003 + }, + { + "epoch": 1.41, + "grad_norm": 0.10869493803592062, + "learning_rate": 5e-05, + "loss": 1.2541, + "step": 1004 + }, + { + "epoch": 1.41, + "grad_norm": 0.10140977622387513, + "learning_rate": 5e-05, + "loss": 1.2757, + "step": 1005 + }, + { + "epoch": 1.41, + "grad_norm": 0.1044449979187249, + "learning_rate": 5e-05, + "loss": 1.2509, + "step": 1006 + }, + { + "epoch": 1.41, + "grad_norm": 0.10164122941659054, + 
"learning_rate": 5e-05, + "loss": 1.1214, + "step": 1007 + }, + { + "epoch": 1.42, + "grad_norm": 0.09656524427061954, + "learning_rate": 5e-05, + "loss": 1.2794, + "step": 1008 + }, + { + "epoch": 1.42, + "grad_norm": 0.10316130824172369, + "learning_rate": 5e-05, + "loss": 1.2456, + "step": 1009 + }, + { + "epoch": 1.42, + "grad_norm": 0.12661507350057447, + "learning_rate": 5e-05, + "loss": 1.3331, + "step": 1010 + }, + { + "epoch": 1.42, + "grad_norm": 0.09343343695130192, + "learning_rate": 5e-05, + "loss": 1.2193, + "step": 1011 + }, + { + "epoch": 1.42, + "grad_norm": 0.10105563782443838, + "learning_rate": 5e-05, + "loss": 1.1847, + "step": 1012 + }, + { + "epoch": 1.42, + "grad_norm": 0.09945667547651259, + "learning_rate": 5e-05, + "loss": 1.3234, + "step": 1013 + }, + { + "epoch": 1.42, + "grad_norm": 0.10631412051783554, + "learning_rate": 5e-05, + "loss": 1.2156, + "step": 1014 + }, + { + "epoch": 1.43, + "grad_norm": 0.10136339520159555, + "learning_rate": 5e-05, + "loss": 1.2395, + "step": 1015 + }, + { + "epoch": 1.43, + "grad_norm": 0.10386181257649985, + "learning_rate": 5e-05, + "loss": 1.1837, + "step": 1016 + }, + { + "epoch": 1.43, + "grad_norm": 0.11322232984171478, + "learning_rate": 5e-05, + "loss": 1.2568, + "step": 1017 + }, + { + "epoch": 1.43, + "grad_norm": 0.15564747896162864, + "learning_rate": 5e-05, + "loss": 1.208, + "step": 1018 + }, + { + "epoch": 1.43, + "grad_norm": 0.09815366710701164, + "learning_rate": 5e-05, + "loss": 1.2358, + "step": 1019 + }, + { + "epoch": 1.43, + "grad_norm": 0.0947005716750396, + "learning_rate": 5e-05, + "loss": 1.2339, + "step": 1020 + }, + { + "epoch": 1.43, + "grad_norm": 0.10681616135760118, + "learning_rate": 5e-05, + "loss": 1.1831, + "step": 1021 + }, + { + "epoch": 1.44, + "grad_norm": 0.13281012673783055, + "learning_rate": 5e-05, + "loss": 1.2644, + "step": 1022 + }, + { + "epoch": 1.44, + "grad_norm": 0.09877299158515507, + "learning_rate": 5e-05, + "loss": 1.2352, + "step": 1023 + }, + { + "epoch": 1.44, + "grad_norm": 0.12293371631329547, + "learning_rate": 5e-05, + "loss": 1.2489, + "step": 1024 + }, + { + "epoch": 1.44, + "grad_norm": 0.09705893021636595, + "learning_rate": 5e-05, + "loss": 1.1578, + "step": 1025 + }, + { + "epoch": 1.44, + "grad_norm": 0.10973605966782467, + "learning_rate": 5e-05, + "loss": 1.186, + "step": 1026 + }, + { + "epoch": 1.44, + "grad_norm": 0.10491996108084253, + "learning_rate": 5e-05, + "loss": 1.2308, + "step": 1027 + }, + { + "epoch": 1.44, + "grad_norm": 0.1044888405375428, + "learning_rate": 5e-05, + "loss": 1.2694, + "step": 1028 + }, + { + "epoch": 1.45, + "grad_norm": 0.13136089252156674, + "learning_rate": 5e-05, + "loss": 1.2088, + "step": 1029 + }, + { + "epoch": 1.45, + "grad_norm": 0.10099304079305634, + "learning_rate": 5e-05, + "loss": 1.1863, + "step": 1030 + }, + { + "epoch": 1.45, + "grad_norm": 0.1460377875439273, + "learning_rate": 5e-05, + "loss": 1.098, + "step": 1031 + }, + { + "epoch": 1.45, + "grad_norm": 0.10861430882064095, + "learning_rate": 5e-05, + "loss": 1.1955, + "step": 1032 + }, + { + "epoch": 1.45, + "grad_norm": 0.10496775010185377, + "learning_rate": 5e-05, + "loss": 1.1734, + "step": 1033 + }, + { + "epoch": 1.45, + "grad_norm": 0.10446995033363628, + "learning_rate": 5e-05, + "loss": 1.1894, + "step": 1034 + }, + { + "epoch": 1.45, + "grad_norm": 0.10549243265636632, + "learning_rate": 5e-05, + "loss": 1.1358, + "step": 1035 + }, + { + "epoch": 1.46, + "grad_norm": 0.10896447810159171, + "learning_rate": 5e-05, + "loss": 1.2034, + "step": 
1036 + }, + { + "epoch": 1.46, + "grad_norm": 0.11061310703224005, + "learning_rate": 5e-05, + "loss": 1.1825, + "step": 1037 + }, + { + "epoch": 1.46, + "grad_norm": 0.16590775932321747, + "learning_rate": 5e-05, + "loss": 1.1903, + "step": 1038 + }, + { + "epoch": 1.46, + "grad_norm": 0.12514434847006461, + "learning_rate": 5e-05, + "loss": 1.3002, + "step": 1039 + }, + { + "epoch": 1.46, + "grad_norm": 0.15781803094359004, + "learning_rate": 5e-05, + "loss": 1.1637, + "step": 1040 + }, + { + "epoch": 1.46, + "grad_norm": 0.12413132010118058, + "learning_rate": 5e-05, + "loss": 1.2507, + "step": 1041 + }, + { + "epoch": 1.46, + "grad_norm": 0.18426087842596092, + "learning_rate": 5e-05, + "loss": 1.2318, + "step": 1042 + }, + { + "epoch": 1.46, + "grad_norm": 0.10635114094076911, + "learning_rate": 5e-05, + "loss": 1.2053, + "step": 1043 + }, + { + "epoch": 1.47, + "grad_norm": 0.10170843014452283, + "learning_rate": 5e-05, + "loss": 1.1673, + "step": 1044 + }, + { + "epoch": 1.47, + "grad_norm": 0.11318331074469846, + "learning_rate": 5e-05, + "loss": 1.2605, + "step": 1045 + }, + { + "epoch": 1.47, + "grad_norm": 0.17396111219741633, + "learning_rate": 5e-05, + "loss": 1.1856, + "step": 1046 + }, + { + "epoch": 1.47, + "grad_norm": 0.20923671082644354, + "learning_rate": 5e-05, + "loss": 1.2154, + "step": 1047 + }, + { + "epoch": 1.47, + "grad_norm": 0.11196845757564178, + "learning_rate": 5e-05, + "loss": 1.2044, + "step": 1048 + }, + { + "epoch": 1.47, + "grad_norm": 0.13055991020143282, + "learning_rate": 5e-05, + "loss": 1.1808, + "step": 1049 + }, + { + "epoch": 1.47, + "grad_norm": 0.10527238862728859, + "learning_rate": 5e-05, + "loss": 1.2132, + "step": 1050 + }, + { + "epoch": 1.48, + "grad_norm": 0.10907032731642057, + "learning_rate": 5e-05, + "loss": 1.2486, + "step": 1051 + }, + { + "epoch": 1.48, + "grad_norm": 0.11052964516050179, + "learning_rate": 5e-05, + "loss": 1.2098, + "step": 1052 + }, + { + "epoch": 1.48, + "grad_norm": 0.10449099870037604, + "learning_rate": 5e-05, + "loss": 1.3333, + "step": 1053 + }, + { + "epoch": 1.48, + "grad_norm": 0.1031383339280535, + "learning_rate": 5e-05, + "loss": 1.1824, + "step": 1054 + }, + { + "epoch": 1.48, + "grad_norm": 0.13227089922436905, + "learning_rate": 5e-05, + "loss": 1.2001, + "step": 1055 + }, + { + "epoch": 1.48, + "grad_norm": 0.09664835532955035, + "learning_rate": 5e-05, + "loss": 1.167, + "step": 1056 + }, + { + "epoch": 1.48, + "grad_norm": 0.11335851967004781, + "learning_rate": 5e-05, + "loss": 1.2281, + "step": 1057 + }, + { + "epoch": 1.49, + "grad_norm": 0.10498278152815924, + "learning_rate": 5e-05, + "loss": 1.1733, + "step": 1058 + }, + { + "epoch": 1.49, + "grad_norm": 0.10463428399728839, + "learning_rate": 5e-05, + "loss": 1.1869, + "step": 1059 + }, + { + "epoch": 1.49, + "grad_norm": 0.1067819882408397, + "learning_rate": 5e-05, + "loss": 1.2354, + "step": 1060 + }, + { + "epoch": 1.49, + "grad_norm": 0.106121227536998, + "learning_rate": 5e-05, + "loss": 1.2082, + "step": 1061 + }, + { + "epoch": 1.49, + "grad_norm": 0.12840621609678718, + "learning_rate": 5e-05, + "loss": 1.1806, + "step": 1062 + }, + { + "epoch": 1.49, + "grad_norm": 0.12471783967396649, + "learning_rate": 5e-05, + "loss": 1.2427, + "step": 1063 + }, + { + "epoch": 1.49, + "grad_norm": 0.10106792284786356, + "learning_rate": 5e-05, + "loss": 1.1918, + "step": 1064 + }, + { + "epoch": 1.5, + "grad_norm": 0.10469400899894957, + "learning_rate": 5e-05, + "loss": 1.3323, + "step": 1065 + }, + { + "epoch": 1.5, + "grad_norm": 
0.09841756090169458, + "learning_rate": 5e-05, + "loss": 1.1394, + "step": 1066 + }, + { + "epoch": 1.5, + "grad_norm": 0.10521064721286782, + "learning_rate": 5e-05, + "loss": 1.2369, + "step": 1067 + }, + { + "epoch": 1.5, + "grad_norm": 0.1485189439001766, + "learning_rate": 5e-05, + "loss": 1.1872, + "step": 1068 + }, + { + "epoch": 1.5, + "grad_norm": 0.1079971402508067, + "learning_rate": 5e-05, + "loss": 1.1736, + "step": 1069 + }, + { + "epoch": 1.5, + "grad_norm": 0.3116945764054355, + "learning_rate": 5e-05, + "loss": 1.1337, + "step": 1070 + }, + { + "epoch": 1.5, + "grad_norm": 0.100199655243572, + "learning_rate": 5e-05, + "loss": 1.2735, + "step": 1071 + }, + { + "epoch": 1.51, + "grad_norm": 0.10810114625745185, + "learning_rate": 5e-05, + "loss": 1.1165, + "step": 1072 + }, + { + "epoch": 1.51, + "grad_norm": 0.10661802966803025, + "learning_rate": 5e-05, + "loss": 1.211, + "step": 1073 + }, + { + "epoch": 1.51, + "grad_norm": 0.1702498815516622, + "learning_rate": 5e-05, + "loss": 1.2107, + "step": 1074 + }, + { + "epoch": 1.51, + "grad_norm": 0.19737300799778165, + "learning_rate": 5e-05, + "loss": 1.2667, + "step": 1075 + }, + { + "epoch": 1.51, + "grad_norm": 0.14465651500853066, + "learning_rate": 5e-05, + "loss": 1.1231, + "step": 1076 + }, + { + "epoch": 1.51, + "grad_norm": 0.10265693402616242, + "learning_rate": 5e-05, + "loss": 1.1786, + "step": 1077 + }, + { + "epoch": 1.51, + "grad_norm": 0.14591430498719865, + "learning_rate": 5e-05, + "loss": 1.1901, + "step": 1078 + }, + { + "epoch": 1.52, + "grad_norm": 0.10532373028754374, + "learning_rate": 5e-05, + "loss": 1.2289, + "step": 1079 + }, + { + "epoch": 1.52, + "grad_norm": 0.11182699139962098, + "learning_rate": 5e-05, + "loss": 1.1806, + "step": 1080 + }, + { + "epoch": 1.52, + "grad_norm": 0.11423113268976091, + "learning_rate": 5e-05, + "loss": 1.2026, + "step": 1081 + }, + { + "epoch": 1.52, + "grad_norm": 0.13063554823255874, + "learning_rate": 5e-05, + "loss": 1.2587, + "step": 1082 + }, + { + "epoch": 1.52, + "grad_norm": 0.10791638156088533, + "learning_rate": 5e-05, + "loss": 1.0991, + "step": 1083 + }, + { + "epoch": 1.52, + "grad_norm": 0.11938249787224578, + "learning_rate": 5e-05, + "loss": 1.2406, + "step": 1084 + }, + { + "epoch": 1.52, + "grad_norm": 0.10902651597381753, + "learning_rate": 5e-05, + "loss": 1.2697, + "step": 1085 + }, + { + "epoch": 1.53, + "grad_norm": 0.10486855124377796, + "learning_rate": 5e-05, + "loss": 1.1325, + "step": 1086 + }, + { + "epoch": 1.53, + "grad_norm": 0.3453109872403924, + "learning_rate": 5e-05, + "loss": 1.1767, + "step": 1087 + }, + { + "epoch": 1.53, + "grad_norm": 0.11239569153139466, + "learning_rate": 5e-05, + "loss": 1.0797, + "step": 1088 + }, + { + "epoch": 1.53, + "grad_norm": 0.45855681572495244, + "learning_rate": 5e-05, + "loss": 1.2631, + "step": 1089 + }, + { + "epoch": 1.53, + "grad_norm": 0.1036145956775789, + "learning_rate": 5e-05, + "loss": 1.1923, + "step": 1090 + }, + { + "epoch": 1.53, + "grad_norm": 0.11355495090838764, + "learning_rate": 5e-05, + "loss": 1.2224, + "step": 1091 + }, + { + "epoch": 1.53, + "grad_norm": 0.10402736950459299, + "learning_rate": 5e-05, + "loss": 1.1325, + "step": 1092 + }, + { + "epoch": 1.54, + "grad_norm": 0.10464257964892076, + "learning_rate": 5e-05, + "loss": 1.3013, + "step": 1093 + }, + { + "epoch": 1.54, + "grad_norm": 0.2997750672110947, + "learning_rate": 5e-05, + "loss": 1.2847, + "step": 1094 + }, + { + "epoch": 1.54, + "grad_norm": 0.1038403157372341, + "learning_rate": 5e-05, + "loss": 
1.1845, + "step": 1095 + }, + { + "epoch": 1.54, + "grad_norm": 0.10749180100362335, + "learning_rate": 5e-05, + "loss": 1.1651, + "step": 1096 + }, + { + "epoch": 1.54, + "grad_norm": 0.1239660329028446, + "learning_rate": 5e-05, + "loss": 1.2477, + "step": 1097 + }, + { + "epoch": 1.54, + "grad_norm": 0.14474184991191533, + "learning_rate": 5e-05, + "loss": 1.1866, + "step": 1098 + }, + { + "epoch": 1.54, + "grad_norm": 0.11037461451933651, + "learning_rate": 5e-05, + "loss": 1.1542, + "step": 1099 + }, + { + "epoch": 1.54, + "grad_norm": 0.106841448364413, + "learning_rate": 5e-05, + "loss": 1.2771, + "step": 1100 + }, + { + "epoch": 1.55, + "grad_norm": 0.11810389395422928, + "learning_rate": 5e-05, + "loss": 1.0569, + "step": 1101 + }, + { + "epoch": 1.55, + "grad_norm": 0.10533151633570667, + "learning_rate": 5e-05, + "loss": 1.288, + "step": 1102 + }, + { + "epoch": 1.55, + "grad_norm": 0.10717168365401805, + "learning_rate": 5e-05, + "loss": 1.1776, + "step": 1103 + }, + { + "epoch": 1.55, + "grad_norm": 0.0961612976475172, + "learning_rate": 5e-05, + "loss": 1.2076, + "step": 1104 + }, + { + "epoch": 1.55, + "grad_norm": 0.12243032566601864, + "learning_rate": 5e-05, + "loss": 1.2372, + "step": 1105 + }, + { + "epoch": 1.55, + "grad_norm": 0.10895416526413164, + "learning_rate": 5e-05, + "loss": 1.237, + "step": 1106 + }, + { + "epoch": 1.55, + "grad_norm": 0.15851208507114167, + "learning_rate": 5e-05, + "loss": 1.2776, + "step": 1107 + }, + { + "epoch": 1.56, + "grad_norm": 0.10017642682972655, + "learning_rate": 5e-05, + "loss": 1.1887, + "step": 1108 + }, + { + "epoch": 1.56, + "grad_norm": 0.11014110698632447, + "learning_rate": 5e-05, + "loss": 1.2839, + "step": 1109 + }, + { + "epoch": 1.56, + "grad_norm": 0.12226489365159669, + "learning_rate": 5e-05, + "loss": 1.3129, + "step": 1110 + }, + { + "epoch": 1.56, + "grad_norm": 0.12807374471444058, + "learning_rate": 5e-05, + "loss": 1.2599, + "step": 1111 + }, + { + "epoch": 1.56, + "grad_norm": 0.12259938643247845, + "learning_rate": 5e-05, + "loss": 1.1938, + "step": 1112 + }, + { + "epoch": 1.56, + "grad_norm": 0.0980980052185939, + "learning_rate": 5e-05, + "loss": 1.2428, + "step": 1113 + }, + { + "epoch": 1.56, + "grad_norm": 0.12319771896361019, + "learning_rate": 5e-05, + "loss": 1.316, + "step": 1114 + }, + { + "epoch": 1.57, + "grad_norm": 0.09716149046365014, + "learning_rate": 5e-05, + "loss": 1.1714, + "step": 1115 + }, + { + "epoch": 1.57, + "grad_norm": 0.10397951019093706, + "learning_rate": 5e-05, + "loss": 1.1, + "step": 1116 + }, + { + "epoch": 1.57, + "grad_norm": 0.10995082972128228, + "learning_rate": 5e-05, + "loss": 1.2488, + "step": 1117 + }, + { + "epoch": 1.57, + "grad_norm": 0.12451854257642488, + "learning_rate": 5e-05, + "loss": 1.2055, + "step": 1118 + }, + { + "epoch": 1.57, + "grad_norm": 0.1531437089478829, + "learning_rate": 5e-05, + "loss": 1.2104, + "step": 1119 + }, + { + "epoch": 1.57, + "grad_norm": 0.11493923874545948, + "learning_rate": 5e-05, + "loss": 1.2339, + "step": 1120 + }, + { + "epoch": 1.57, + "grad_norm": 0.12461445575609231, + "learning_rate": 5e-05, + "loss": 1.1558, + "step": 1121 + }, + { + "epoch": 1.58, + "grad_norm": 0.10520018544273198, + "learning_rate": 5e-05, + "loss": 1.0898, + "step": 1122 + }, + { + "epoch": 1.58, + "grad_norm": 0.10894887361146934, + "learning_rate": 5e-05, + "loss": 1.2124, + "step": 1123 + }, + { + "epoch": 1.58, + "grad_norm": 0.09840018449855767, + "learning_rate": 5e-05, + "loss": 1.2204, + "step": 1124 + }, + { + "epoch": 1.58, + 
"grad_norm": 0.11199058641987784, + "learning_rate": 5e-05, + "loss": 1.2546, + "step": 1125 + }, + { + "epoch": 1.58, + "grad_norm": 0.12408245293592914, + "learning_rate": 5e-05, + "loss": 1.1638, + "step": 1126 + }, + { + "epoch": 1.58, + "grad_norm": 0.10127521215119754, + "learning_rate": 5e-05, + "loss": 1.2653, + "step": 1127 + }, + { + "epoch": 1.58, + "grad_norm": 0.1399086035344601, + "learning_rate": 5e-05, + "loss": 1.2332, + "step": 1128 + }, + { + "epoch": 1.59, + "grad_norm": 0.11426115988130074, + "learning_rate": 5e-05, + "loss": 1.1353, + "step": 1129 + }, + { + "epoch": 1.59, + "grad_norm": 0.10395875125555352, + "learning_rate": 5e-05, + "loss": 1.1236, + "step": 1130 + }, + { + "epoch": 1.59, + "grad_norm": 0.10162172390578378, + "learning_rate": 5e-05, + "loss": 1.2529, + "step": 1131 + }, + { + "epoch": 1.59, + "grad_norm": 0.15889830320606954, + "learning_rate": 5e-05, + "loss": 1.2249, + "step": 1132 + }, + { + "epoch": 1.59, + "grad_norm": 0.10069562029591411, + "learning_rate": 5e-05, + "loss": 1.0911, + "step": 1133 + }, + { + "epoch": 1.59, + "grad_norm": 0.0988902826921912, + "learning_rate": 5e-05, + "loss": 1.2811, + "step": 1134 + }, + { + "epoch": 1.59, + "grad_norm": 0.10245904846153597, + "learning_rate": 5e-05, + "loss": 1.1767, + "step": 1135 + }, + { + "epoch": 1.6, + "grad_norm": 0.10011785338352583, + "learning_rate": 5e-05, + "loss": 1.231, + "step": 1136 + }, + { + "epoch": 1.6, + "grad_norm": 0.10463545323443652, + "learning_rate": 5e-05, + "loss": 1.1936, + "step": 1137 + }, + { + "epoch": 1.6, + "grad_norm": 0.10083707261198763, + "learning_rate": 5e-05, + "loss": 1.2133, + "step": 1138 + }, + { + "epoch": 1.6, + "grad_norm": 0.10492203552637583, + "learning_rate": 5e-05, + "loss": 1.2816, + "step": 1139 + }, + { + "epoch": 1.6, + "grad_norm": 0.12054430455453068, + "learning_rate": 5e-05, + "loss": 1.2614, + "step": 1140 + }, + { + "epoch": 1.6, + "grad_norm": 0.10179671312141375, + "learning_rate": 5e-05, + "loss": 1.1926, + "step": 1141 + }, + { + "epoch": 1.6, + "grad_norm": 0.1151371912064686, + "learning_rate": 5e-05, + "loss": 1.1968, + "step": 1142 + }, + { + "epoch": 1.61, + "grad_norm": 0.10399576378706286, + "learning_rate": 5e-05, + "loss": 1.2151, + "step": 1143 + }, + { + "epoch": 1.61, + "grad_norm": 0.12203535636977965, + "learning_rate": 5e-05, + "loss": 1.1495, + "step": 1144 + }, + { + "epoch": 1.61, + "grad_norm": 0.09290063978036708, + "learning_rate": 5e-05, + "loss": 1.2679, + "step": 1145 + }, + { + "epoch": 1.61, + "grad_norm": 0.10898289097335016, + "learning_rate": 5e-05, + "loss": 1.2008, + "step": 1146 + }, + { + "epoch": 1.61, + "grad_norm": 0.10148033448384093, + "learning_rate": 5e-05, + "loss": 1.322, + "step": 1147 + }, + { + "epoch": 1.61, + "grad_norm": 0.10834959113369981, + "learning_rate": 5e-05, + "loss": 1.2096, + "step": 1148 + }, + { + "epoch": 1.61, + "grad_norm": 0.10065777111501409, + "learning_rate": 5e-05, + "loss": 1.2288, + "step": 1149 + }, + { + "epoch": 1.62, + "grad_norm": 0.09631018069832553, + "learning_rate": 5e-05, + "loss": 1.2207, + "step": 1150 + }, + { + "epoch": 1.62, + "grad_norm": 0.3230353288983818, + "learning_rate": 5e-05, + "loss": 1.2501, + "step": 1151 + }, + { + "epoch": 1.62, + "grad_norm": 0.12702499368592196, + "learning_rate": 5e-05, + "loss": 1.2306, + "step": 1152 + }, + { + "epoch": 1.62, + "grad_norm": 0.10936572946474664, + "learning_rate": 5e-05, + "loss": 1.2404, + "step": 1153 + }, + { + "epoch": 1.62, + "grad_norm": 0.1177538068676181, + "learning_rate": 5e-05, 
+ "loss": 1.2443, + "step": 1154 + }, + { + "epoch": 1.62, + "grad_norm": 0.10975006494672532, + "learning_rate": 5e-05, + "loss": 1.1036, + "step": 1155 + }, + { + "epoch": 1.62, + "grad_norm": 0.2197390336947409, + "learning_rate": 5e-05, + "loss": 1.2376, + "step": 1156 + }, + { + "epoch": 1.62, + "grad_norm": 0.10452116607674644, + "learning_rate": 5e-05, + "loss": 1.2399, + "step": 1157 + }, + { + "epoch": 1.63, + "grad_norm": 0.11607607047242494, + "learning_rate": 5e-05, + "loss": 1.1981, + "step": 1158 + }, + { + "epoch": 1.63, + "grad_norm": 0.10380754969804883, + "learning_rate": 5e-05, + "loss": 1.262, + "step": 1159 + }, + { + "epoch": 1.63, + "grad_norm": 0.11185829212786501, + "learning_rate": 5e-05, + "loss": 1.2712, + "step": 1160 + }, + { + "epoch": 1.63, + "grad_norm": 0.10179469200333552, + "learning_rate": 5e-05, + "loss": 1.2246, + "step": 1161 + }, + { + "epoch": 1.63, + "grad_norm": 0.11110636528358402, + "learning_rate": 5e-05, + "loss": 1.1113, + "step": 1162 + }, + { + "epoch": 1.63, + "grad_norm": 0.10042761084255272, + "learning_rate": 5e-05, + "loss": 1.267, + "step": 1163 + }, + { + "epoch": 1.63, + "grad_norm": 0.11312232886115434, + "learning_rate": 5e-05, + "loss": 1.274, + "step": 1164 + }, + { + "epoch": 1.64, + "grad_norm": 0.10089653412943035, + "learning_rate": 5e-05, + "loss": 1.139, + "step": 1165 + }, + { + "epoch": 1.64, + "grad_norm": 0.09343133217706687, + "learning_rate": 5e-05, + "loss": 1.1974, + "step": 1166 + }, + { + "epoch": 1.64, + "grad_norm": 0.12073081879301166, + "learning_rate": 5e-05, + "loss": 1.2073, + "step": 1167 + }, + { + "epoch": 1.64, + "grad_norm": 0.09852467020934617, + "learning_rate": 5e-05, + "loss": 1.2665, + "step": 1168 + }, + { + "epoch": 1.64, + "grad_norm": 0.10136884495861127, + "learning_rate": 5e-05, + "loss": 1.2409, + "step": 1169 + }, + { + "epoch": 1.64, + "grad_norm": 0.10022427968306054, + "learning_rate": 5e-05, + "loss": 1.1998, + "step": 1170 + }, + { + "epoch": 1.64, + "grad_norm": 0.0910322515063694, + "learning_rate": 5e-05, + "loss": 1.2836, + "step": 1171 + }, + { + "epoch": 1.65, + "grad_norm": 0.10437507814306135, + "learning_rate": 5e-05, + "loss": 1.2973, + "step": 1172 + }, + { + "epoch": 1.65, + "grad_norm": 0.10199438944121582, + "learning_rate": 5e-05, + "loss": 1.271, + "step": 1173 + }, + { + "epoch": 1.65, + "grad_norm": 0.09920544631583435, + "learning_rate": 5e-05, + "loss": 1.2717, + "step": 1174 + }, + { + "epoch": 1.65, + "grad_norm": 0.10804525830212407, + "learning_rate": 5e-05, + "loss": 1.23, + "step": 1175 + }, + { + "epoch": 1.65, + "grad_norm": 0.09912634214578551, + "learning_rate": 5e-05, + "loss": 1.2064, + "step": 1176 + }, + { + "epoch": 1.65, + "grad_norm": 0.12054697422829558, + "learning_rate": 5e-05, + "loss": 1.1208, + "step": 1177 + }, + { + "epoch": 1.65, + "grad_norm": 0.10122850210400991, + "learning_rate": 5e-05, + "loss": 1.1655, + "step": 1178 + }, + { + "epoch": 1.66, + "grad_norm": 0.0965851330147051, + "learning_rate": 5e-05, + "loss": 1.2477, + "step": 1179 + }, + { + "epoch": 1.66, + "grad_norm": 0.11218789799430838, + "learning_rate": 5e-05, + "loss": 1.1991, + "step": 1180 + }, + { + "epoch": 1.66, + "grad_norm": 0.15542937261692477, + "learning_rate": 5e-05, + "loss": 1.1494, + "step": 1181 + }, + { + "epoch": 1.66, + "grad_norm": 0.111541983858468, + "learning_rate": 5e-05, + "loss": 1.1802, + "step": 1182 + }, + { + "epoch": 1.66, + "grad_norm": 0.111410084332699, + "learning_rate": 5e-05, + "loss": 1.1395, + "step": 1183 + }, + { + "epoch": 1.66, 
+ "grad_norm": 0.11619923573838167, + "learning_rate": 5e-05, + "loss": 1.3272, + "step": 1184 + }, + { + "epoch": 1.66, + "grad_norm": 0.10718441596644573, + "learning_rate": 5e-05, + "loss": 1.2276, + "step": 1185 + }, + { + "epoch": 1.67, + "grad_norm": 0.10801899868334196, + "learning_rate": 5e-05, + "loss": 1.0912, + "step": 1186 + }, + { + "epoch": 1.67, + "grad_norm": 0.15050847078799887, + "learning_rate": 5e-05, + "loss": 1.1554, + "step": 1187 + }, + { + "epoch": 1.67, + "grad_norm": 0.10919376860447183, + "learning_rate": 5e-05, + "loss": 1.1637, + "step": 1188 + }, + { + "epoch": 1.67, + "grad_norm": 0.10622165564978144, + "learning_rate": 5e-05, + "loss": 1.2543, + "step": 1189 + }, + { + "epoch": 1.67, + "grad_norm": 0.10810630217231784, + "learning_rate": 5e-05, + "loss": 1.2308, + "step": 1190 + }, + { + "epoch": 1.67, + "grad_norm": 0.10840706011043, + "learning_rate": 5e-05, + "loss": 1.276, + "step": 1191 + }, + { + "epoch": 1.67, + "grad_norm": 0.10246667696227883, + "learning_rate": 5e-05, + "loss": 1.2435, + "step": 1192 + }, + { + "epoch": 1.68, + "grad_norm": 0.0976182543352327, + "learning_rate": 5e-05, + "loss": 1.2223, + "step": 1193 + }, + { + "epoch": 1.68, + "grad_norm": 0.11112765591620033, + "learning_rate": 5e-05, + "loss": 1.1617, + "step": 1194 + }, + { + "epoch": 1.68, + "grad_norm": 0.09630889540310973, + "learning_rate": 5e-05, + "loss": 1.1156, + "step": 1195 + }, + { + "epoch": 1.68, + "grad_norm": 0.11227327495026782, + "learning_rate": 5e-05, + "loss": 1.1249, + "step": 1196 + }, + { + "epoch": 1.68, + "grad_norm": 0.12233812487333268, + "learning_rate": 5e-05, + "loss": 1.214, + "step": 1197 + }, + { + "epoch": 1.68, + "grad_norm": 0.09859226961924858, + "learning_rate": 5e-05, + "loss": 1.1707, + "step": 1198 + }, + { + "epoch": 1.68, + "grad_norm": 0.1219145840608869, + "learning_rate": 5e-05, + "loss": 1.1669, + "step": 1199 + }, + { + "epoch": 1.69, + "grad_norm": 0.10829044049284217, + "learning_rate": 5e-05, + "loss": 1.2756, + "step": 1200 + }, + { + "epoch": 1.69, + "grad_norm": 0.10932578968684246, + "learning_rate": 5e-05, + "loss": 1.2864, + "step": 1201 + }, + { + "epoch": 1.69, + "grad_norm": 0.12670280491084548, + "learning_rate": 5e-05, + "loss": 1.289, + "step": 1202 + }, + { + "epoch": 1.69, + "grad_norm": 0.09685526139167519, + "learning_rate": 5e-05, + "loss": 1.2881, + "step": 1203 + }, + { + "epoch": 1.69, + "grad_norm": 0.10583813949907528, + "learning_rate": 5e-05, + "loss": 1.2822, + "step": 1204 + }, + { + "epoch": 1.69, + "grad_norm": 0.10945130691209246, + "learning_rate": 5e-05, + "loss": 1.2269, + "step": 1205 + }, + { + "epoch": 1.69, + "grad_norm": 0.10921995222605822, + "learning_rate": 5e-05, + "loss": 1.2643, + "step": 1206 + }, + { + "epoch": 1.7, + "grad_norm": 0.0983530242970981, + "learning_rate": 5e-05, + "loss": 1.1867, + "step": 1207 + }, + { + "epoch": 1.7, + "grad_norm": 0.10929467514266258, + "learning_rate": 5e-05, + "loss": 1.2048, + "step": 1208 + }, + { + "epoch": 1.7, + "grad_norm": 0.10226448992517749, + "learning_rate": 5e-05, + "loss": 1.246, + "step": 1209 + }, + { + "epoch": 1.7, + "grad_norm": 0.12407437670976282, + "learning_rate": 5e-05, + "loss": 1.1378, + "step": 1210 + }, + { + "epoch": 1.7, + "grad_norm": 0.10460464410563199, + "learning_rate": 5e-05, + "loss": 1.2073, + "step": 1211 + }, + { + "epoch": 1.7, + "grad_norm": 0.1130481700493754, + "learning_rate": 5e-05, + "loss": 1.2146, + "step": 1212 + }, + { + "epoch": 1.7, + "grad_norm": 0.10898937707078646, + "learning_rate": 5e-05, + 
"loss": 1.2017, + "step": 1213 + }, + { + "epoch": 1.71, + "grad_norm": 0.10458208968254373, + "learning_rate": 5e-05, + "loss": 1.2241, + "step": 1214 + }, + { + "epoch": 1.71, + "grad_norm": 0.17157051460666153, + "learning_rate": 5e-05, + "loss": 1.2247, + "step": 1215 + }, + { + "epoch": 1.71, + "grad_norm": 0.11301909258134905, + "learning_rate": 5e-05, + "loss": 1.1742, + "step": 1216 + }, + { + "epoch": 1.71, + "grad_norm": 0.11090992344071807, + "learning_rate": 5e-05, + "loss": 1.2621, + "step": 1217 + }, + { + "epoch": 1.71, + "grad_norm": 0.19061726604876061, + "learning_rate": 5e-05, + "loss": 1.321, + "step": 1218 + }, + { + "epoch": 1.71, + "grad_norm": 0.11306033650929138, + "learning_rate": 5e-05, + "loss": 1.1946, + "step": 1219 + }, + { + "epoch": 1.71, + "grad_norm": 0.11054797742330863, + "learning_rate": 5e-05, + "loss": 1.1247, + "step": 1220 + }, + { + "epoch": 1.71, + "grad_norm": 0.12409173074359779, + "learning_rate": 5e-05, + "loss": 1.2783, + "step": 1221 + }, + { + "epoch": 1.72, + "grad_norm": 0.10116741290769664, + "learning_rate": 5e-05, + "loss": 1.2847, + "step": 1222 + }, + { + "epoch": 1.72, + "grad_norm": 0.10482518955704544, + "learning_rate": 5e-05, + "loss": 1.2904, + "step": 1223 + }, + { + "epoch": 1.72, + "grad_norm": 0.11115021294905844, + "learning_rate": 5e-05, + "loss": 1.2046, + "step": 1224 + }, + { + "epoch": 1.72, + "grad_norm": 0.1543937030512322, + "learning_rate": 5e-05, + "loss": 1.3135, + "step": 1225 + }, + { + "epoch": 1.72, + "grad_norm": 0.1024341754657176, + "learning_rate": 5e-05, + "loss": 1.1804, + "step": 1226 + }, + { + "epoch": 1.72, + "grad_norm": 0.10162401906455482, + "learning_rate": 5e-05, + "loss": 1.2233, + "step": 1227 + }, + { + "epoch": 1.72, + "grad_norm": 0.14714386441625504, + "learning_rate": 5e-05, + "loss": 1.3606, + "step": 1228 + }, + { + "epoch": 1.73, + "grad_norm": 0.14316956030100939, + "learning_rate": 5e-05, + "loss": 1.2321, + "step": 1229 + }, + { + "epoch": 1.73, + "grad_norm": 0.10717046646861364, + "learning_rate": 5e-05, + "loss": 1.2936, + "step": 1230 + }, + { + "epoch": 1.73, + "grad_norm": 0.10306250658030879, + "learning_rate": 5e-05, + "loss": 1.2919, + "step": 1231 + }, + { + "epoch": 1.73, + "grad_norm": 0.10398905424648734, + "learning_rate": 5e-05, + "loss": 1.2025, + "step": 1232 + }, + { + "epoch": 1.73, + "grad_norm": 0.10393126846253141, + "learning_rate": 5e-05, + "loss": 1.3078, + "step": 1233 + }, + { + "epoch": 1.73, + "grad_norm": 0.1228474547464257, + "learning_rate": 5e-05, + "loss": 1.2569, + "step": 1234 + }, + { + "epoch": 1.73, + "grad_norm": 0.18494956328524312, + "learning_rate": 5e-05, + "loss": 1.2019, + "step": 1235 + }, + { + "epoch": 1.74, + "grad_norm": 0.2089798375546016, + "learning_rate": 5e-05, + "loss": 1.2641, + "step": 1236 + }, + { + "epoch": 1.74, + "grad_norm": 0.1120639345674154, + "learning_rate": 5e-05, + "loss": 1.1805, + "step": 1237 + }, + { + "epoch": 1.74, + "grad_norm": 0.09869073334943725, + "learning_rate": 5e-05, + "loss": 1.1377, + "step": 1238 + }, + { + "epoch": 1.74, + "grad_norm": 0.0987004825345963, + "learning_rate": 5e-05, + "loss": 1.2975, + "step": 1239 + }, + { + "epoch": 1.74, + "grad_norm": 0.10117622099339955, + "learning_rate": 5e-05, + "loss": 1.2602, + "step": 1240 + }, + { + "epoch": 1.74, + "grad_norm": 0.27026743100111356, + "learning_rate": 5e-05, + "loss": 1.2129, + "step": 1241 + }, + { + "epoch": 1.74, + "grad_norm": 0.09600849716739375, + "learning_rate": 5e-05, + "loss": 1.2822, + "step": 1242 + }, + { + "epoch": 
1.75, + "grad_norm": 0.09195470019743175, + "learning_rate": 5e-05, + "loss": 1.214, + "step": 1243 + }, + { + "epoch": 1.75, + "grad_norm": 0.21139686590673398, + "learning_rate": 5e-05, + "loss": 1.2527, + "step": 1244 + }, + { + "epoch": 1.75, + "grad_norm": 0.35125858508648117, + "learning_rate": 5e-05, + "loss": 1.059, + "step": 1245 + }, + { + "epoch": 1.75, + "grad_norm": 0.11357337969627455, + "learning_rate": 5e-05, + "loss": 1.2338, + "step": 1246 + }, + { + "epoch": 1.75, + "grad_norm": 0.1643801080348834, + "learning_rate": 5e-05, + "loss": 1.3028, + "step": 1247 + }, + { + "epoch": 1.75, + "grad_norm": 0.12090900207903225, + "learning_rate": 5e-05, + "loss": 1.3095, + "step": 1248 + }, + { + "epoch": 1.75, + "grad_norm": 0.11153656319798944, + "learning_rate": 5e-05, + "loss": 1.3215, + "step": 1249 + }, + { + "epoch": 1.76, + "grad_norm": 0.11707006072514332, + "learning_rate": 5e-05, + "loss": 1.1271, + "step": 1250 + }, + { + "epoch": 1.76, + "grad_norm": 0.11753420218744483, + "learning_rate": 5e-05, + "loss": 1.2146, + "step": 1251 + }, + { + "epoch": 1.76, + "grad_norm": 0.10357254508306929, + "learning_rate": 5e-05, + "loss": 1.1059, + "step": 1252 + }, + { + "epoch": 1.76, + "grad_norm": 0.09663866569935589, + "learning_rate": 5e-05, + "loss": 1.192, + "step": 1253 + }, + { + "epoch": 1.76, + "grad_norm": 0.1575776172648202, + "learning_rate": 5e-05, + "loss": 1.1684, + "step": 1254 + }, + { + "epoch": 1.76, + "grad_norm": 0.10326535906232145, + "learning_rate": 5e-05, + "loss": 1.1345, + "step": 1255 + }, + { + "epoch": 1.76, + "grad_norm": 0.10209137977079849, + "learning_rate": 5e-05, + "loss": 1.219, + "step": 1256 + }, + { + "epoch": 1.77, + "grad_norm": 0.13229065013863076, + "learning_rate": 5e-05, + "loss": 1.2622, + "step": 1257 + }, + { + "epoch": 1.77, + "grad_norm": 0.0906911459500279, + "learning_rate": 5e-05, + "loss": 1.181, + "step": 1258 + }, + { + "epoch": 1.77, + "grad_norm": 0.12023731835311281, + "learning_rate": 5e-05, + "loss": 1.2514, + "step": 1259 + }, + { + "epoch": 1.77, + "grad_norm": 0.10366397873766577, + "learning_rate": 5e-05, + "loss": 1.2729, + "step": 1260 + }, + { + "epoch": 1.77, + "grad_norm": 0.1247952402268157, + "learning_rate": 5e-05, + "loss": 1.2957, + "step": 1261 + }, + { + "epoch": 1.77, + "grad_norm": 0.09558557583720882, + "learning_rate": 5e-05, + "loss": 1.1755, + "step": 1262 + }, + { + "epoch": 1.77, + "grad_norm": 0.09793069365544155, + "learning_rate": 5e-05, + "loss": 1.2805, + "step": 1263 + }, + { + "epoch": 1.78, + "grad_norm": 0.11905319553833751, + "learning_rate": 5e-05, + "loss": 1.1488, + "step": 1264 + }, + { + "epoch": 1.78, + "grad_norm": 0.12667648557528083, + "learning_rate": 5e-05, + "loss": 1.1817, + "step": 1265 + }, + { + "epoch": 1.78, + "grad_norm": 0.17320168990875578, + "learning_rate": 5e-05, + "loss": 1.1387, + "step": 1266 + }, + { + "epoch": 1.78, + "grad_norm": 0.1278704824906583, + "learning_rate": 5e-05, + "loss": 1.2801, + "step": 1267 + }, + { + "epoch": 1.78, + "grad_norm": 0.09857146933191926, + "learning_rate": 5e-05, + "loss": 1.2289, + "step": 1268 + }, + { + "epoch": 1.78, + "grad_norm": 0.11033316991181887, + "learning_rate": 5e-05, + "loss": 1.2567, + "step": 1269 + }, + { + "epoch": 1.78, + "grad_norm": 0.9573844042386731, + "learning_rate": 5e-05, + "loss": 1.2047, + "step": 1270 + }, + { + "epoch": 1.79, + "grad_norm": 0.1051122250248032, + "learning_rate": 5e-05, + "loss": 1.2515, + "step": 1271 + }, + { + "epoch": 1.79, + "grad_norm": 0.1437255080680373, + 
"learning_rate": 5e-05, + "loss": 1.2482, + "step": 1272 + }, + { + "epoch": 1.79, + "grad_norm": 0.0991494729841633, + "learning_rate": 5e-05, + "loss": 1.2842, + "step": 1273 + }, + { + "epoch": 1.79, + "grad_norm": 0.0972414511934356, + "learning_rate": 5e-05, + "loss": 1.1815, + "step": 1274 + }, + { + "epoch": 1.79, + "grad_norm": 0.11533577527558092, + "learning_rate": 5e-05, + "loss": 1.2547, + "step": 1275 + }, + { + "epoch": 1.79, + "grad_norm": 0.17134502324827014, + "learning_rate": 5e-05, + "loss": 1.113, + "step": 1276 + }, + { + "epoch": 1.79, + "grad_norm": 0.10589478661864486, + "learning_rate": 5e-05, + "loss": 1.2786, + "step": 1277 + }, + { + "epoch": 1.79, + "grad_norm": 0.10422140839238593, + "learning_rate": 5e-05, + "loss": 1.2973, + "step": 1278 + }, + { + "epoch": 1.8, + "grad_norm": 0.10640150155678055, + "learning_rate": 5e-05, + "loss": 1.1525, + "step": 1279 + }, + { + "epoch": 1.8, + "grad_norm": 0.10199851060213333, + "learning_rate": 5e-05, + "loss": 1.1229, + "step": 1280 + }, + { + "epoch": 1.8, + "grad_norm": 0.1223477356844984, + "learning_rate": 5e-05, + "loss": 1.1436, + "step": 1281 + }, + { + "epoch": 1.8, + "grad_norm": 0.09854064368856022, + "learning_rate": 5e-05, + "loss": 1.2149, + "step": 1282 + }, + { + "epoch": 1.8, + "grad_norm": 0.12059049956928705, + "learning_rate": 5e-05, + "loss": 1.2748, + "step": 1283 + }, + { + "epoch": 1.8, + "grad_norm": 0.10009505515058299, + "learning_rate": 5e-05, + "loss": 1.2496, + "step": 1284 + }, + { + "epoch": 1.8, + "grad_norm": 0.1057531566096711, + "learning_rate": 5e-05, + "loss": 1.2263, + "step": 1285 + }, + { + "epoch": 1.81, + "grad_norm": 0.10577195617365408, + "learning_rate": 5e-05, + "loss": 1.2097, + "step": 1286 + }, + { + "epoch": 1.81, + "grad_norm": 0.1980193546773249, + "learning_rate": 5e-05, + "loss": 1.2558, + "step": 1287 + }, + { + "epoch": 1.81, + "grad_norm": 0.10620780530453745, + "learning_rate": 5e-05, + "loss": 1.1121, + "step": 1288 + }, + { + "epoch": 1.81, + "grad_norm": 0.10364754509557948, + "learning_rate": 5e-05, + "loss": 1.2278, + "step": 1289 + }, + { + "epoch": 1.81, + "grad_norm": 0.1268398552835006, + "learning_rate": 5e-05, + "loss": 1.2659, + "step": 1290 + }, + { + "epoch": 1.81, + "grad_norm": 0.11118097887284956, + "learning_rate": 5e-05, + "loss": 1.2535, + "step": 1291 + }, + { + "epoch": 1.81, + "grad_norm": 0.18499773138538006, + "learning_rate": 5e-05, + "loss": 1.1819, + "step": 1292 + }, + { + "epoch": 1.82, + "grad_norm": 0.14794438042883248, + "learning_rate": 5e-05, + "loss": 1.2723, + "step": 1293 + }, + { + "epoch": 1.82, + "grad_norm": 0.10378984669475717, + "learning_rate": 5e-05, + "loss": 1.3342, + "step": 1294 + }, + { + "epoch": 1.82, + "grad_norm": 0.3840623510789001, + "learning_rate": 5e-05, + "loss": 1.2473, + "step": 1295 + }, + { + "epoch": 1.82, + "grad_norm": 0.13499019720704672, + "learning_rate": 5e-05, + "loss": 1.1873, + "step": 1296 + }, + { + "epoch": 1.82, + "grad_norm": 0.11083415018038016, + "learning_rate": 5e-05, + "loss": 1.3038, + "step": 1297 + }, + { + "epoch": 1.82, + "grad_norm": 0.09774055063425115, + "learning_rate": 5e-05, + "loss": 1.1524, + "step": 1298 + }, + { + "epoch": 1.82, + "grad_norm": 0.09765011460774314, + "learning_rate": 5e-05, + "loss": 1.1562, + "step": 1299 + }, + { + "epoch": 1.83, + "grad_norm": 0.12959918677475585, + "learning_rate": 5e-05, + "loss": 1.2451, + "step": 1300 + }, + { + "epoch": 1.83, + "grad_norm": 0.10307521658627149, + "learning_rate": 5e-05, + "loss": 1.2482, + "step": 1301 + 
}, + { + "epoch": 1.83, + "grad_norm": 0.10889075695796573, + "learning_rate": 5e-05, + "loss": 1.2032, + "step": 1302 + }, + { + "epoch": 1.83, + "grad_norm": 0.10274428682413997, + "learning_rate": 5e-05, + "loss": 1.2095, + "step": 1303 + }, + { + "epoch": 1.83, + "grad_norm": 0.09870991788841747, + "learning_rate": 5e-05, + "loss": 1.1679, + "step": 1304 + }, + { + "epoch": 1.83, + "grad_norm": 0.1029976635960567, + "learning_rate": 5e-05, + "loss": 1.197, + "step": 1305 + }, + { + "epoch": 1.83, + "grad_norm": 0.11565596154580056, + "learning_rate": 5e-05, + "loss": 1.2518, + "step": 1306 + }, + { + "epoch": 1.84, + "grad_norm": 0.11358697176280404, + "learning_rate": 5e-05, + "loss": 1.231, + "step": 1307 + }, + { + "epoch": 1.84, + "grad_norm": 0.11584857396336613, + "learning_rate": 5e-05, + "loss": 1.2431, + "step": 1308 + }, + { + "epoch": 1.84, + "grad_norm": 0.10126436030422854, + "learning_rate": 5e-05, + "loss": 1.3353, + "step": 1309 + }, + { + "epoch": 1.84, + "grad_norm": 0.12509494789947387, + "learning_rate": 5e-05, + "loss": 1.2041, + "step": 1310 + }, + { + "epoch": 1.84, + "grad_norm": 0.1184327130883153, + "learning_rate": 5e-05, + "loss": 1.233, + "step": 1311 + }, + { + "epoch": 1.84, + "grad_norm": 0.10464714266664168, + "learning_rate": 5e-05, + "loss": 1.2261, + "step": 1312 + }, + { + "epoch": 1.84, + "grad_norm": 0.12514815355665904, + "learning_rate": 5e-05, + "loss": 1.1561, + "step": 1313 + }, + { + "epoch": 1.85, + "grad_norm": 0.12311991854330846, + "learning_rate": 5e-05, + "loss": 1.1087, + "step": 1314 + }, + { + "epoch": 1.85, + "grad_norm": 0.09295582031960702, + "learning_rate": 5e-05, + "loss": 1.2152, + "step": 1315 + }, + { + "epoch": 1.85, + "grad_norm": 0.14449588211116057, + "learning_rate": 5e-05, + "loss": 1.2647, + "step": 1316 + }, + { + "epoch": 1.85, + "grad_norm": 0.10235199468766547, + "learning_rate": 5e-05, + "loss": 1.2049, + "step": 1317 + }, + { + "epoch": 1.85, + "grad_norm": 0.11350422082691311, + "learning_rate": 5e-05, + "loss": 1.2558, + "step": 1318 + }, + { + "epoch": 1.85, + "grad_norm": 0.12144887004278876, + "learning_rate": 5e-05, + "loss": 1.2685, + "step": 1319 + }, + { + "epoch": 1.85, + "grad_norm": 0.10173889195958385, + "learning_rate": 5e-05, + "loss": 1.2801, + "step": 1320 + }, + { + "epoch": 1.86, + "grad_norm": 0.09330810178399741, + "learning_rate": 5e-05, + "loss": 1.2512, + "step": 1321 + }, + { + "epoch": 1.86, + "grad_norm": 0.20161533425887576, + "learning_rate": 5e-05, + "loss": 1.1789, + "step": 1322 + }, + { + "epoch": 1.86, + "grad_norm": 0.09777849402957198, + "learning_rate": 5e-05, + "loss": 1.2417, + "step": 1323 + }, + { + "epoch": 1.86, + "grad_norm": 0.10024832098891394, + "learning_rate": 5e-05, + "loss": 1.2915, + "step": 1324 + }, + { + "epoch": 1.86, + "grad_norm": 0.11258369856811815, + "learning_rate": 5e-05, + "loss": 1.2514, + "step": 1325 + }, + { + "epoch": 1.86, + "grad_norm": 0.12484690094005808, + "learning_rate": 5e-05, + "loss": 1.3151, + "step": 1326 + }, + { + "epoch": 1.86, + "grad_norm": 0.0935577575829671, + "learning_rate": 5e-05, + "loss": 1.2521, + "step": 1327 + }, + { + "epoch": 1.87, + "grad_norm": 0.1210684422035205, + "learning_rate": 5e-05, + "loss": 1.1265, + "step": 1328 + }, + { + "epoch": 1.87, + "grad_norm": 0.09207941251078382, + "learning_rate": 5e-05, + "loss": 1.2198, + "step": 1329 + }, + { + "epoch": 1.87, + "grad_norm": 0.12309544282600764, + "learning_rate": 5e-05, + "loss": 1.2424, + "step": 1330 + }, + { + "epoch": 1.87, + "grad_norm": 
0.10064914884191513, + "learning_rate": 5e-05, + "loss": 1.187, + "step": 1331 + }, + { + "epoch": 1.87, + "grad_norm": 0.13995361473968207, + "learning_rate": 5e-05, + "loss": 1.1866, + "step": 1332 + }, + { + "epoch": 1.87, + "grad_norm": 0.10713021403389816, + "learning_rate": 5e-05, + "loss": 1.2787, + "step": 1333 + }, + { + "epoch": 1.87, + "grad_norm": 0.12386001378771107, + "learning_rate": 5e-05, + "loss": 1.2367, + "step": 1334 + }, + { + "epoch": 1.88, + "grad_norm": 0.14112063187119603, + "learning_rate": 5e-05, + "loss": 1.1763, + "step": 1335 + }, + { + "epoch": 1.88, + "grad_norm": 0.10156292242573838, + "learning_rate": 5e-05, + "loss": 1.2417, + "step": 1336 + }, + { + "epoch": 1.88, + "grad_norm": 0.5023353070726944, + "learning_rate": 5e-05, + "loss": 1.2568, + "step": 1337 + }, + { + "epoch": 1.88, + "grad_norm": 0.11115935835706549, + "learning_rate": 5e-05, + "loss": 1.2205, + "step": 1338 + }, + { + "epoch": 1.88, + "grad_norm": 0.10000845594414244, + "learning_rate": 5e-05, + "loss": 1.2893, + "step": 1339 + }, + { + "epoch": 1.88, + "grad_norm": 0.10195023988800359, + "learning_rate": 5e-05, + "loss": 1.2801, + "step": 1340 + }, + { + "epoch": 1.88, + "grad_norm": 0.3067232561424585, + "learning_rate": 5e-05, + "loss": 1.2299, + "step": 1341 + }, + { + "epoch": 1.88, + "grad_norm": 0.10005294161976021, + "learning_rate": 5e-05, + "loss": 1.2635, + "step": 1342 + }, + { + "epoch": 1.89, + "grad_norm": 0.0926599419663251, + "learning_rate": 5e-05, + "loss": 1.2824, + "step": 1343 + }, + { + "epoch": 1.89, + "grad_norm": 0.16997862964953497, + "learning_rate": 5e-05, + "loss": 1.2689, + "step": 1344 + }, + { + "epoch": 1.89, + "grad_norm": 0.10425373316842217, + "learning_rate": 5e-05, + "loss": 1.1412, + "step": 1345 + }, + { + "epoch": 1.89, + "grad_norm": 0.10116618450255803, + "learning_rate": 5e-05, + "loss": 1.3225, + "step": 1346 + }, + { + "epoch": 1.89, + "grad_norm": 0.09712297309677503, + "learning_rate": 5e-05, + "loss": 1.2455, + "step": 1347 + }, + { + "epoch": 1.89, + "grad_norm": 0.09526406291414628, + "learning_rate": 5e-05, + "loss": 1.2307, + "step": 1348 + }, + { + "epoch": 1.89, + "grad_norm": 0.10066316805603513, + "learning_rate": 5e-05, + "loss": 1.1478, + "step": 1349 + }, + { + "epoch": 1.9, + "grad_norm": 0.10664910103833464, + "learning_rate": 5e-05, + "loss": 1.1798, + "step": 1350 + }, + { + "epoch": 1.9, + "grad_norm": 0.09373044345316567, + "learning_rate": 5e-05, + "loss": 1.243, + "step": 1351 + }, + { + "epoch": 1.9, + "grad_norm": 0.09009331726599838, + "learning_rate": 5e-05, + "loss": 1.2168, + "step": 1352 + }, + { + "epoch": 1.9, + "grad_norm": 0.2485364577633018, + "learning_rate": 5e-05, + "loss": 1.2433, + "step": 1353 + }, + { + "epoch": 1.9, + "grad_norm": 0.09513898211847613, + "learning_rate": 5e-05, + "loss": 1.2525, + "step": 1354 + }, + { + "epoch": 1.9, + "grad_norm": 0.13329990401583716, + "learning_rate": 5e-05, + "loss": 1.2002, + "step": 1355 + }, + { + "epoch": 1.9, + "grad_norm": 0.10449667016871154, + "learning_rate": 5e-05, + "loss": 1.1616, + "step": 1356 + }, + { + "epoch": 1.91, + "grad_norm": 0.09994582726380859, + "learning_rate": 5e-05, + "loss": 1.3169, + "step": 1357 + }, + { + "epoch": 1.91, + "grad_norm": 0.09877612528109277, + "learning_rate": 5e-05, + "loss": 1.3307, + "step": 1358 + }, + { + "epoch": 1.91, + "grad_norm": 0.11189316406658582, + "learning_rate": 5e-05, + "loss": 1.2293, + "step": 1359 + }, + { + "epoch": 1.91, + "grad_norm": 0.10185356183272931, + "learning_rate": 5e-05, + "loss": 
1.234, + "step": 1360 + }, + { + "epoch": 1.91, + "grad_norm": 0.09507739812915995, + "learning_rate": 5e-05, + "loss": 1.279, + "step": 1361 + }, + { + "epoch": 1.91, + "grad_norm": 0.1163942518914868, + "learning_rate": 5e-05, + "loss": 1.1431, + "step": 1362 + }, + { + "epoch": 1.91, + "grad_norm": 0.103983513284656, + "learning_rate": 5e-05, + "loss": 1.1965, + "step": 1363 + }, + { + "epoch": 1.92, + "grad_norm": 0.10693226780267162, + "learning_rate": 5e-05, + "loss": 1.2534, + "step": 1364 + }, + { + "epoch": 1.92, + "grad_norm": 0.10701702402784265, + "learning_rate": 5e-05, + "loss": 1.2336, + "step": 1365 + }, + { + "epoch": 1.92, + "grad_norm": 0.09508301897524707, + "learning_rate": 5e-05, + "loss": 1.1621, + "step": 1366 + }, + { + "epoch": 1.92, + "grad_norm": 0.13472707990301636, + "learning_rate": 5e-05, + "loss": 1.255, + "step": 1367 + }, + { + "epoch": 1.92, + "grad_norm": 0.09638639506299534, + "learning_rate": 5e-05, + "loss": 1.2698, + "step": 1368 + }, + { + "epoch": 1.92, + "grad_norm": 0.148927462211784, + "learning_rate": 5e-05, + "loss": 1.2054, + "step": 1369 + }, + { + "epoch": 1.92, + "grad_norm": 0.09799235041711472, + "learning_rate": 5e-05, + "loss": 1.2038, + "step": 1370 + }, + { + "epoch": 1.93, + "grad_norm": 0.09420136770495892, + "learning_rate": 5e-05, + "loss": 1.2759, + "step": 1371 + }, + { + "epoch": 1.93, + "grad_norm": 0.10327592707615818, + "learning_rate": 5e-05, + "loss": 1.3487, + "step": 1372 + }, + { + "epoch": 1.93, + "grad_norm": 0.10648587942616909, + "learning_rate": 5e-05, + "loss": 1.2778, + "step": 1373 + }, + { + "epoch": 1.93, + "grad_norm": 0.10077774041822993, + "learning_rate": 5e-05, + "loss": 1.1275, + "step": 1374 + }, + { + "epoch": 1.93, + "grad_norm": 0.10851390832620712, + "learning_rate": 5e-05, + "loss": 1.2849, + "step": 1375 + }, + { + "epoch": 1.93, + "grad_norm": 0.10685085492856919, + "learning_rate": 5e-05, + "loss": 1.213, + "step": 1376 + }, + { + "epoch": 1.93, + "grad_norm": 0.33132745678315667, + "learning_rate": 5e-05, + "loss": 1.213, + "step": 1377 + }, + { + "epoch": 1.94, + "grad_norm": 0.10144441234398532, + "learning_rate": 5e-05, + "loss": 1.2017, + "step": 1378 + }, + { + "epoch": 1.94, + "grad_norm": 0.11228015380290499, + "learning_rate": 5e-05, + "loss": 1.1263, + "step": 1379 + }, + { + "epoch": 1.94, + "grad_norm": 0.10619109907523792, + "learning_rate": 5e-05, + "loss": 1.2028, + "step": 1380 + }, + { + "epoch": 1.94, + "grad_norm": 0.09927912077152588, + "learning_rate": 5e-05, + "loss": 1.1571, + "step": 1381 + }, + { + "epoch": 1.94, + "grad_norm": 0.09836226270654248, + "learning_rate": 5e-05, + "loss": 1.263, + "step": 1382 + }, + { + "epoch": 1.94, + "grad_norm": 0.10152642676398088, + "learning_rate": 5e-05, + "loss": 1.2906, + "step": 1383 + }, + { + "epoch": 1.94, + "grad_norm": 0.09751898942054647, + "learning_rate": 5e-05, + "loss": 1.2299, + "step": 1384 + }, + { + "epoch": 1.95, + "grad_norm": 0.09928849404743591, + "learning_rate": 5e-05, + "loss": 1.2162, + "step": 1385 + }, + { + "epoch": 1.95, + "grad_norm": 0.12489116214026824, + "learning_rate": 5e-05, + "loss": 1.3218, + "step": 1386 + }, + { + "epoch": 1.95, + "grad_norm": 0.10882496396878796, + "learning_rate": 5e-05, + "loss": 1.1562, + "step": 1387 + }, + { + "epoch": 1.95, + "grad_norm": 0.10726176666870928, + "learning_rate": 5e-05, + "loss": 1.3121, + "step": 1388 + }, + { + "epoch": 1.95, + "grad_norm": 0.12451799281699823, + "learning_rate": 5e-05, + "loss": 1.2435, + "step": 1389 + }, + { + "epoch": 1.95, + 
"grad_norm": 0.1063201573835307, + "learning_rate": 5e-05, + "loss": 1.1881, + "step": 1390 + }, + { + "epoch": 1.95, + "grad_norm": 0.09591833849880978, + "learning_rate": 5e-05, + "loss": 1.0813, + "step": 1391 + }, + { + "epoch": 1.96, + "grad_norm": 0.3608436435573168, + "learning_rate": 5e-05, + "loss": 1.2783, + "step": 1392 + }, + { + "epoch": 1.96, + "grad_norm": 0.09891988026600457, + "learning_rate": 5e-05, + "loss": 1.2458, + "step": 1393 + }, + { + "epoch": 1.96, + "grad_norm": 0.10372056524311288, + "learning_rate": 5e-05, + "loss": 1.2561, + "step": 1394 + }, + { + "epoch": 1.96, + "grad_norm": 0.11010379879458275, + "learning_rate": 5e-05, + "loss": 1.1742, + "step": 1395 + }, + { + "epoch": 1.96, + "grad_norm": 0.11577814654777499, + "learning_rate": 5e-05, + "loss": 1.2133, + "step": 1396 + }, + { + "epoch": 1.96, + "grad_norm": 0.09703009817213572, + "learning_rate": 5e-05, + "loss": 1.2666, + "step": 1397 + }, + { + "epoch": 1.96, + "grad_norm": 0.10605693309881119, + "learning_rate": 5e-05, + "loss": 1.211, + "step": 1398 + }, + { + "epoch": 1.96, + "grad_norm": 0.0953969154407503, + "learning_rate": 5e-05, + "loss": 1.2302, + "step": 1399 + }, + { + "epoch": 1.97, + "grad_norm": 0.1029669098774183, + "learning_rate": 5e-05, + "loss": 1.2036, + "step": 1400 + }, + { + "epoch": 1.97, + "grad_norm": 0.09681823509122052, + "learning_rate": 5e-05, + "loss": 1.2951, + "step": 1401 + }, + { + "epoch": 1.97, + "grad_norm": 0.11807096310067436, + "learning_rate": 5e-05, + "loss": 1.2065, + "step": 1402 + }, + { + "epoch": 1.97, + "grad_norm": 0.10137146652403635, + "learning_rate": 5e-05, + "loss": 1.194, + "step": 1403 + }, + { + "epoch": 1.97, + "grad_norm": 0.11503441758843483, + "learning_rate": 5e-05, + "loss": 1.1471, + "step": 1404 + }, + { + "epoch": 1.97, + "grad_norm": 0.11321304873658063, + "learning_rate": 5e-05, + "loss": 1.1208, + "step": 1405 + }, + { + "epoch": 1.97, + "grad_norm": 0.09959691816355418, + "learning_rate": 5e-05, + "loss": 1.2474, + "step": 1406 + }, + { + "epoch": 1.98, + "grad_norm": 0.10519968487918019, + "learning_rate": 5e-05, + "loss": 1.1389, + "step": 1407 + }, + { + "epoch": 1.98, + "grad_norm": 0.09696908055955931, + "learning_rate": 5e-05, + "loss": 1.2006, + "step": 1408 + }, + { + "epoch": 1.98, + "grad_norm": 0.10702366666252838, + "learning_rate": 5e-05, + "loss": 1.2117, + "step": 1409 + }, + { + "epoch": 1.98, + "grad_norm": 0.10076704123424733, + "learning_rate": 5e-05, + "loss": 1.2251, + "step": 1410 + }, + { + "epoch": 1.98, + "grad_norm": 0.0962787350921078, + "learning_rate": 5e-05, + "loss": 1.2286, + "step": 1411 + }, + { + "epoch": 1.98, + "grad_norm": 0.1102003262142478, + "learning_rate": 5e-05, + "loss": 1.1348, + "step": 1412 + }, + { + "epoch": 1.98, + "grad_norm": 0.23382426797734182, + "learning_rate": 5e-05, + "loss": 1.2689, + "step": 1413 + }, + { + "epoch": 1.99, + "grad_norm": 0.09619878014992354, + "learning_rate": 5e-05, + "loss": 1.2346, + "step": 1414 + }, + { + "epoch": 1.99, + "grad_norm": 0.10064237772902627, + "learning_rate": 5e-05, + "loss": 1.273, + "step": 1415 + }, + { + "epoch": 1.99, + "grad_norm": 0.11448682188068181, + "learning_rate": 5e-05, + "loss": 1.1405, + "step": 1416 + }, + { + "epoch": 1.99, + "grad_norm": 0.09456193036336086, + "learning_rate": 5e-05, + "loss": 1.2273, + "step": 1417 + }, + { + "epoch": 1.99, + "grad_norm": 0.10228441609601326, + "learning_rate": 5e-05, + "loss": 1.1198, + "step": 1418 + }, + { + "epoch": 1.99, + "grad_norm": 0.09922567877919913, + "learning_rate": 
5e-05, + "loss": 1.2631, + "step": 1419 + }, + { + "epoch": 1.99, + "grad_norm": 0.19004659340585958, + "learning_rate": 5e-05, + "loss": 1.1986, + "step": 1420 + }, + { + "epoch": 2.0, + "grad_norm": 0.10754500601228272, + "learning_rate": 5e-05, + "loss": 1.1851, + "step": 1421 + }, + { + "epoch": 2.0, + "grad_norm": 0.09350976680922973, + "learning_rate": 5e-05, + "loss": 1.3649, + "step": 1422 + }, + { + "epoch": 2.0, + "grad_norm": 0.09001639847887033, + "learning_rate": 5e-05, + "loss": 1.224, + "step": 1423 + }, + { + "epoch": 2.0, + "grad_norm": 0.12001633209272844, + "learning_rate": 5e-05, + "loss": 1.1618, + "step": 1424 + }, + { + "epoch": 2.0, + "step": 1424, + "total_flos": 6427604718452736.0, + "train_loss": 1.233065366703138, + "train_runtime": 53767.7286, + "train_samples_per_second": 0.847, + "train_steps_per_second": 0.026 + } + ], + "logging_steps": 1.0, + "max_steps": 1424, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "total_flos": 6427604718452736.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}