diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7381 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 27.51196172248804, + "eval_steps": 1000, + "global_step": 92000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03, + "grad_norm": 3.539609432220459, + "learning_rate": 4.99925228054434e-05, + "loss": 2.134, + "step": 100 + }, + { + "epoch": 0.06, + "grad_norm": 3.197829246520996, + "learning_rate": 4.997756841633019e-05, + "loss": 0.6178, + "step": 200 + }, + { + "epoch": 0.09, + "grad_norm": 3.3991429805755615, + "learning_rate": 4.996261402721699e-05, + "loss": 0.5496, + "step": 300 + }, + { + "epoch": 0.12, + "grad_norm": 3.072633743286133, + "learning_rate": 4.9947659638103784e-05, + "loss": 0.5228, + "step": 400 + }, + { + "epoch": 0.15, + "grad_norm": 2.4815468788146973, + "learning_rate": 4.993270524899058e-05, + "loss": 0.5102, + "step": 500 + }, + { + "epoch": 0.18, + "grad_norm": 2.794753313064575, + "learning_rate": 4.991775085987738e-05, + "loss": 0.4746, + "step": 600 + }, + { + "epoch": 0.21, + "grad_norm": 2.1388251781463623, + "learning_rate": 4.9902796470764176e-05, + "loss": 0.4769, + "step": 700 + }, + { + "epoch": 0.24, + "grad_norm": 2.518214225769043, + "learning_rate": 4.988784208165096e-05, + "loss": 0.4476, + "step": 800 + }, + { + "epoch": 0.27, + "grad_norm": 4.257823467254639, + "learning_rate": 4.987288769253776e-05, + "loss": 0.439, + "step": 900 + }, + { + "epoch": 0.3, + "grad_norm": 2.0235888957977295, + "learning_rate": 4.985793330342456e-05, + "loss": 0.4465, + "step": 1000 + }, + { + "epoch": 0.3, + "eval_loss": 0.34466782212257385, + "eval_precision": 0.7649398815576958, + "eval_recall": 0.7874318790603159, + "eval_runtime": 321.2695, + "eval_samples_per_second": 41.629, + "eval_steps_per_second": 1.301, + "step": 1000 + }, + { + "epoch": 0.33, + "grad_norm": 2.372622489929199, + "learning_rate": 4.984297891431135e-05, + "loss": 0.438, + "step": 1100 + }, + { + "epoch": 0.36, + "grad_norm": 2.184081792831421, + "learning_rate": 4.982802452519815e-05, + "loss": 0.4319, + "step": 1200 + }, + { + "epoch": 0.39, + "grad_norm": 1.180004358291626, + "learning_rate": 4.981307013608494e-05, + "loss": 0.4153, + "step": 1300 + }, + { + "epoch": 0.42, + "grad_norm": 1.8515098094940186, + "learning_rate": 4.979811574697174e-05, + "loss": 0.4107, + "step": 1400 + }, + { + "epoch": 0.45, + "grad_norm": 2.0762712955474854, + "learning_rate": 4.978316135785853e-05, + "loss": 0.4087, + "step": 1500 + }, + { + "epoch": 0.48, + "grad_norm": 1.6716846227645874, + "learning_rate": 4.9768206968745326e-05, + "loss": 0.4082, + "step": 1600 + }, + { + "epoch": 0.51, + "grad_norm": 2.9515812397003174, + "learning_rate": 4.9753252579632126e-05, + "loss": 0.398, + "step": 1700 + }, + { + "epoch": 0.54, + "grad_norm": 1.9658855199813843, + "learning_rate": 4.973829819051892e-05, + "loss": 0.393, + "step": 1800 + }, + { + "epoch": 0.57, + "grad_norm": 1.9613778591156006, + "learning_rate": 4.972334380140571e-05, + "loss": 0.3904, + "step": 1900 + }, + { + "epoch": 0.6, + "grad_norm": 2.7774882316589355, + "learning_rate": 4.970838941229251e-05, + "loss": 0.3794, + "step": 2000 + }, + { + "epoch": 0.6, + "eval_loss": 0.310618132352829, + "eval_precision": 0.7516943243620137, + "eval_recall": 0.8298285045721852, + "eval_runtime": 320.9754, + "eval_samples_per_second": 41.667, + "eval_steps_per_second": 1.302, + "step": 2000 + }, + { + "epoch": 0.63, + "grad_norm": 1.4382622241973877, + "learning_rate": 4.969343502317931e-05, + "loss": 0.369, + "step": 2100 + }, + { + "epoch": 0.66, + "grad_norm": 1.813565731048584, + "learning_rate": 4.96784806340661e-05, + "loss": 0.3751, + "step": 2200 + }, + { + "epoch": 0.69, + "grad_norm": 2.279954195022583, + "learning_rate": 4.9663526244952897e-05, + "loss": 0.3804, + "step": 2300 + }, + { + "epoch": 0.72, + "grad_norm": 1.9376351833343506, + "learning_rate": 4.9648571855839696e-05, + "loss": 0.3611, + "step": 2400 + }, + { + "epoch": 0.75, + "grad_norm": 2.2867352962493896, + "learning_rate": 4.963361746672648e-05, + "loss": 0.3739, + "step": 2500 + }, + { + "epoch": 0.78, + "grad_norm": 2.132394313812256, + "learning_rate": 4.961866307761328e-05, + "loss": 0.3669, + "step": 2600 + }, + { + "epoch": 0.81, + "grad_norm": 2.0541863441467285, + "learning_rate": 4.9603708688500075e-05, + "loss": 0.366, + "step": 2700 + }, + { + "epoch": 0.84, + "grad_norm": 2.1414847373962402, + "learning_rate": 4.9588754299386874e-05, + "loss": 0.3535, + "step": 2800 + }, + { + "epoch": 0.87, + "grad_norm": 1.3949612379074097, + "learning_rate": 4.957379991027367e-05, + "loss": 0.3684, + "step": 2900 + }, + { + "epoch": 0.9, + "grad_norm": 1.8921570777893066, + "learning_rate": 4.955884552116046e-05, + "loss": 0.3556, + "step": 3000 + }, + { + "epoch": 0.9, + "eval_loss": 0.290554404258728, + "eval_precision": 0.79493216033703, + "eval_recall": 0.7901105329597586, + "eval_runtime": 307.7262, + "eval_samples_per_second": 43.461, + "eval_steps_per_second": 1.358, + "step": 3000 + }, + { + "epoch": 0.93, + "grad_norm": 1.6217349767684937, + "learning_rate": 4.954389113204726e-05, + "loss": 0.3566, + "step": 3100 + }, + { + "epoch": 0.96, + "grad_norm": 1.524946928024292, + "learning_rate": 4.952893674293405e-05, + "loss": 0.3477, + "step": 3200 + }, + { + "epoch": 0.99, + "grad_norm": 1.6807836294174194, + "learning_rate": 4.9513982353820846e-05, + "loss": 0.3409, + "step": 3300 + }, + { + "epoch": 1.02, + "grad_norm": 1.5750257968902588, + "learning_rate": 4.9499027964707645e-05, + "loss": 0.3178, + "step": 3400 + }, + { + "epoch": 1.05, + "grad_norm": 1.43153715133667, + "learning_rate": 4.9484073575594445e-05, + "loss": 0.2888, + "step": 3500 + }, + { + "epoch": 1.08, + "grad_norm": 1.4886215925216675, + "learning_rate": 4.946911918648123e-05, + "loss": 0.3153, + "step": 3600 + }, + { + "epoch": 1.11, + "grad_norm": 2.2148983478546143, + "learning_rate": 4.945416479736803e-05, + "loss": 0.3114, + "step": 3700 + }, + { + "epoch": 1.14, + "grad_norm": 1.3632937669754028, + "learning_rate": 4.9439210408254824e-05, + "loss": 0.3031, + "step": 3800 + }, + { + "epoch": 1.17, + "grad_norm": 1.8350048065185547, + "learning_rate": 4.9424256019141617e-05, + "loss": 0.292, + "step": 3900 + }, + { + "epoch": 1.2, + "grad_norm": 1.1402252912521362, + "learning_rate": 4.9409301630028416e-05, + "loss": 0.2983, + "step": 4000 + }, + { + "epoch": 1.2, + "eval_loss": 0.2781643867492676, + "eval_precision": 0.7788883753177721, + "eval_recall": 0.8301363958249947, + "eval_runtime": 307.2732, + "eval_samples_per_second": 43.525, + "eval_steps_per_second": 1.36, + "step": 4000 + }, + { + "epoch": 1.23, + "grad_norm": 1.2367932796478271, + "learning_rate": 4.939434724091521e-05, + "loss": 0.2894, + "step": 4100 + }, + { + "epoch": 1.26, + "grad_norm": 1.4055671691894531, + "learning_rate": 4.937939285180201e-05, + "loss": 0.2847, + "step": 4200 + }, + { + "epoch": 1.29, + "grad_norm": 1.910565972328186, + "learning_rate": 4.93644384626888e-05, + "loss": 0.2917, + "step": 4300 + }, + { + "epoch": 1.32, + "grad_norm": 1.9085345268249512, + "learning_rate": 4.9349484073575595e-05, + "loss": 0.2934, + "step": 4400 + }, + { + "epoch": 1.35, + "grad_norm": 1.5550158023834229, + "learning_rate": 4.9334529684462394e-05, + "loss": 0.2726, + "step": 4500 + }, + { + "epoch": 1.38, + "grad_norm": 2.1685421466827393, + "learning_rate": 4.931957529534919e-05, + "loss": 0.3077, + "step": 4600 + }, + { + "epoch": 1.41, + "grad_norm": 1.7528005838394165, + "learning_rate": 4.930462090623598e-05, + "loss": 0.2919, + "step": 4700 + }, + { + "epoch": 1.44, + "grad_norm": 1.804412841796875, + "learning_rate": 4.928966651712278e-05, + "loss": 0.278, + "step": 4800 + }, + { + "epoch": 1.47, + "grad_norm": 2.430739164352417, + "learning_rate": 4.927471212800957e-05, + "loss": 0.2901, + "step": 4900 + }, + { + "epoch": 1.5, + "grad_norm": 1.5466407537460327, + "learning_rate": 4.9259757738896365e-05, + "loss": 0.2886, + "step": 5000 + }, + { + "epoch": 1.5, + "eval_loss": 0.27095386385917664, + "eval_precision": 0.7892478844902066, + "eval_recall": 0.8212999168693618, + "eval_runtime": 308.5531, + "eval_samples_per_second": 43.344, + "eval_steps_per_second": 1.355, + "step": 5000 + }, + { + "epoch": 1.53, + "grad_norm": 1.1303741931915283, + "learning_rate": 4.9244803349783165e-05, + "loss": 0.291, + "step": 5100 + }, + { + "epoch": 1.56, + "grad_norm": 1.3640042543411255, + "learning_rate": 4.922984896066996e-05, + "loss": 0.2897, + "step": 5200 + }, + { + "epoch": 1.58, + "grad_norm": 1.9915575981140137, + "learning_rate": 4.921489457155675e-05, + "loss": 0.2798, + "step": 5300 + }, + { + "epoch": 1.61, + "grad_norm": 1.574576735496521, + "learning_rate": 4.919994018244355e-05, + "loss": 0.2856, + "step": 5400 + }, + { + "epoch": 1.64, + "grad_norm": 1.9231148958206177, + "learning_rate": 4.918498579333034e-05, + "loss": 0.2819, + "step": 5500 + }, + { + "epoch": 1.67, + "grad_norm": 2.171637773513794, + "learning_rate": 4.917003140421714e-05, + "loss": 0.2892, + "step": 5600 + }, + { + "epoch": 1.7, + "grad_norm": 1.7447925806045532, + "learning_rate": 4.9155077015103936e-05, + "loss": 0.2837, + "step": 5700 + }, + { + "epoch": 1.73, + "grad_norm": 2.282715320587158, + "learning_rate": 4.914012262599073e-05, + "loss": 0.2888, + "step": 5800 + }, + { + "epoch": 1.76, + "grad_norm": 2.041062831878662, + "learning_rate": 4.912516823687753e-05, + "loss": 0.2733, + "step": 5900 + }, + { + "epoch": 1.79, + "grad_norm": 1.3900405168533325, + "learning_rate": 4.911021384776432e-05, + "loss": 0.2982, + "step": 6000 + }, + { + "epoch": 1.79, + "eval_loss": 0.24861453473567963, + "eval_precision": 0.7945360585297875, + "eval_recall": 0.8426059915637797, + "eval_runtime": 306.7263, + "eval_samples_per_second": 43.602, + "eval_steps_per_second": 1.363, + "step": 6000 + }, + { + "epoch": 1.82, + "grad_norm": 2.156783103942871, + "learning_rate": 4.9095259458651114e-05, + "loss": 0.2883, + "step": 6100 + }, + { + "epoch": 1.85, + "grad_norm": 1.6421504020690918, + "learning_rate": 4.9080305069537914e-05, + "loss": 0.2716, + "step": 6200 + }, + { + "epoch": 1.88, + "grad_norm": 1.6905546188354492, + "learning_rate": 4.906535068042471e-05, + "loss": 0.2775, + "step": 6300 + }, + { + "epoch": 1.91, + "grad_norm": 1.1936814785003662, + "learning_rate": 4.90503962913115e-05, + "loss": 0.2571, + "step": 6400 + }, + { + "epoch": 1.94, + "grad_norm": 1.7146382331848145, + "learning_rate": 4.90354419021983e-05, + "loss": 0.2681, + "step": 6500 + }, + { + "epoch": 1.97, + "grad_norm": 1.5280200242996216, + "learning_rate": 4.902048751308509e-05, + "loss": 0.2655, + "step": 6600 + }, + { + "epoch": 2.0, + "grad_norm": 1.4756951332092285, + "learning_rate": 4.9005533123971885e-05, + "loss": 0.2554, + "step": 6700 + }, + { + "epoch": 2.03, + "grad_norm": 1.5664458274841309, + "learning_rate": 4.8990578734858685e-05, + "loss": 0.2125, + "step": 6800 + }, + { + "epoch": 2.06, + "grad_norm": 1.447304368019104, + "learning_rate": 4.897562434574548e-05, + "loss": 0.2161, + "step": 6900 + }, + { + "epoch": 2.09, + "grad_norm": 1.8067011833190918, + "learning_rate": 4.896066995663227e-05, + "loss": 0.213, + "step": 7000 + }, + { + "epoch": 2.09, + "eval_loss": 0.24976512789726257, + "eval_precision": 0.8138389031705227, + "eval_recall": 0.8187752085963238, + "eval_runtime": 305.8458, + "eval_samples_per_second": 43.728, + "eval_steps_per_second": 1.367, + "step": 7000 + }, + { + "epoch": 2.12, + "grad_norm": 2.7706127166748047, + "learning_rate": 4.894571556751907e-05, + "loss": 0.2186, + "step": 7100 + }, + { + "epoch": 2.15, + "grad_norm": 2.394275426864624, + "learning_rate": 4.893076117840586e-05, + "loss": 0.2094, + "step": 7200 + }, + { + "epoch": 2.18, + "grad_norm": 1.9464359283447266, + "learning_rate": 4.891580678929266e-05, + "loss": 0.2278, + "step": 7300 + }, + { + "epoch": 2.21, + "grad_norm": 2.1283416748046875, + "learning_rate": 4.8900852400179456e-05, + "loss": 0.2174, + "step": 7400 + }, + { + "epoch": 2.24, + "grad_norm": 1.7853657007217407, + "learning_rate": 4.888589801106625e-05, + "loss": 0.2184, + "step": 7500 + }, + { + "epoch": 2.27, + "grad_norm": 1.1081209182739258, + "learning_rate": 4.887094362195305e-05, + "loss": 0.2201, + "step": 7600 + }, + { + "epoch": 2.3, + "grad_norm": 1.3894284963607788, + "learning_rate": 4.885598923283984e-05, + "loss": 0.2213, + "step": 7700 + }, + { + "epoch": 2.33, + "grad_norm": 2.0615389347076416, + "learning_rate": 4.8841034843726634e-05, + "loss": 0.2217, + "step": 7800 + }, + { + "epoch": 2.36, + "grad_norm": 1.6415098905563354, + "learning_rate": 4.8826080454613434e-05, + "loss": 0.2266, + "step": 7900 + }, + { + "epoch": 2.39, + "grad_norm": 3.293736219406128, + "learning_rate": 4.8811126065500226e-05, + "loss": 0.2117, + "step": 8000 + }, + { + "epoch": 2.39, + "eval_loss": 0.24216407537460327, + "eval_precision": 0.8107814105275881, + "eval_recall": 0.826133809538471, + "eval_runtime": 307.023, + "eval_samples_per_second": 43.56, + "eval_steps_per_second": 1.361, + "step": 8000 + }, + { + "epoch": 2.42, + "grad_norm": 1.1580455303192139, + "learning_rate": 4.879617167638702e-05, + "loss": 0.2171, + "step": 8100 + }, + { + "epoch": 2.45, + "grad_norm": 1.0756213665008545, + "learning_rate": 4.878121728727382e-05, + "loss": 0.2174, + "step": 8200 + }, + { + "epoch": 2.48, + "grad_norm": 1.871605396270752, + "learning_rate": 4.876626289816061e-05, + "loss": 0.215, + "step": 8300 + }, + { + "epoch": 2.51, + "grad_norm": 1.8400825262069702, + "learning_rate": 4.8751308509047405e-05, + "loss": 0.2215, + "step": 8400 + }, + { + "epoch": 2.54, + "grad_norm": 2.0464110374450684, + "learning_rate": 4.8736354119934204e-05, + "loss": 0.2195, + "step": 8500 + }, + { + "epoch": 2.57, + "grad_norm": 1.2704099416732788, + "learning_rate": 4.8721399730821e-05, + "loss": 0.2266, + "step": 8600 + }, + { + "epoch": 2.6, + "grad_norm": 0.9448720216751099, + "learning_rate": 4.87064453417078e-05, + "loss": 0.2159, + "step": 8700 + }, + { + "epoch": 2.63, + "grad_norm": 1.2881120443344116, + "learning_rate": 4.869149095259459e-05, + "loss": 0.2084, + "step": 8800 + }, + { + "epoch": 2.66, + "grad_norm": 2.0659286975860596, + "learning_rate": 4.867653656348138e-05, + "loss": 0.2134, + "step": 8900 + }, + { + "epoch": 2.69, + "grad_norm": 1.109397530555725, + "learning_rate": 4.866158217436818e-05, + "loss": 0.2129, + "step": 9000 + }, + { + "epoch": 2.69, + "eval_loss": 0.22735044360160828, + "eval_precision": 0.8203027060082556, + "eval_recall": 0.8260106530373472, + "eval_runtime": 305.794, + "eval_samples_per_second": 43.735, + "eval_steps_per_second": 1.367, + "step": 9000 + }, + { + "epoch": 2.72, + "grad_norm": 1.164435625076294, + "learning_rate": 4.8646627785254975e-05, + "loss": 0.2155, + "step": 9100 + }, + { + "epoch": 2.75, + "grad_norm": 1.5477757453918457, + "learning_rate": 4.863167339614177e-05, + "loss": 0.2137, + "step": 9200 + }, + { + "epoch": 2.78, + "grad_norm": 1.4342052936553955, + "learning_rate": 4.861671900702857e-05, + "loss": 0.206, + "step": 9300 + }, + { + "epoch": 2.81, + "grad_norm": 1.3847391605377197, + "learning_rate": 4.860176461791536e-05, + "loss": 0.2077, + "step": 9400 + }, + { + "epoch": 2.84, + "grad_norm": 2.9082765579223633, + "learning_rate": 4.8586810228802154e-05, + "loss": 0.2126, + "step": 9500 + }, + { + "epoch": 2.87, + "grad_norm": 1.4943510293960571, + "learning_rate": 4.857185583968895e-05, + "loss": 0.2092, + "step": 9600 + }, + { + "epoch": 2.9, + "grad_norm": 1.2332855463027954, + "learning_rate": 4.8556901450575746e-05, + "loss": 0.2222, + "step": 9700 + }, + { + "epoch": 2.93, + "grad_norm": 2.227031946182251, + "learning_rate": 4.854194706146254e-05, + "loss": 0.1969, + "step": 9800 + }, + { + "epoch": 2.96, + "grad_norm": 1.2515846490859985, + "learning_rate": 4.852699267234934e-05, + "loss": 0.2017, + "step": 9900 + }, + { + "epoch": 2.99, + "grad_norm": 1.2267186641693115, + "learning_rate": 4.851203828323613e-05, + "loss": 0.2126, + "step": 10000 + }, + { + "epoch": 2.99, + "eval_loss": 0.20952437818050385, + "eval_precision": 0.8416687769055458, + "eval_recall": 0.818682841220481, + "eval_runtime": 302.8923, + "eval_samples_per_second": 44.154, + "eval_steps_per_second": 1.38, + "step": 10000 + }, + { + "epoch": 3.02, + "grad_norm": 1.151638150215149, + "learning_rate": 4.849708389412293e-05, + "loss": 0.171, + "step": 10100 + }, + { + "epoch": 3.05, + "grad_norm": 3.8168528079986572, + "learning_rate": 4.8482129505009724e-05, + "loss": 0.165, + "step": 10200 + }, + { + "epoch": 3.08, + "grad_norm": 2.3039355278015137, + "learning_rate": 4.846717511589652e-05, + "loss": 0.1675, + "step": 10300 + }, + { + "epoch": 3.11, + "grad_norm": 1.252301812171936, + "learning_rate": 4.845222072678332e-05, + "loss": 0.1554, + "step": 10400 + }, + { + "epoch": 3.14, + "grad_norm": 1.2682992219924927, + "learning_rate": 4.843726633767011e-05, + "loss": 0.1756, + "step": 10500 + }, + { + "epoch": 3.17, + "grad_norm": 1.3934777975082397, + "learning_rate": 4.84223119485569e-05, + "loss": 0.1576, + "step": 10600 + }, + { + "epoch": 3.2, + "grad_norm": 1.3386119604110718, + "learning_rate": 4.84073575594437e-05, + "loss": 0.1602, + "step": 10700 + }, + { + "epoch": 3.23, + "grad_norm": 1.6670503616333008, + "learning_rate": 4.8392403170330495e-05, + "loss": 0.1638, + "step": 10800 + }, + { + "epoch": 3.26, + "grad_norm": 2.5150694847106934, + "learning_rate": 4.837744878121729e-05, + "loss": 0.1653, + "step": 10900 + }, + { + "epoch": 3.29, + "grad_norm": 2.840406656265259, + "learning_rate": 4.836249439210409e-05, + "loss": 0.1607, + "step": 11000 + }, + { + "epoch": 3.29, + "eval_loss": 0.22238589823246002, + "eval_precision": 0.8404415146405029, + "eval_recall": 0.8439607130761415, + "eval_runtime": 304.8188, + "eval_samples_per_second": 43.875, + "eval_steps_per_second": 1.371, + "step": 11000 + }, + { + "epoch": 3.32, + "grad_norm": 1.5171958208084106, + "learning_rate": 4.834754000299088e-05, + "loss": 0.1606, + "step": 11100 + }, + { + "epoch": 3.35, + "grad_norm": 1.6955703496932983, + "learning_rate": 4.833258561387767e-05, + "loss": 0.1554, + "step": 11200 + }, + { + "epoch": 3.38, + "grad_norm": 1.893128514289856, + "learning_rate": 4.831763122476447e-05, + "loss": 0.1488, + "step": 11300 + }, + { + "epoch": 3.41, + "grad_norm": 1.7299461364746094, + "learning_rate": 4.8302676835651266e-05, + "loss": 0.1596, + "step": 11400 + }, + { + "epoch": 3.44, + "grad_norm": 2.150355339050293, + "learning_rate": 4.8287722446538065e-05, + "loss": 0.1623, + "step": 11500 + }, + { + "epoch": 3.47, + "grad_norm": 3.2869186401367188, + "learning_rate": 4.827276805742486e-05, + "loss": 0.1622, + "step": 11600 + }, + { + "epoch": 3.5, + "grad_norm": 1.7936344146728516, + "learning_rate": 4.825781366831165e-05, + "loss": 0.1651, + "step": 11700 + }, + { + "epoch": 3.53, + "grad_norm": 1.579736590385437, + "learning_rate": 4.824285927919845e-05, + "loss": 0.169, + "step": 11800 + }, + { + "epoch": 3.56, + "grad_norm": 2.1929283142089844, + "learning_rate": 4.822790489008524e-05, + "loss": 0.1629, + "step": 11900 + }, + { + "epoch": 3.59, + "grad_norm": 1.7842892408370972, + "learning_rate": 4.821295050097204e-05, + "loss": 0.1621, + "step": 12000 + }, + { + "epoch": 3.59, + "eval_loss": 0.21504360437393188, + "eval_precision": 0.8350246187102197, + "eval_recall": 0.8563379414390837, + "eval_runtime": 306.2124, + "eval_samples_per_second": 43.676, + "eval_steps_per_second": 1.365, + "step": 12000 + }, + { + "epoch": 3.62, + "grad_norm": 2.2203197479248047, + "learning_rate": 4.8197996111858836e-05, + "loss": 0.1595, + "step": 12100 + }, + { + "epoch": 3.65, + "grad_norm": 1.8541319370269775, + "learning_rate": 4.818304172274562e-05, + "loss": 0.1702, + "step": 12200 + }, + { + "epoch": 3.68, + "grad_norm": 1.3299143314361572, + "learning_rate": 4.816808733363242e-05, + "loss": 0.1651, + "step": 12300 + }, + { + "epoch": 3.71, + "grad_norm": 1.7831319570541382, + "learning_rate": 4.815313294451922e-05, + "loss": 0.1601, + "step": 12400 + }, + { + "epoch": 3.74, + "grad_norm": 1.0528268814086914, + "learning_rate": 4.8138178555406015e-05, + "loss": 0.1644, + "step": 12500 + }, + { + "epoch": 3.77, + "grad_norm": 1.306907057762146, + "learning_rate": 4.812322416629281e-05, + "loss": 0.1556, + "step": 12600 + }, + { + "epoch": 3.8, + "grad_norm": 1.8565049171447754, + "learning_rate": 4.810826977717961e-05, + "loss": 0.1654, + "step": 12700 + }, + { + "epoch": 3.83, + "grad_norm": 1.4770090579986572, + "learning_rate": 4.80933153880664e-05, + "loss": 0.1628, + "step": 12800 + }, + { + "epoch": 3.86, + "grad_norm": 1.9089502096176147, + "learning_rate": 4.807836099895319e-05, + "loss": 0.1632, + "step": 12900 + }, + { + "epoch": 3.89, + "grad_norm": 1.3788821697235107, + "learning_rate": 4.806340660983999e-05, + "loss": 0.1597, + "step": 13000 + }, + { + "epoch": 3.89, + "eval_loss": 0.2062728852033615, + "eval_precision": 0.8378547953391097, + "eval_recall": 0.8634194402537024, + "eval_runtime": 304.7295, + "eval_samples_per_second": 43.888, + "eval_steps_per_second": 1.372, + "step": 13000 + }, + { + "epoch": 3.92, + "grad_norm": 15.79686164855957, + "learning_rate": 4.8048452220726785e-05, + "loss": 0.1637, + "step": 13100 + }, + { + "epoch": 3.95, + "grad_norm": 1.9472129344940186, + "learning_rate": 4.8033497831613585e-05, + "loss": 0.1666, + "step": 13200 + }, + { + "epoch": 3.98, + "grad_norm": 2.1338746547698975, + "learning_rate": 4.801854344250037e-05, + "loss": 0.1614, + "step": 13300 + }, + { + "epoch": 4.01, + "grad_norm": 1.1886940002441406, + "learning_rate": 4.800358905338717e-05, + "loss": 0.1474, + "step": 13400 + }, + { + "epoch": 4.04, + "grad_norm": 2.4190924167633057, + "learning_rate": 4.798863466427397e-05, + "loss": 0.121, + "step": 13500 + }, + { + "epoch": 4.07, + "grad_norm": 0.902584433555603, + "learning_rate": 4.797368027516076e-05, + "loss": 0.1192, + "step": 13600 + }, + { + "epoch": 4.1, + "grad_norm": 2.3466804027557373, + "learning_rate": 4.7958725886047556e-05, + "loss": 0.129, + "step": 13700 + }, + { + "epoch": 4.13, + "grad_norm": 4.135778427124023, + "learning_rate": 4.7943771496934356e-05, + "loss": 0.1206, + "step": 13800 + }, + { + "epoch": 4.16, + "grad_norm": 1.6940075159072876, + "learning_rate": 4.792881710782115e-05, + "loss": 0.1313, + "step": 13900 + }, + { + "epoch": 4.19, + "grad_norm": 1.7989047765731812, + "learning_rate": 4.791386271870794e-05, + "loss": 0.1139, + "step": 14000 + }, + { + "epoch": 4.19, + "eval_loss": 0.20718763768672943, + "eval_precision": 0.8631126181281592, + "eval_recall": 0.8464238430986176, + "eval_runtime": 304.0256, + "eval_samples_per_second": 43.99, + "eval_steps_per_second": 1.375, + "step": 14000 + }, + { + "epoch": 4.22, + "grad_norm": 1.9864155054092407, + "learning_rate": 4.789890832959474e-05, + "loss": 0.1222, + "step": 14100 + }, + { + "epoch": 4.25, + "grad_norm": 2.944260835647583, + "learning_rate": 4.7883953940481534e-05, + "loss": 0.1238, + "step": 14200 + }, + { + "epoch": 4.28, + "grad_norm": 0.5448206663131714, + "learning_rate": 4.786899955136833e-05, + "loss": 0.1191, + "step": 14300 + }, + { + "epoch": 4.31, + "grad_norm": 1.2996718883514404, + "learning_rate": 4.785404516225512e-05, + "loss": 0.1208, + "step": 14400 + }, + { + "epoch": 4.34, + "grad_norm": 2.5177977085113525, + "learning_rate": 4.783909077314192e-05, + "loss": 0.1258, + "step": 14500 + }, + { + "epoch": 4.37, + "grad_norm": 1.1356126070022583, + "learning_rate": 4.782413638402872e-05, + "loss": 0.1223, + "step": 14600 + }, + { + "epoch": 4.4, + "grad_norm": 1.2576464414596558, + "learning_rate": 4.7809181994915506e-05, + "loss": 0.124, + "step": 14700 + }, + { + "epoch": 4.43, + "grad_norm": 0.8868162631988525, + "learning_rate": 4.7794227605802305e-05, + "loss": 0.1246, + "step": 14800 + }, + { + "epoch": 4.46, + "grad_norm": 2.3075501918792725, + "learning_rate": 4.7779273216689105e-05, + "loss": 0.1216, + "step": 14900 + }, + { + "epoch": 4.49, + "grad_norm": 1.5548241138458252, + "learning_rate": 4.776431882757589e-05, + "loss": 0.1221, + "step": 15000 + }, + { + "epoch": 4.49, + "eval_loss": 0.19333235919475555, + "eval_precision": 0.8727586319112239, + "eval_recall": 0.8257335509098187, + "eval_runtime": 301.0242, + "eval_samples_per_second": 44.428, + "eval_steps_per_second": 1.389, + "step": 15000 + }, + { + "epoch": 4.52, + "grad_norm": 1.0018868446350098, + "learning_rate": 4.774936443846269e-05, + "loss": 0.1237, + "step": 15100 + }, + { + "epoch": 4.55, + "grad_norm": 1.264910101890564, + "learning_rate": 4.773441004934949e-05, + "loss": 0.1156, + "step": 15200 + }, + { + "epoch": 4.58, + "grad_norm": 5.281520366668701, + "learning_rate": 4.771945566023628e-05, + "loss": 0.1286, + "step": 15300 + }, + { + "epoch": 4.61, + "grad_norm": 1.9591494798660278, + "learning_rate": 4.7704501271123076e-05, + "loss": 0.1249, + "step": 15400 + }, + { + "epoch": 4.64, + "grad_norm": 2.021794080734253, + "learning_rate": 4.768954688200987e-05, + "loss": 0.1233, + "step": 15500 + }, + { + "epoch": 4.67, + "grad_norm": 2.007873773574829, + "learning_rate": 4.767459249289667e-05, + "loss": 0.1281, + "step": 15600 + }, + { + "epoch": 4.69, + "grad_norm": 2.0108394622802734, + "learning_rate": 4.765963810378346e-05, + "loss": 0.1302, + "step": 15700 + }, + { + "epoch": 4.72, + "grad_norm": 1.7474627494812012, + "learning_rate": 4.7644683714670254e-05, + "loss": 0.1164, + "step": 15800 + }, + { + "epoch": 4.75, + "grad_norm": 0.758482813835144, + "learning_rate": 4.7629729325557054e-05, + "loss": 0.1211, + "step": 15900 + }, + { + "epoch": 4.78, + "grad_norm": 0.9910192489624023, + "learning_rate": 4.7614774936443854e-05, + "loss": 0.1222, + "step": 16000 + }, + { + "epoch": 4.78, + "eval_loss": 0.1955721527338028, + "eval_precision": 0.8685029567382508, + "eval_recall": 0.8591705409649312, + "eval_runtime": 303.5505, + "eval_samples_per_second": 44.059, + "eval_steps_per_second": 1.377, + "step": 16000 + }, + { + "epoch": 4.81, + "grad_norm": 2.4667110443115234, + "learning_rate": 4.759982054733064e-05, + "loss": 0.1214, + "step": 16100 + }, + { + "epoch": 4.84, + "grad_norm": 2.103156566619873, + "learning_rate": 4.758486615821744e-05, + "loss": 0.1211, + "step": 16200 + }, + { + "epoch": 4.87, + "grad_norm": 1.3806654214859009, + "learning_rate": 4.756991176910424e-05, + "loss": 0.1152, + "step": 16300 + }, + { + "epoch": 4.9, + "grad_norm": 2.1174566745758057, + "learning_rate": 4.7554957379991025e-05, + "loss": 0.1246, + "step": 16400 + }, + { + "epoch": 4.93, + "grad_norm": 2.0334010124206543, + "learning_rate": 4.7540002990877825e-05, + "loss": 0.1189, + "step": 16500 + }, + { + "epoch": 4.96, + "grad_norm": 2.668717861175537, + "learning_rate": 4.7525048601764625e-05, + "loss": 0.1237, + "step": 16600 + }, + { + "epoch": 4.99, + "grad_norm": 2.0749363899230957, + "learning_rate": 4.751009421265142e-05, + "loss": 0.1141, + "step": 16700 + }, + { + "epoch": 5.02, + "grad_norm": 1.893052577972412, + "learning_rate": 4.749513982353821e-05, + "loss": 0.095, + "step": 16800 + }, + { + "epoch": 5.05, + "grad_norm": 0.6495729684829712, + "learning_rate": 4.7480185434425e-05, + "loss": 0.085, + "step": 16900 + }, + { + "epoch": 5.08, + "grad_norm": 1.8883150815963745, + "learning_rate": 4.74652310453118e-05, + "loss": 0.0886, + "step": 17000 + }, + { + "epoch": 5.08, + "eval_loss": 0.2067934274673462, + "eval_precision": 0.880300808187974, + "eval_recall": 0.8685920133009021, + "eval_runtime": 303.377, + "eval_samples_per_second": 44.084, + "eval_steps_per_second": 1.378, + "step": 17000 + }, + { + "epoch": 5.11, + "grad_norm": 1.110809326171875, + "learning_rate": 4.7450276656198596e-05, + "loss": 0.0895, + "step": 17100 + }, + { + "epoch": 5.14, + "grad_norm": 1.9441896677017212, + "learning_rate": 4.743532226708539e-05, + "loss": 0.0935, + "step": 17200 + }, + { + "epoch": 5.17, + "grad_norm": 1.9851264953613281, + "learning_rate": 4.742036787797219e-05, + "loss": 0.0927, + "step": 17300 + }, + { + "epoch": 5.2, + "grad_norm": 1.2447096109390259, + "learning_rate": 4.740541348885899e-05, + "loss": 0.0911, + "step": 17400 + }, + { + "epoch": 5.23, + "grad_norm": 1.0151656866073608, + "learning_rate": 4.7390459099745774e-05, + "loss": 0.0932, + "step": 17500 + }, + { + "epoch": 5.26, + "grad_norm": 0.8265299201011658, + "learning_rate": 4.7375504710632574e-05, + "loss": 0.1006, + "step": 17600 + }, + { + "epoch": 5.29, + "grad_norm": 2.7819435596466064, + "learning_rate": 4.736055032151937e-05, + "loss": 0.0892, + "step": 17700 + }, + { + "epoch": 5.32, + "grad_norm": 1.3706836700439453, + "learning_rate": 4.734559593240616e-05, + "loss": 0.0976, + "step": 17800 + }, + { + "epoch": 5.35, + "grad_norm": 3.606653928756714, + "learning_rate": 4.733064154329296e-05, + "loss": 0.0932, + "step": 17900 + }, + { + "epoch": 5.38, + "grad_norm": 1.3535112142562866, + "learning_rate": 4.731568715417975e-05, + "loss": 0.0917, + "step": 18000 + }, + { + "epoch": 5.38, + "eval_loss": 0.1965586394071579, + "eval_precision": 0.8806825297432687, + "eval_recall": 0.8660673050278641, + "eval_runtime": 303.4486, + "eval_samples_per_second": 44.073, + "eval_steps_per_second": 1.377, + "step": 18000 + }, + { + "epoch": 5.41, + "grad_norm": 1.7558257579803467, + "learning_rate": 4.7300732765066545e-05, + "loss": 0.088, + "step": 18100 + }, + { + "epoch": 5.44, + "grad_norm": 2.291628837585449, + "learning_rate": 4.7285778375953345e-05, + "loss": 0.0963, + "step": 18200 + }, + { + "epoch": 5.47, + "grad_norm": 1.4217274188995361, + "learning_rate": 4.727082398684014e-05, + "loss": 0.0969, + "step": 18300 + }, + { + "epoch": 5.5, + "grad_norm": 1.8852524757385254, + "learning_rate": 4.725586959772694e-05, + "loss": 0.0952, + "step": 18400 + }, + { + "epoch": 5.53, + "grad_norm": 2.106452465057373, + "learning_rate": 4.724091520861373e-05, + "loss": 0.0966, + "step": 18500 + }, + { + "epoch": 5.56, + "grad_norm": 1.9277011156082153, + "learning_rate": 4.722596081950052e-05, + "loss": 0.089, + "step": 18600 + }, + { + "epoch": 5.59, + "grad_norm": 1.2175403833389282, + "learning_rate": 4.721100643038732e-05, + "loss": 0.0931, + "step": 18700 + }, + { + "epoch": 5.62, + "grad_norm": 2.060368299484253, + "learning_rate": 4.7196052041274115e-05, + "loss": 0.0968, + "step": 18800 + }, + { + "epoch": 5.65, + "grad_norm": 1.4981082677841187, + "learning_rate": 4.718109765216091e-05, + "loss": 0.0929, + "step": 18900 + }, + { + "epoch": 5.68, + "grad_norm": 1.6335569620132446, + "learning_rate": 4.716614326304771e-05, + "loss": 0.0938, + "step": 19000 + }, + { + "epoch": 5.68, + "eval_loss": 0.19031907618045807, + "eval_precision": 0.8913960623881361, + "eval_recall": 0.858708704085717, + "eval_runtime": 301.9634, + "eval_samples_per_second": 44.29, + "eval_steps_per_second": 1.384, + "step": 19000 + }, + { + "epoch": 5.71, + "grad_norm": 0.46949952840805054, + "learning_rate": 4.71511888739345e-05, + "loss": 0.09, + "step": 19100 + }, + { + "epoch": 5.74, + "grad_norm": 2.6525633335113525, + "learning_rate": 4.7136234484821294e-05, + "loss": 0.0954, + "step": 19200 + }, + { + "epoch": 5.77, + "grad_norm": 1.2892892360687256, + "learning_rate": 4.7121280095708093e-05, + "loss": 0.0949, + "step": 19300 + }, + { + "epoch": 5.8, + "grad_norm": 1.5637331008911133, + "learning_rate": 4.7106325706594886e-05, + "loss": 0.0962, + "step": 19400 + }, + { + "epoch": 5.83, + "grad_norm": 2.5609443187713623, + "learning_rate": 4.709137131748168e-05, + "loss": 0.0921, + "step": 19500 + }, + { + "epoch": 5.86, + "grad_norm": 1.4690775871276855, + "learning_rate": 4.707641692836848e-05, + "loss": 0.0955, + "step": 19600 + }, + { + "epoch": 5.89, + "grad_norm": 1.081965684890747, + "learning_rate": 4.706146253925527e-05, + "loss": 0.0928, + "step": 19700 + }, + { + "epoch": 5.92, + "grad_norm": 1.6817141771316528, + "learning_rate": 4.704650815014207e-05, + "loss": 0.0963, + "step": 19800 + }, + { + "epoch": 5.95, + "grad_norm": 2.984762191772461, + "learning_rate": 4.7031553761028864e-05, + "loss": 0.095, + "step": 19900 + }, + { + "epoch": 5.98, + "grad_norm": 2.1594882011413574, + "learning_rate": 4.701659937191566e-05, + "loss": 0.0985, + "step": 20000 + }, + { + "epoch": 5.98, + "eval_loss": 0.18151727318763733, + "eval_precision": 0.9042639298086573, + "eval_recall": 0.859940269096955, + "eval_runtime": 302.8985, + "eval_samples_per_second": 44.153, + "eval_steps_per_second": 1.38, + "step": 20000 + }, + { + "epoch": 6.01, + "grad_norm": 2.0218722820281982, + "learning_rate": 4.700164498280246e-05, + "loss": 0.0886, + "step": 20100 + }, + { + "epoch": 6.04, + "grad_norm": 1.3569700717926025, + "learning_rate": 4.698669059368925e-05, + "loss": 0.0711, + "step": 20200 + }, + { + "epoch": 6.07, + "grad_norm": 1.5697298049926758, + "learning_rate": 4.697173620457604e-05, + "loss": 0.0724, + "step": 20300 + }, + { + "epoch": 6.1, + "grad_norm": 1.7853014469146729, + "learning_rate": 4.695678181546284e-05, + "loss": 0.0747, + "step": 20400 + }, + { + "epoch": 6.13, + "grad_norm": 0.7531015872955322, + "learning_rate": 4.6941827426349635e-05, + "loss": 0.074, + "step": 20500 + }, + { + "epoch": 6.16, + "grad_norm": 1.3895870447158813, + "learning_rate": 4.692687303723643e-05, + "loss": 0.0683, + "step": 20600 + }, + { + "epoch": 6.19, + "grad_norm": 2.084857225418091, + "learning_rate": 4.691191864812323e-05, + "loss": 0.0741, + "step": 20700 + }, + { + "epoch": 6.22, + "grad_norm": 0.9525838494300842, + "learning_rate": 4.689696425901002e-05, + "loss": 0.0647, + "step": 20800 + }, + { + "epoch": 6.25, + "grad_norm": 2.0475118160247803, + "learning_rate": 4.6882009869896813e-05, + "loss": 0.0746, + "step": 20900 + }, + { + "epoch": 6.28, + "grad_norm": 1.0650370121002197, + "learning_rate": 4.686705548078361e-05, + "loss": 0.0696, + "step": 21000 + }, + { + "epoch": 6.28, + "eval_loss": 0.19116894900798798, + "eval_precision": 0.9016753284483037, + "eval_recall": 0.8600326364727978, + "eval_runtime": 303.289, + "eval_samples_per_second": 44.097, + "eval_steps_per_second": 1.378, + "step": 21000 + }, + { + "epoch": 6.31, + "grad_norm": 1.5736846923828125, + "learning_rate": 4.6852101091670406e-05, + "loss": 0.0685, + "step": 21100 + }, + { + "epoch": 6.34, + "grad_norm": 0.7526031136512756, + "learning_rate": 4.6837146702557206e-05, + "loss": 0.0816, + "step": 21200 + }, + { + "epoch": 6.37, + "grad_norm": 1.284680724143982, + "learning_rate": 4.6822192313444e-05, + "loss": 0.0676, + "step": 21300 + }, + { + "epoch": 6.4, + "grad_norm": 4.207923889160156, + "learning_rate": 4.680723792433079e-05, + "loss": 0.0679, + "step": 21400 + }, + { + "epoch": 6.43, + "grad_norm": 1.3670810461044312, + "learning_rate": 4.679228353521759e-05, + "loss": 0.0721, + "step": 21500 + }, + { + "epoch": 6.46, + "grad_norm": 1.8094091415405273, + "learning_rate": 4.6777329146104384e-05, + "loss": 0.0673, + "step": 21600 + }, + { + "epoch": 6.49, + "grad_norm": 2.057133436203003, + "learning_rate": 4.676237475699118e-05, + "loss": 0.0711, + "step": 21700 + }, + { + "epoch": 6.52, + "grad_norm": 1.9356772899627686, + "learning_rate": 4.6747420367877976e-05, + "loss": 0.0713, + "step": 21800 + }, + { + "epoch": 6.55, + "grad_norm": 0.4188990592956543, + "learning_rate": 4.673246597876477e-05, + "loss": 0.0772, + "step": 21900 + }, + { + "epoch": 6.58, + "grad_norm": 0.9256879091262817, + "learning_rate": 4.671751158965156e-05, + "loss": 0.0715, + "step": 22000 + }, + { + "epoch": 6.58, + "eval_loss": 0.19474047422409058, + "eval_precision": 0.9012208304190246, + "eval_recall": 0.8727793343391115, + "eval_runtime": 305.0313, + "eval_samples_per_second": 43.845, + "eval_steps_per_second": 1.37, + "step": 22000 + }, + { + "epoch": 6.61, + "grad_norm": 0.890701949596405, + "learning_rate": 4.670255720053836e-05, + "loss": 0.0712, + "step": 22100 + }, + { + "epoch": 6.64, + "grad_norm": 1.6164826154708862, + "learning_rate": 4.6687602811425155e-05, + "loss": 0.0772, + "step": 22200 + }, + { + "epoch": 6.67, + "grad_norm": 1.2075903415679932, + "learning_rate": 4.667264842231195e-05, + "loss": 0.0734, + "step": 22300 + }, + { + "epoch": 6.7, + "grad_norm": 0.9141576886177063, + "learning_rate": 4.665769403319875e-05, + "loss": 0.0803, + "step": 22400 + }, + { + "epoch": 6.73, + "grad_norm": 3.0547311305999756, + "learning_rate": 4.664273964408554e-05, + "loss": 0.0688, + "step": 22500 + }, + { + "epoch": 6.76, + "grad_norm": 1.1152849197387695, + "learning_rate": 4.662778525497234e-05, + "loss": 0.0703, + "step": 22600 + }, + { + "epoch": 6.79, + "grad_norm": 2.150590181350708, + "learning_rate": 4.661283086585913e-05, + "loss": 0.0745, + "step": 22700 + }, + { + "epoch": 6.82, + "grad_norm": 1.4829721450805664, + "learning_rate": 4.6597876476745926e-05, + "loss": 0.0738, + "step": 22800 + }, + { + "epoch": 6.85, + "grad_norm": 0.6545503735542297, + "learning_rate": 4.6582922087632725e-05, + "loss": 0.0764, + "step": 22900 + }, + { + "epoch": 6.88, + "grad_norm": 1.2322636842727661, + "learning_rate": 4.656796769851952e-05, + "loss": 0.0765, + "step": 23000 + }, + { + "epoch": 6.88, + "eval_loss": 0.18639414012432098, + "eval_precision": 0.9072111489223789, + "eval_recall": 0.861849194864374, + "eval_runtime": 301.5834, + "eval_samples_per_second": 44.346, + "eval_steps_per_second": 1.386, + "step": 23000 + }, + { + "epoch": 6.91, + "grad_norm": 1.8931362628936768, + "learning_rate": 4.655301330940631e-05, + "loss": 0.0783, + "step": 23100 + }, + { + "epoch": 6.94, + "grad_norm": 0.7884649038314819, + "learning_rate": 4.653805892029311e-05, + "loss": 0.0718, + "step": 23200 + }, + { + "epoch": 6.97, + "grad_norm": 0.6341440081596375, + "learning_rate": 4.6523104531179904e-05, + "loss": 0.0698, + "step": 23300 + }, + { + "epoch": 7.0, + "grad_norm": 0.9098210334777832, + "learning_rate": 4.6508150142066697e-05, + "loss": 0.071, + "step": 23400 + }, + { + "epoch": 7.03, + "grad_norm": 3.0700671672821045, + "learning_rate": 4.6493195752953496e-05, + "loss": 0.0552, + "step": 23500 + }, + { + "epoch": 7.06, + "grad_norm": 1.5736912488937378, + "learning_rate": 4.647824136384029e-05, + "loss": 0.055, + "step": 23600 + }, + { + "epoch": 7.09, + "grad_norm": 0.9347396492958069, + "learning_rate": 4.646328697472708e-05, + "loss": 0.0592, + "step": 23700 + }, + { + "epoch": 7.12, + "grad_norm": 1.7453091144561768, + "learning_rate": 4.644833258561388e-05, + "loss": 0.0623, + "step": 23800 + }, + { + "epoch": 7.15, + "grad_norm": 1.1539710760116577, + "learning_rate": 4.6433378196500674e-05, + "loss": 0.0558, + "step": 23900 + }, + { + "epoch": 7.18, + "grad_norm": 0.7530619502067566, + "learning_rate": 4.641842380738747e-05, + "loss": 0.0546, + "step": 24000 + }, + { + "epoch": 7.18, + "eval_loss": 0.2078467607498169, + "eval_precision": 0.908101688386724, + "eval_recall": 0.8710551433233782, + "eval_runtime": 302.902, + "eval_samples_per_second": 44.153, + "eval_steps_per_second": 1.38, + "step": 24000 + }, + { + "epoch": 7.21, + "grad_norm": 1.6339865922927856, + "learning_rate": 4.640346941827427e-05, + "loss": 0.0579, + "step": 24100 + }, + { + "epoch": 7.24, + "grad_norm": 2.397862434387207, + "learning_rate": 4.638851502916106e-05, + "loss": 0.054, + "step": 24200 + }, + { + "epoch": 7.27, + "grad_norm": 2.5979652404785156, + "learning_rate": 4.637356064004786e-05, + "loss": 0.0582, + "step": 24300 + }, + { + "epoch": 7.3, + "grad_norm": 1.4249415397644043, + "learning_rate": 4.635860625093465e-05, + "loss": 0.0611, + "step": 24400 + }, + { + "epoch": 7.33, + "grad_norm": 1.1104274988174438, + "learning_rate": 4.6343651861821445e-05, + "loss": 0.0603, + "step": 24500 + }, + { + "epoch": 7.36, + "grad_norm": 1.039832353591919, + "learning_rate": 4.6328697472708245e-05, + "loss": 0.06, + "step": 24600 + }, + { + "epoch": 7.39, + "grad_norm": 1.1284308433532715, + "learning_rate": 4.631374308359504e-05, + "loss": 0.0528, + "step": 24700 + }, + { + "epoch": 7.42, + "grad_norm": 3.3189823627471924, + "learning_rate": 4.629878869448183e-05, + "loss": 0.0634, + "step": 24800 + }, + { + "epoch": 7.45, + "grad_norm": 2.0465550422668457, + "learning_rate": 4.628383430536863e-05, + "loss": 0.0599, + "step": 24900 + }, + { + "epoch": 7.48, + "grad_norm": 1.93597412109375, + "learning_rate": 4.626887991625542e-05, + "loss": 0.0588, + "step": 25000 + }, + { + "epoch": 7.48, + "eval_loss": 0.20041726529598236, + "eval_precision": 0.9101642057026477, + "eval_recall": 0.8805997721604729, + "eval_runtime": 302.521, + "eval_samples_per_second": 44.209, + "eval_steps_per_second": 1.382, + "step": 25000 + }, + { + "epoch": 7.51, + "grad_norm": 2.2025020122528076, + "learning_rate": 4.6253925527142216e-05, + "loss": 0.0557, + "step": 25100 + }, + { + "epoch": 7.54, + "grad_norm": 2.4900927543640137, + "learning_rate": 4.6238971138029016e-05, + "loss": 0.0613, + "step": 25200 + }, + { + "epoch": 7.57, + "grad_norm": 1.2546288967132568, + "learning_rate": 4.622401674891581e-05, + "loss": 0.0609, + "step": 25300 + }, + { + "epoch": 7.6, + "grad_norm": 1.3969674110412598, + "learning_rate": 4.62090623598026e-05, + "loss": 0.0617, + "step": 25400 + }, + { + "epoch": 7.63, + "grad_norm": 0.2969658374786377, + "learning_rate": 4.61941079706894e-05, + "loss": 0.0602, + "step": 25500 + }, + { + "epoch": 7.66, + "grad_norm": 0.7388882040977478, + "learning_rate": 4.6179153581576194e-05, + "loss": 0.0593, + "step": 25600 + }, + { + "epoch": 7.69, + "grad_norm": 0.609923779964447, + "learning_rate": 4.6164199192462994e-05, + "loss": 0.0596, + "step": 25700 + }, + { + "epoch": 7.72, + "grad_norm": 2.3986215591430664, + "learning_rate": 4.614924480334979e-05, + "loss": 0.0651, + "step": 25800 + }, + { + "epoch": 7.75, + "grad_norm": 1.1203041076660156, + "learning_rate": 4.613429041423658e-05, + "loss": 0.0649, + "step": 25900 + }, + { + "epoch": 7.78, + "grad_norm": 0.7929214835166931, + "learning_rate": 4.611933602512338e-05, + "loss": 0.0648, + "step": 26000 + }, + { + "epoch": 7.78, + "eval_loss": 0.19321496784687042, + "eval_precision": 0.9163062916598927, + "eval_recall": 0.8676683395424736, + "eval_runtime": 301.2643, + "eval_samples_per_second": 44.393, + "eval_steps_per_second": 1.387, + "step": 26000 + }, + { + "epoch": 7.81, + "grad_norm": 0.5828276872634888, + "learning_rate": 4.610438163601017e-05, + "loss": 0.058, + "step": 26100 + }, + { + "epoch": 7.83, + "grad_norm": 0.44025149941444397, + "learning_rate": 4.6089427246896965e-05, + "loss": 0.0598, + "step": 26200 + }, + { + "epoch": 7.86, + "grad_norm": 0.7976229786872864, + "learning_rate": 4.6074472857783765e-05, + "loss": 0.0655, + "step": 26300 + }, + { + "epoch": 7.89, + "grad_norm": 2.6843769550323486, + "learning_rate": 4.605951846867056e-05, + "loss": 0.0588, + "step": 26400 + }, + { + "epoch": 7.92, + "grad_norm": 1.1365008354187012, + "learning_rate": 4.604456407955735e-05, + "loss": 0.0563, + "step": 26500 + }, + { + "epoch": 7.95, + "grad_norm": 2.463488817214966, + "learning_rate": 4.602960969044415e-05, + "loss": 0.0581, + "step": 26600 + }, + { + "epoch": 7.98, + "grad_norm": 0.47716620564460754, + "learning_rate": 4.601465530133094e-05, + "loss": 0.0595, + "step": 26700 + }, + { + "epoch": 8.01, + "grad_norm": 1.3218754529953003, + "learning_rate": 4.5999700912217736e-05, + "loss": 0.0554, + "step": 26800 + }, + { + "epoch": 8.04, + "grad_norm": 1.0640392303466797, + "learning_rate": 4.5984746523104536e-05, + "loss": 0.0409, + "step": 26900 + }, + { + "epoch": 8.07, + "grad_norm": 0.7323993444442749, + "learning_rate": 4.596979213399133e-05, + "loss": 0.0463, + "step": 27000 + }, + { + "epoch": 8.07, + "eval_loss": 0.21357020735740662, + "eval_precision": 0.9223724947042529, + "eval_recall": 0.8714246128267495, + "eval_runtime": 301.9271, + "eval_samples_per_second": 44.295, + "eval_steps_per_second": 1.384, + "step": 27000 + }, + { + "epoch": 8.1, + "grad_norm": 2.1960983276367188, + "learning_rate": 4.595483774487813e-05, + "loss": 0.0424, + "step": 27100 + }, + { + "epoch": 8.13, + "grad_norm": 2.5061357021331787, + "learning_rate": 4.593988335576492e-05, + "loss": 0.0436, + "step": 27200 + }, + { + "epoch": 8.16, + "grad_norm": 0.5249370336532593, + "learning_rate": 4.5924928966651714e-05, + "loss": 0.0537, + "step": 27300 + }, + { + "epoch": 8.19, + "grad_norm": 1.0211517810821533, + "learning_rate": 4.5909974577538514e-05, + "loss": 0.0448, + "step": 27400 + }, + { + "epoch": 8.22, + "grad_norm": 2.860835552215576, + "learning_rate": 4.58950201884253e-05, + "loss": 0.0474, + "step": 27500 + }, + { + "epoch": 8.25, + "grad_norm": 2.019699811935425, + "learning_rate": 4.58800657993121e-05, + "loss": 0.0482, + "step": 27600 + }, + { + "epoch": 8.28, + "grad_norm": 0.9144898653030396, + "learning_rate": 4.58651114101989e-05, + "loss": 0.045, + "step": 27700 + }, + { + "epoch": 8.31, + "grad_norm": 1.656792402267456, + "learning_rate": 4.585015702108569e-05, + "loss": 0.0475, + "step": 27800 + }, + { + "epoch": 8.34, + "grad_norm": 1.1702663898468018, + "learning_rate": 4.5835202631972485e-05, + "loss": 0.0445, + "step": 27900 + }, + { + "epoch": 8.37, + "grad_norm": 2.0331854820251465, + "learning_rate": 4.5820248242859284e-05, + "loss": 0.0429, + "step": 28000 + }, + { + "epoch": 8.37, + "eval_loss": 0.22609786689281464, + "eval_precision": 0.9198246970868781, + "eval_recall": 0.8788447920194588, + "eval_runtime": 302.1631, + "eval_samples_per_second": 44.261, + "eval_steps_per_second": 1.383, + "step": 28000 + }, + { + "epoch": 8.4, + "grad_norm": 5.98319673538208, + "learning_rate": 4.580529385374608e-05, + "loss": 0.0429, + "step": 28100 + }, + { + "epoch": 8.43, + "grad_norm": 1.0793452262878418, + "learning_rate": 4.579033946463287e-05, + "loss": 0.0525, + "step": 28200 + }, + { + "epoch": 8.46, + "grad_norm": 1.4804214239120483, + "learning_rate": 4.577538507551967e-05, + "loss": 0.0459, + "step": 28300 + }, + { + "epoch": 8.49, + "grad_norm": 0.9862244129180908, + "learning_rate": 4.576043068640646e-05, + "loss": 0.0534, + "step": 28400 + }, + { + "epoch": 8.52, + "grad_norm": 1.26304030418396, + "learning_rate": 4.574547629729326e-05, + "loss": 0.048, + "step": 28500 + }, + { + "epoch": 8.55, + "grad_norm": 0.4214903712272644, + "learning_rate": 4.573052190818005e-05, + "loss": 0.0547, + "step": 28600 + }, + { + "epoch": 8.58, + "grad_norm": 0.9271091222763062, + "learning_rate": 4.571556751906685e-05, + "loss": 0.0537, + "step": 28700 + }, + { + "epoch": 8.61, + "grad_norm": 0.8437818884849548, + "learning_rate": 4.570061312995365e-05, + "loss": 0.0537, + "step": 28800 + }, + { + "epoch": 8.64, + "grad_norm": 0.8551807999610901, + "learning_rate": 4.5685658740840434e-05, + "loss": 0.0461, + "step": 28900 + }, + { + "epoch": 8.67, + "grad_norm": 1.8268975019454956, + "learning_rate": 4.5670704351727234e-05, + "loss": 0.046, + "step": 29000 + }, + { + "epoch": 8.67, + "eval_loss": 0.20938238501548767, + "eval_precision": 0.9151901573163308, + "eval_recall": 0.8794605745250778, + "eval_runtime": 302.034, + "eval_samples_per_second": 44.28, + "eval_steps_per_second": 1.384, + "step": 29000 + }, + { + "epoch": 8.7, + "grad_norm": 0.08975500613451004, + "learning_rate": 4.565574996261403e-05, + "loss": 0.0493, + "step": 29100 + }, + { + "epoch": 8.73, + "grad_norm": 2.3698606491088867, + "learning_rate": 4.564079557350082e-05, + "loss": 0.0506, + "step": 29200 + }, + { + "epoch": 8.76, + "grad_norm": 1.1118419170379639, + "learning_rate": 4.562584118438762e-05, + "loss": 0.0445, + "step": 29300 + }, + { + "epoch": 8.79, + "grad_norm": 1.8186097145080566, + "learning_rate": 4.561088679527442e-05, + "loss": 0.0471, + "step": 29400 + }, + { + "epoch": 8.82, + "grad_norm": 1.4056422710418701, + "learning_rate": 4.559593240616121e-05, + "loss": 0.0513, + "step": 29500 + }, + { + "epoch": 8.85, + "grad_norm": 1.5597076416015625, + "learning_rate": 4.5580978017048004e-05, + "loss": 0.0452, + "step": 29600 + }, + { + "epoch": 8.88, + "grad_norm": 0.8287553191184998, + "learning_rate": 4.5566023627934804e-05, + "loss": 0.0523, + "step": 29700 + }, + { + "epoch": 8.91, + "grad_norm": 0.6897550821304321, + "learning_rate": 4.55510692388216e-05, + "loss": 0.0466, + "step": 29800 + }, + { + "epoch": 8.94, + "grad_norm": 0.7071977853775024, + "learning_rate": 4.553611484970839e-05, + "loss": 0.0434, + "step": 29900 + }, + { + "epoch": 8.97, + "grad_norm": 0.6574975252151489, + "learning_rate": 4.552116046059518e-05, + "loss": 0.0495, + "step": 30000 + }, + { + "epoch": 8.97, + "eval_loss": 0.20542754232883453, + "eval_precision": 0.9183409556852231, + "eval_recall": 0.8964561716801626, + "eval_runtime": 302.3305, + "eval_samples_per_second": 44.236, + "eval_steps_per_second": 1.383, + "step": 30000 + }, + { + "epoch": 9.0, + "grad_norm": 1.3489534854888916, + "learning_rate": 4.550620607148198e-05, + "loss": 0.0499, + "step": 30100 + }, + { + "epoch": 9.03, + "grad_norm": 1.0300263166427612, + "learning_rate": 4.549125168236878e-05, + "loss": 0.0353, + "step": 30200 + }, + { + "epoch": 9.06, + "grad_norm": 0.4393318295478821, + "learning_rate": 4.547629729325557e-05, + "loss": 0.0352, + "step": 30300 + }, + { + "epoch": 9.09, + "grad_norm": 0.4519498944282532, + "learning_rate": 4.546134290414237e-05, + "loss": 0.0342, + "step": 30400 + }, + { + "epoch": 9.12, + "grad_norm": 0.9631327986717224, + "learning_rate": 4.544638851502917e-05, + "loss": 0.0364, + "step": 30500 + }, + { + "epoch": 9.15, + "grad_norm": 2.7282943725585938, + "learning_rate": 4.5431434125915954e-05, + "loss": 0.0354, + "step": 30600 + }, + { + "epoch": 9.18, + "grad_norm": 0.5908452272415161, + "learning_rate": 4.541647973680275e-05, + "loss": 0.0356, + "step": 30700 + }, + { + "epoch": 9.21, + "grad_norm": 2.3660802841186523, + "learning_rate": 4.540152534768955e-05, + "loss": 0.0413, + "step": 30800 + }, + { + "epoch": 9.24, + "grad_norm": 1.7346217632293701, + "learning_rate": 4.5386570958576346e-05, + "loss": 0.036, + "step": 30900 + }, + { + "epoch": 9.27, + "grad_norm": 1.0829362869262695, + "learning_rate": 4.537161656946314e-05, + "loss": 0.0376, + "step": 31000 + }, + { + "epoch": 9.27, + "eval_loss": 0.226752370595932, + "eval_precision": 0.925325841962565, + "eval_recall": 0.8721635518334924, + "eval_runtime": 302.3165, + "eval_samples_per_second": 44.238, + "eval_steps_per_second": 1.383, + "step": 31000 + }, + { + "epoch": 9.3, + "grad_norm": 1.2249701023101807, + "learning_rate": 4.535666218034993e-05, + "loss": 0.039, + "step": 31100 + }, + { + "epoch": 9.33, + "grad_norm": 2.201986789703369, + "learning_rate": 4.534170779123673e-05, + "loss": 0.0384, + "step": 31200 + }, + { + "epoch": 9.36, + "grad_norm": 0.31157541275024414, + "learning_rate": 4.5326753402123524e-05, + "loss": 0.0318, + "step": 31300 + }, + { + "epoch": 9.39, + "grad_norm": 0.7502834796905518, + "learning_rate": 4.531179901301032e-05, + "loss": 0.0397, + "step": 31400 + }, + { + "epoch": 9.42, + "grad_norm": 0.3627040684223175, + "learning_rate": 4.529684462389712e-05, + "loss": 0.0389, + "step": 31500 + }, + { + "epoch": 9.45, + "grad_norm": 2.008009672164917, + "learning_rate": 4.5281890234783916e-05, + "loss": 0.042, + "step": 31600 + }, + { + "epoch": 9.48, + "grad_norm": 2.5352540016174316, + "learning_rate": 4.52669358456707e-05, + "loss": 0.0407, + "step": 31700 + }, + { + "epoch": 9.51, + "grad_norm": 0.543992280960083, + "learning_rate": 4.52519814565575e-05, + "loss": 0.0309, + "step": 31800 + }, + { + "epoch": 9.54, + "grad_norm": 1.3150848150253296, + "learning_rate": 4.52370270674443e-05, + "loss": 0.0369, + "step": 31900 + }, + { + "epoch": 9.57, + "grad_norm": 1.6026105880737305, + "learning_rate": 4.522207267833109e-05, + "loss": 0.0418, + "step": 32000 + }, + { + "epoch": 9.57, + "eval_loss": 0.21585828065872192, + "eval_precision": 0.9208557844690967, + "eval_recall": 0.8945164567874627, + "eval_runtime": 303.0508, + "eval_samples_per_second": 44.131, + "eval_steps_per_second": 1.379, + "step": 32000 + }, + { + "epoch": 9.6, + "grad_norm": 1.8489359617233276, + "learning_rate": 4.520711828921789e-05, + "loss": 0.0427, + "step": 32100 + }, + { + "epoch": 9.63, + "grad_norm": 2.4979922771453857, + "learning_rate": 4.519216390010468e-05, + "loss": 0.0337, + "step": 32200 + }, + { + "epoch": 9.66, + "grad_norm": 0.3452712595462799, + "learning_rate": 4.517720951099148e-05, + "loss": 0.0347, + "step": 32300 + }, + { + "epoch": 9.69, + "grad_norm": 1.081455945968628, + "learning_rate": 4.516225512187827e-05, + "loss": 0.047, + "step": 32400 + }, + { + "epoch": 9.72, + "grad_norm": 2.3087069988250732, + "learning_rate": 4.5147300732765066e-05, + "loss": 0.0404, + "step": 32500 + }, + { + "epoch": 9.75, + "grad_norm": 1.901135802268982, + "learning_rate": 4.5132346343651865e-05, + "loss": 0.0394, + "step": 32600 + }, + { + "epoch": 9.78, + "grad_norm": 1.2389637231826782, + "learning_rate": 4.511739195453866e-05, + "loss": 0.0376, + "step": 32700 + }, + { + "epoch": 9.81, + "grad_norm": 0.619143545627594, + "learning_rate": 4.510243756542545e-05, + "loss": 0.0414, + "step": 32800 + }, + { + "epoch": 9.84, + "grad_norm": 1.3270721435546875, + "learning_rate": 4.508748317631225e-05, + "loss": 0.0405, + "step": 32900 + }, + { + "epoch": 9.87, + "grad_norm": 2.503606081008911, + "learning_rate": 4.507252878719905e-05, + "loss": 0.0493, + "step": 33000 + }, + { + "epoch": 9.87, + "eval_loss": 0.20709815621376038, + "eval_precision": 0.9246134231259603, + "eval_recall": 0.8708088303211305, + "eval_runtime": 301.957, + "eval_samples_per_second": 44.291, + "eval_steps_per_second": 1.384, + "step": 33000 + }, + { + "epoch": 9.9, + "grad_norm": 0.6343371868133545, + "learning_rate": 4.505757439808584e-05, + "loss": 0.0365, + "step": 33100 + }, + { + "epoch": 9.93, + "grad_norm": 0.3116106688976288, + "learning_rate": 4.5042620008972636e-05, + "loss": 0.0358, + "step": 33200 + }, + { + "epoch": 9.96, + "grad_norm": 0.7307326197624207, + "learning_rate": 4.5027665619859436e-05, + "loss": 0.0411, + "step": 33300 + }, + { + "epoch": 9.99, + "grad_norm": 2.104717493057251, + "learning_rate": 4.501271123074622e-05, + "loss": 0.0401, + "step": 33400 + }, + { + "epoch": 10.02, + "grad_norm": 3.8659448623657227, + "learning_rate": 4.499775684163302e-05, + "loss": 0.0348, + "step": 33500 + }, + { + "epoch": 10.05, + "grad_norm": 1.0324366092681885, + "learning_rate": 4.4982802452519815e-05, + "loss": 0.0344, + "step": 33600 + }, + { + "epoch": 10.08, + "grad_norm": 1.0838052034378052, + "learning_rate": 4.4967848063406614e-05, + "loss": 0.0327, + "step": 33700 + }, + { + "epoch": 10.11, + "grad_norm": 1.8709659576416016, + "learning_rate": 4.495289367429341e-05, + "loss": 0.0267, + "step": 33800 + }, + { + "epoch": 10.14, + "grad_norm": 0.4261041283607483, + "learning_rate": 4.49379392851802e-05, + "loss": 0.0305, + "step": 33900 + }, + { + "epoch": 10.17, + "grad_norm": 0.16497644782066345, + "learning_rate": 4.4922984896067e-05, + "loss": 0.0276, + "step": 34000 + }, + { + "epoch": 10.17, + "eval_loss": 0.2343963235616684, + "eval_precision": 0.9252133285746731, + "eval_recall": 0.8779826965115921, + "eval_runtime": 301.9423, + "eval_samples_per_second": 44.293, + "eval_steps_per_second": 1.384, + "step": 34000 + }, + { + "epoch": 10.2, + "grad_norm": 2.9655115604400635, + "learning_rate": 4.490803050695379e-05, + "loss": 0.0268, + "step": 34100 + }, + { + "epoch": 10.23, + "grad_norm": 1.536979079246521, + "learning_rate": 4.4893076117840586e-05, + "loss": 0.0299, + "step": 34200 + }, + { + "epoch": 10.26, + "grad_norm": 2.8167715072631836, + "learning_rate": 4.4878121728727385e-05, + "loss": 0.0325, + "step": 34300 + }, + { + "epoch": 10.29, + "grad_norm": 2.1207668781280518, + "learning_rate": 4.4863167339614185e-05, + "loss": 0.029, + "step": 34400 + }, + { + "epoch": 10.32, + "grad_norm": 2.277759552001953, + "learning_rate": 4.484821295050097e-05, + "loss": 0.0308, + "step": 34500 + }, + { + "epoch": 10.35, + "grad_norm": 1.226417899131775, + "learning_rate": 4.483325856138777e-05, + "loss": 0.0299, + "step": 34600 + }, + { + "epoch": 10.38, + "grad_norm": 0.63482266664505, + "learning_rate": 4.4818304172274563e-05, + "loss": 0.0337, + "step": 34700 + }, + { + "epoch": 10.41, + "grad_norm": 1.8453493118286133, + "learning_rate": 4.4803349783161356e-05, + "loss": 0.0346, + "step": 34800 + }, + { + "epoch": 10.44, + "grad_norm": 0.40149375796318054, + "learning_rate": 4.4788395394048156e-05, + "loss": 0.03, + "step": 34900 + }, + { + "epoch": 10.47, + "grad_norm": 0.3980793058872223, + "learning_rate": 4.477344100493495e-05, + "loss": 0.035, + "step": 35000 + }, + { + "epoch": 10.47, + "eval_loss": 0.22229593992233276, + "eval_precision": 0.9262946269334285, + "eval_recall": 0.8795221527756396, + "eval_runtime": 302.9773, + "eval_samples_per_second": 44.142, + "eval_steps_per_second": 1.38, + "step": 35000 + }, + { + "epoch": 10.5, + "grad_norm": 0.629266083240509, + "learning_rate": 4.475848661582174e-05, + "loss": 0.0363, + "step": 35100 + }, + { + "epoch": 10.53, + "grad_norm": 1.134805679321289, + "learning_rate": 4.474353222670854e-05, + "loss": 0.0343, + "step": 35200 + }, + { + "epoch": 10.56, + "grad_norm": 1.9168953895568848, + "learning_rate": 4.4728577837595334e-05, + "loss": 0.0333, + "step": 35300 + }, + { + "epoch": 10.59, + "grad_norm": 0.7437408566474915, + "learning_rate": 4.4713623448482134e-05, + "loss": 0.0377, + "step": 35400 + }, + { + "epoch": 10.62, + "grad_norm": 0.8649216890335083, + "learning_rate": 4.469866905936893e-05, + "loss": 0.0387, + "step": 35500 + }, + { + "epoch": 10.65, + "grad_norm": 1.9679126739501953, + "learning_rate": 4.468371467025572e-05, + "loss": 0.0324, + "step": 35600 + }, + { + "epoch": 10.68, + "grad_norm": 1.0343681573867798, + "learning_rate": 4.466876028114252e-05, + "loss": 0.0371, + "step": 35700 + }, + { + "epoch": 10.71, + "grad_norm": 0.3291555941104889, + "learning_rate": 4.465380589202931e-05, + "loss": 0.0339, + "step": 35800 + }, + { + "epoch": 10.74, + "grad_norm": 1.2407808303833008, + "learning_rate": 4.4638851502916105e-05, + "loss": 0.0376, + "step": 35900 + }, + { + "epoch": 10.77, + "grad_norm": 1.2906955480575562, + "learning_rate": 4.4623897113802905e-05, + "loss": 0.0348, + "step": 36000 + }, + { + "epoch": 10.77, + "eval_loss": 0.22172214090824127, + "eval_precision": 0.9251365945617791, + "eval_recall": 0.8914683333846486, + "eval_runtime": 302.63, + "eval_samples_per_second": 44.193, + "eval_steps_per_second": 1.381, + "step": 36000 + }, + { + "epoch": 10.8, + "grad_norm": 0.9678496718406677, + "learning_rate": 4.46089427246897e-05, + "loss": 0.0354, + "step": 36100 + }, + { + "epoch": 10.83, + "grad_norm": 1.92240571975708, + "learning_rate": 4.459398833557649e-05, + "loss": 0.0324, + "step": 36200 + }, + { + "epoch": 10.86, + "grad_norm": 2.5916824340820312, + "learning_rate": 4.457903394646329e-05, + "loss": 0.034, + "step": 36300 + }, + { + "epoch": 10.89, + "grad_norm": 1.4677050113677979, + "learning_rate": 4.456407955735008e-05, + "loss": 0.0304, + "step": 36400 + }, + { + "epoch": 10.92, + "grad_norm": 1.1423336267471313, + "learning_rate": 4.4549125168236876e-05, + "loss": 0.0315, + "step": 36500 + }, + { + "epoch": 10.94, + "grad_norm": 1.0664762258529663, + "learning_rate": 4.4534170779123676e-05, + "loss": 0.0371, + "step": 36600 + }, + { + "epoch": 10.97, + "grad_norm": 1.344557762145996, + "learning_rate": 4.451921639001047e-05, + "loss": 0.0334, + "step": 36700 + }, + { + "epoch": 11.0, + "grad_norm": 2.944450616836548, + "learning_rate": 4.450426200089727e-05, + "loss": 0.0312, + "step": 36800 + }, + { + "epoch": 11.03, + "grad_norm": 1.02321195602417, + "learning_rate": 4.448930761178406e-05, + "loss": 0.0243, + "step": 36900 + }, + { + "epoch": 11.06, + "grad_norm": 1.4520535469055176, + "learning_rate": 4.4474353222670854e-05, + "loss": 0.0263, + "step": 37000 + }, + { + "epoch": 11.06, + "eval_loss": 0.23973342776298523, + "eval_precision": 0.928783958602846, + "eval_recall": 0.8842020998183442, + "eval_runtime": 302.1259, + "eval_samples_per_second": 44.266, + "eval_steps_per_second": 1.384, + "step": 37000 + }, + { + "epoch": 11.09, + "grad_norm": 0.9927899837493896, + "learning_rate": 4.4459398833557654e-05, + "loss": 0.0251, + "step": 37100 + }, + { + "epoch": 11.12, + "grad_norm": 0.7255445122718811, + "learning_rate": 4.4444444444444447e-05, + "loss": 0.023, + "step": 37200 + }, + { + "epoch": 11.15, + "grad_norm": 1.2551404237747192, + "learning_rate": 4.442949005533124e-05, + "loss": 0.0282, + "step": 37300 + }, + { + "epoch": 11.18, + "grad_norm": 1.8652236461639404, + "learning_rate": 4.441453566621804e-05, + "loss": 0.0265, + "step": 37400 + }, + { + "epoch": 11.21, + "grad_norm": 0.29598140716552734, + "learning_rate": 4.439958127710483e-05, + "loss": 0.0231, + "step": 37500 + }, + { + "epoch": 11.24, + "grad_norm": 0.517977774143219, + "learning_rate": 4.4384626887991625e-05, + "loss": 0.0266, + "step": 37600 + }, + { + "epoch": 11.27, + "grad_norm": 1.3159215450286865, + "learning_rate": 4.4369672498878425e-05, + "loss": 0.0246, + "step": 37700 + }, + { + "epoch": 11.3, + "grad_norm": 1.8311362266540527, + "learning_rate": 4.435471810976522e-05, + "loss": 0.0325, + "step": 37800 + }, + { + "epoch": 11.33, + "grad_norm": 2.8861258029937744, + "learning_rate": 4.433976372065201e-05, + "loss": 0.0303, + "step": 37900 + }, + { + "epoch": 11.36, + "grad_norm": 0.6612695455551147, + "learning_rate": 4.432480933153881e-05, + "loss": 0.0284, + "step": 38000 + }, + { + "epoch": 11.36, + "eval_loss": 0.23250487446784973, + "eval_precision": 0.9248716302952503, + "eval_recall": 0.8873118014717202, + "eval_runtime": 302.5481, + "eval_samples_per_second": 44.205, + "eval_steps_per_second": 1.382, + "step": 38000 + }, + { + "epoch": 11.39, + "grad_norm": 0.8181266784667969, + "learning_rate": 4.43098549424256e-05, + "loss": 0.0251, + "step": 38100 + }, + { + "epoch": 11.42, + "grad_norm": 0.48834991455078125, + "learning_rate": 4.42949005533124e-05, + "loss": 0.0313, + "step": 38200 + }, + { + "epoch": 11.45, + "grad_norm": 0.4897523820400238, + "learning_rate": 4.4279946164199195e-05, + "loss": 0.0328, + "step": 38300 + }, + { + "epoch": 11.48, + "grad_norm": 0.7222294807434082, + "learning_rate": 4.426499177508599e-05, + "loss": 0.0298, + "step": 38400 + }, + { + "epoch": 11.51, + "grad_norm": 0.07086914777755737, + "learning_rate": 4.425003738597279e-05, + "loss": 0.032, + "step": 38500 + }, + { + "epoch": 11.54, + "grad_norm": 1.4812002182006836, + "learning_rate": 4.423508299685958e-05, + "loss": 0.0282, + "step": 38600 + }, + { + "epoch": 11.57, + "grad_norm": 1.302590012550354, + "learning_rate": 4.4220128607746374e-05, + "loss": 0.027, + "step": 38700 + }, + { + "epoch": 11.6, + "grad_norm": 1.9532426595687866, + "learning_rate": 4.420517421863317e-05, + "loss": 0.0304, + "step": 38800 + }, + { + "epoch": 11.63, + "grad_norm": 2.029754638671875, + "learning_rate": 4.4190219829519966e-05, + "loss": 0.0301, + "step": 38900 + }, + { + "epoch": 11.66, + "grad_norm": 1.320448398590088, + "learning_rate": 4.417526544040676e-05, + "loss": 0.0277, + "step": 39000 + }, + { + "epoch": 11.66, + "eval_loss": 0.241913303732872, + "eval_precision": 0.9234215627085253, + "eval_recall": 0.8947627697897103, + "eval_runtime": 303.3685, + "eval_samples_per_second": 44.085, + "eval_steps_per_second": 1.378, + "step": 39000 + }, + { + "epoch": 11.69, + "grad_norm": 1.8267722129821777, + "learning_rate": 4.416031105129356e-05, + "loss": 0.0249, + "step": 39100 + }, + { + "epoch": 11.72, + "grad_norm": 0.7122277021408081, + "learning_rate": 4.414535666218035e-05, + "loss": 0.0323, + "step": 39200 + }, + { + "epoch": 11.75, + "grad_norm": 0.5691227316856384, + "learning_rate": 4.4130402273067145e-05, + "loss": 0.0325, + "step": 39300 + }, + { + "epoch": 11.78, + "grad_norm": 0.40894216299057007, + "learning_rate": 4.4115447883953944e-05, + "loss": 0.0301, + "step": 39400 + }, + { + "epoch": 11.81, + "grad_norm": 2.4805972576141357, + "learning_rate": 4.410049349484074e-05, + "loss": 0.0277, + "step": 39500 + }, + { + "epoch": 11.84, + "grad_norm": 1.2774219512939453, + "learning_rate": 4.408553910572754e-05, + "loss": 0.0278, + "step": 39600 + }, + { + "epoch": 11.87, + "grad_norm": 1.267562985420227, + "learning_rate": 4.407058471661433e-05, + "loss": 0.0286, + "step": 39700 + }, + { + "epoch": 11.9, + "grad_norm": 0.6910821795463562, + "learning_rate": 4.405563032750112e-05, + "loss": 0.0344, + "step": 39800 + }, + { + "epoch": 11.93, + "grad_norm": 0.3539283275604248, + "learning_rate": 4.404067593838792e-05, + "loss": 0.0298, + "step": 39900 + }, + { + "epoch": 11.96, + "grad_norm": 1.7098407745361328, + "learning_rate": 4.4025721549274715e-05, + "loss": 0.0318, + "step": 40000 + }, + { + "epoch": 11.96, + "eval_loss": 0.23493793606758118, + "eval_precision": 0.9253437490076529, + "eval_recall": 0.8971951106869054, + "eval_runtime": 302.3541, + "eval_samples_per_second": 44.233, + "eval_steps_per_second": 1.382, + "step": 40000 + }, + { + "epoch": 11.99, + "grad_norm": 2.1748311519622803, + "learning_rate": 4.401076716016151e-05, + "loss": 0.0312, + "step": 40100 + }, + { + "epoch": 12.02, + "grad_norm": 0.8426460027694702, + "learning_rate": 4.399581277104831e-05, + "loss": 0.0262, + "step": 40200 + }, + { + "epoch": 12.05, + "grad_norm": 0.3200826048851013, + "learning_rate": 4.39808583819351e-05, + "loss": 0.0237, + "step": 40300 + }, + { + "epoch": 12.08, + "grad_norm": 0.2708234488964081, + "learning_rate": 4.3965903992821893e-05, + "loss": 0.0229, + "step": 40400 + }, + { + "epoch": 12.11, + "grad_norm": 1.4237157106399536, + "learning_rate": 4.395094960370869e-05, + "loss": 0.0198, + "step": 40500 + }, + { + "epoch": 12.14, + "grad_norm": 0.06805676221847534, + "learning_rate": 4.3935995214595486e-05, + "loss": 0.026, + "step": 40600 + }, + { + "epoch": 12.17, + "grad_norm": 1.2842926979064941, + "learning_rate": 4.392104082548228e-05, + "loss": 0.0241, + "step": 40700 + }, + { + "epoch": 12.2, + "grad_norm": 1.5190855264663696, + "learning_rate": 4.390608643636908e-05, + "loss": 0.0232, + "step": 40800 + }, + { + "epoch": 12.23, + "grad_norm": 1.8280004262924194, + "learning_rate": 4.389113204725587e-05, + "loss": 0.0241, + "step": 40900 + }, + { + "epoch": 12.26, + "grad_norm": 0.19059352576732635, + "learning_rate": 4.3876177658142664e-05, + "loss": 0.0238, + "step": 41000 + }, + { + "epoch": 12.26, + "eval_loss": 0.24695585668087006, + "eval_precision": 0.9256610729722858, + "eval_recall": 0.8967332738076911, + "eval_runtime": 302.2734, + "eval_samples_per_second": 44.245, + "eval_steps_per_second": 1.383, + "step": 41000 + }, + { + "epoch": 12.29, + "grad_norm": 0.40746474266052246, + "learning_rate": 4.3861223269029464e-05, + "loss": 0.0232, + "step": 41100 + }, + { + "epoch": 12.32, + "grad_norm": 1.2412996292114258, + "learning_rate": 4.384626887991626e-05, + "loss": 0.0215, + "step": 41200 + }, + { + "epoch": 12.35, + "grad_norm": 0.2166558802127838, + "learning_rate": 4.3831314490803056e-05, + "loss": 0.0237, + "step": 41300 + }, + { + "epoch": 12.38, + "grad_norm": 0.719872236251831, + "learning_rate": 4.381636010168985e-05, + "loss": 0.0253, + "step": 41400 + }, + { + "epoch": 12.41, + "grad_norm": 1.5946626663208008, + "learning_rate": 4.380140571257664e-05, + "loss": 0.0235, + "step": 41500 + }, + { + "epoch": 12.44, + "grad_norm": 1.0119950771331787, + "learning_rate": 4.378645132346344e-05, + "loss": 0.0257, + "step": 41600 + }, + { + "epoch": 12.47, + "grad_norm": 0.9327923059463501, + "learning_rate": 4.377149693435023e-05, + "loss": 0.0243, + "step": 41700 + }, + { + "epoch": 12.5, + "grad_norm": 0.41256028413772583, + "learning_rate": 4.375654254523703e-05, + "loss": 0.0272, + "step": 41800 + }, + { + "epoch": 12.53, + "grad_norm": 0.1845785677433014, + "learning_rate": 4.374158815612383e-05, + "loss": 0.029, + "step": 41900 + }, + { + "epoch": 12.56, + "grad_norm": 1.754239559173584, + "learning_rate": 4.372663376701062e-05, + "loss": 0.0252, + "step": 42000 + }, + { + "epoch": 12.56, + "eval_loss": 0.2473253309726715, + "eval_precision": 0.9269791733010636, + "eval_recall": 0.8962406478031959, + "eval_runtime": 304.5592, + "eval_samples_per_second": 43.913, + "eval_steps_per_second": 1.372, + "step": 42000 + }, + { + "epoch": 12.59, + "grad_norm": 0.5748271346092224, + "learning_rate": 4.371167937789741e-05, + "loss": 0.0281, + "step": 42100 + }, + { + "epoch": 12.62, + "grad_norm": 0.36274582147598267, + "learning_rate": 4.369672498878421e-05, + "loss": 0.0248, + "step": 42200 + }, + { + "epoch": 12.65, + "grad_norm": 0.6130300164222717, + "learning_rate": 4.3681770599671006e-05, + "loss": 0.0269, + "step": 42300 + }, + { + "epoch": 12.68, + "grad_norm": 1.2477418184280396, + "learning_rate": 4.36668162105578e-05, + "loss": 0.0259, + "step": 42400 + }, + { + "epoch": 12.71, + "grad_norm": 0.8152483701705933, + "learning_rate": 4.36518618214446e-05, + "loss": 0.0263, + "step": 42500 + }, + { + "epoch": 12.74, + "grad_norm": 0.04731460288167, + "learning_rate": 4.363690743233139e-05, + "loss": 0.024, + "step": 42600 + }, + { + "epoch": 12.77, + "grad_norm": 0.7886996865272522, + "learning_rate": 4.362195304321819e-05, + "loss": 0.0245, + "step": 42700 + }, + { + "epoch": 12.8, + "grad_norm": 2.1900315284729004, + "learning_rate": 4.360699865410498e-05, + "loss": 0.0292, + "step": 42800 + }, + { + "epoch": 12.83, + "grad_norm": 0.45924192667007446, + "learning_rate": 4.3592044264991777e-05, + "loss": 0.0261, + "step": 42900 + }, + { + "epoch": 12.86, + "grad_norm": 0.07307754456996918, + "learning_rate": 4.3577089875878576e-05, + "loss": 0.0248, + "step": 43000 + }, + { + "epoch": 12.86, + "eval_loss": 0.24504822492599487, + "eval_precision": 0.9273960876319711, + "eval_recall": 0.9006127035930909, + "eval_runtime": 303.9567, + "eval_samples_per_second": 44.0, + "eval_steps_per_second": 1.375, + "step": 43000 + }, + { + "epoch": 12.89, + "grad_norm": 0.4676400423049927, + "learning_rate": 4.356213548676536e-05, + "loss": 0.0232, + "step": 43100 + }, + { + "epoch": 12.92, + "grad_norm": 0.2993585765361786, + "learning_rate": 4.354718109765216e-05, + "loss": 0.0237, + "step": 43200 + }, + { + "epoch": 12.95, + "grad_norm": 1.226276159286499, + "learning_rate": 4.353222670853896e-05, + "loss": 0.0256, + "step": 43300 + }, + { + "epoch": 12.98, + "grad_norm": 1.5110477209091187, + "learning_rate": 4.3517272319425754e-05, + "loss": 0.0285, + "step": 43400 + }, + { + "epoch": 13.01, + "grad_norm": 1.6162513494491577, + "learning_rate": 4.350231793031255e-05, + "loss": 0.0219, + "step": 43500 + }, + { + "epoch": 13.04, + "grad_norm": 0.1792839914560318, + "learning_rate": 4.348736354119935e-05, + "loss": 0.0191, + "step": 43600 + }, + { + "epoch": 13.07, + "grad_norm": 1.9044649600982666, + "learning_rate": 4.347240915208614e-05, + "loss": 0.017, + "step": 43700 + }, + { + "epoch": 13.1, + "grad_norm": 0.5899202823638916, + "learning_rate": 4.345745476297293e-05, + "loss": 0.0241, + "step": 43800 + }, + { + "epoch": 13.13, + "grad_norm": 0.6521077752113342, + "learning_rate": 4.344250037385973e-05, + "loss": 0.0216, + "step": 43900 + }, + { + "epoch": 13.16, + "grad_norm": 0.7596339583396912, + "learning_rate": 4.3427545984746525e-05, + "loss": 0.0181, + "step": 44000 + }, + { + "epoch": 13.16, + "eval_loss": 0.2613174319267273, + "eval_precision": 0.9276514907592247, + "eval_recall": 0.8870654884694725, + "eval_runtime": 304.3764, + "eval_samples_per_second": 43.939, + "eval_steps_per_second": 1.373, + "step": 44000 + }, + { + "epoch": 13.19, + "grad_norm": 1.0404387712478638, + "learning_rate": 4.3412591595633325e-05, + "loss": 0.0247, + "step": 44100 + }, + { + "epoch": 13.22, + "grad_norm": 1.7849115133285522, + "learning_rate": 4.339763720652011e-05, + "loss": 0.0188, + "step": 44200 + }, + { + "epoch": 13.25, + "grad_norm": 1.0972092151641846, + "learning_rate": 4.338268281740691e-05, + "loss": 0.0255, + "step": 44300 + }, + { + "epoch": 13.28, + "grad_norm": 0.7391771078109741, + "learning_rate": 4.336772842829371e-05, + "loss": 0.0225, + "step": 44400 + }, + { + "epoch": 13.31, + "grad_norm": 1.5010148286819458, + "learning_rate": 4.3352774039180497e-05, + "loss": 0.0217, + "step": 44500 + }, + { + "epoch": 13.34, + "grad_norm": 0.7189137935638428, + "learning_rate": 4.3337819650067296e-05, + "loss": 0.0211, + "step": 44600 + }, + { + "epoch": 13.37, + "grad_norm": 1.003636121749878, + "learning_rate": 4.3322865260954096e-05, + "loss": 0.0236, + "step": 44700 + }, + { + "epoch": 13.4, + "grad_norm": 0.914703369140625, + "learning_rate": 4.330791087184089e-05, + "loss": 0.0224, + "step": 44800 + }, + { + "epoch": 13.43, + "grad_norm": 0.1861487776041031, + "learning_rate": 4.329295648272768e-05, + "loss": 0.0251, + "step": 44900 + }, + { + "epoch": 13.46, + "grad_norm": 0.7734150886535645, + "learning_rate": 4.327800209361448e-05, + "loss": 0.0254, + "step": 45000 + }, + { + "epoch": 13.46, + "eval_loss": 0.2583397924900055, + "eval_precision": 0.9213451745124829, + "eval_recall": 0.9135441362110902, + "eval_runtime": 305.1941, + "eval_samples_per_second": 43.821, + "eval_steps_per_second": 1.37, + "step": 45000 + }, + { + "epoch": 13.49, + "grad_norm": 0.7596560716629028, + "learning_rate": 4.3263047704501274e-05, + "loss": 0.0246, + "step": 45100 + }, + { + "epoch": 13.52, + "grad_norm": 1.4200429916381836, + "learning_rate": 4.324809331538807e-05, + "loss": 0.0174, + "step": 45200 + }, + { + "epoch": 13.55, + "grad_norm": 2.7082788944244385, + "learning_rate": 4.323313892627486e-05, + "loss": 0.026, + "step": 45300 + }, + { + "epoch": 13.58, + "grad_norm": 1.2132717370986938, + "learning_rate": 4.321818453716166e-05, + "loss": 0.0228, + "step": 45400 + }, + { + "epoch": 13.61, + "grad_norm": 3.768927812576294, + "learning_rate": 4.320323014804846e-05, + "loss": 0.0236, + "step": 45500 + }, + { + "epoch": 13.64, + "grad_norm": 1.5163260698318481, + "learning_rate": 4.3188275758935245e-05, + "loss": 0.0189, + "step": 45600 + }, + { + "epoch": 13.67, + "grad_norm": 0.7969369888305664, + "learning_rate": 4.3173321369822045e-05, + "loss": 0.0245, + "step": 45700 + }, + { + "epoch": 13.7, + "grad_norm": 1.445375680923462, + "learning_rate": 4.3158366980708845e-05, + "loss": 0.0232, + "step": 45800 + }, + { + "epoch": 13.73, + "grad_norm": 0.04813400283455849, + "learning_rate": 4.314341259159563e-05, + "loss": 0.0215, + "step": 45900 + }, + { + "epoch": 13.76, + "grad_norm": 2.0303447246551514, + "learning_rate": 4.312845820248243e-05, + "loss": 0.0206, + "step": 46000 + }, + { + "epoch": 13.76, + "eval_loss": 0.2769757807254791, + "eval_precision": 0.9277020832674738, + "eval_recall": 0.9035376704947813, + "eval_runtime": 304.0355, + "eval_samples_per_second": 43.988, + "eval_steps_per_second": 1.375, + "step": 46000 + }, + { + "epoch": 13.79, + "grad_norm": 0.9254265427589417, + "learning_rate": 4.311350381336923e-05, + "loss": 0.0203, + "step": 46100 + }, + { + "epoch": 13.82, + "grad_norm": 2.1310763359069824, + "learning_rate": 4.309854942425602e-05, + "loss": 0.0206, + "step": 46200 + }, + { + "epoch": 13.85, + "grad_norm": 0.5353107452392578, + "learning_rate": 4.3083595035142816e-05, + "loss": 0.0206, + "step": 46300 + }, + { + "epoch": 13.88, + "grad_norm": 0.9395775198936462, + "learning_rate": 4.306864064602961e-05, + "loss": 0.0304, + "step": 46400 + }, + { + "epoch": 13.91, + "grad_norm": 0.056145694106817245, + "learning_rate": 4.305368625691641e-05, + "loss": 0.0237, + "step": 46500 + }, + { + "epoch": 13.94, + "grad_norm": 0.03264997899532318, + "learning_rate": 4.30387318678032e-05, + "loss": 0.0244, + "step": 46600 + }, + { + "epoch": 13.97, + "grad_norm": 1.6055926084518433, + "learning_rate": 4.3023777478689994e-05, + "loss": 0.0224, + "step": 46700 + }, + { + "epoch": 14.0, + "grad_norm": 1.4891152381896973, + "learning_rate": 4.3008823089576794e-05, + "loss": 0.021, + "step": 46800 + }, + { + "epoch": 14.03, + "grad_norm": 0.3057061731815338, + "learning_rate": 4.299386870046359e-05, + "loss": 0.0173, + "step": 46900 + }, + { + "epoch": 14.06, + "grad_norm": 1.0254565477371216, + "learning_rate": 4.297891431135038e-05, + "loss": 0.017, + "step": 47000 + }, + { + "epoch": 14.06, + "eval_loss": 0.2714207172393799, + "eval_precision": 0.9283886660138359, + "eval_recall": 0.9048923920071431, + "eval_runtime": 302.2817, + "eval_samples_per_second": 44.244, + "eval_steps_per_second": 1.383, + "step": 47000 + }, + { + "epoch": 14.08, + "grad_norm": 0.6178631782531738, + "learning_rate": 4.296395992223718e-05, + "loss": 0.021, + "step": 47100 + }, + { + "epoch": 14.11, + "grad_norm": 3.516096353530884, + "learning_rate": 4.294900553312398e-05, + "loss": 0.0181, + "step": 47200 + }, + { + "epoch": 14.14, + "grad_norm": 0.20362690091133118, + "learning_rate": 4.2934051144010765e-05, + "loss": 0.0193, + "step": 47300 + }, + { + "epoch": 14.17, + "grad_norm": 2.5930867195129395, + "learning_rate": 4.2919096754897565e-05, + "loss": 0.0176, + "step": 47400 + }, + { + "epoch": 14.2, + "grad_norm": 1.4823873043060303, + "learning_rate": 4.2904142365784364e-05, + "loss": 0.0173, + "step": 47500 + }, + { + "epoch": 14.23, + "grad_norm": 0.5278753042221069, + "learning_rate": 4.288918797667115e-05, + "loss": 0.0212, + "step": 47600 + }, + { + "epoch": 14.26, + "grad_norm": 1.855218529701233, + "learning_rate": 4.287423358755795e-05, + "loss": 0.0199, + "step": 47700 + }, + { + "epoch": 14.29, + "grad_norm": 0.31464433670043945, + "learning_rate": 4.285927919844474e-05, + "loss": 0.0241, + "step": 47800 + }, + { + "epoch": 14.32, + "grad_norm": 0.2182936817407608, + "learning_rate": 4.284432480933154e-05, + "loss": 0.0172, + "step": 47900 + }, + { + "epoch": 14.35, + "grad_norm": 1.2800421714782715, + "learning_rate": 4.2829370420218336e-05, + "loss": 0.0188, + "step": 48000 + }, + { + "epoch": 14.35, + "eval_loss": 0.26452192664146423, + "eval_precision": 0.9272217673363986, + "eval_recall": 0.9065242156470334, + "eval_runtime": 302.9199, + "eval_samples_per_second": 44.15, + "eval_steps_per_second": 1.38, + "step": 48000 + }, + { + "epoch": 14.38, + "grad_norm": 3.320737361907959, + "learning_rate": 4.281441603110513e-05, + "loss": 0.0198, + "step": 48100 + }, + { + "epoch": 14.41, + "grad_norm": 0.8519121408462524, + "learning_rate": 4.279946164199193e-05, + "loss": 0.0182, + "step": 48200 + }, + { + "epoch": 14.44, + "grad_norm": 0.4318147599697113, + "learning_rate": 4.278450725287872e-05, + "loss": 0.0178, + "step": 48300 + }, + { + "epoch": 14.47, + "grad_norm": 0.047759074717760086, + "learning_rate": 4.2769552863765514e-05, + "loss": 0.021, + "step": 48400 + }, + { + "epoch": 14.5, + "grad_norm": 1.6022422313690186, + "learning_rate": 4.2754598474652314e-05, + "loss": 0.0144, + "step": 48500 + }, + { + "epoch": 14.53, + "grad_norm": 0.7104184031486511, + "learning_rate": 4.273964408553911e-05, + "loss": 0.0207, + "step": 48600 + }, + { + "epoch": 14.56, + "grad_norm": 1.5093780755996704, + "learning_rate": 4.27246896964259e-05, + "loss": 0.0205, + "step": 48700 + }, + { + "epoch": 14.59, + "grad_norm": 0.7566470503807068, + "learning_rate": 4.27097353073127e-05, + "loss": 0.0187, + "step": 48800 + }, + { + "epoch": 14.62, + "grad_norm": 1.222693920135498, + "learning_rate": 4.269478091819949e-05, + "loss": 0.0199, + "step": 48900 + }, + { + "epoch": 14.65, + "grad_norm": 1.5546650886535645, + "learning_rate": 4.2679826529086285e-05, + "loss": 0.0188, + "step": 49000 + }, + { + "epoch": 14.65, + "eval_loss": 0.2760772109031677, + "eval_precision": 0.9305101058710299, + "eval_recall": 0.8930077896486961, + "eval_runtime": 301.8588, + "eval_samples_per_second": 44.305, + "eval_steps_per_second": 1.385, + "step": 49000 + }, + { + "epoch": 14.68, + "grad_norm": 0.6152912378311157, + "learning_rate": 4.2664872139973084e-05, + "loss": 0.0199, + "step": 49100 + }, + { + "epoch": 14.71, + "grad_norm": 0.8479551672935486, + "learning_rate": 4.264991775085988e-05, + "loss": 0.0236, + "step": 49200 + }, + { + "epoch": 14.74, + "grad_norm": 2.0793190002441406, + "learning_rate": 4.263496336174668e-05, + "loss": 0.0257, + "step": 49300 + }, + { + "epoch": 14.77, + "grad_norm": 0.9795339107513428, + "learning_rate": 4.262000897263347e-05, + "loss": 0.019, + "step": 49400 + }, + { + "epoch": 14.8, + "grad_norm": 0.49018004536628723, + "learning_rate": 4.260505458352026e-05, + "loss": 0.0207, + "step": 49500 + }, + { + "epoch": 14.83, + "grad_norm": 0.22400274872779846, + "learning_rate": 4.259010019440706e-05, + "loss": 0.0212, + "step": 49600 + }, + { + "epoch": 14.86, + "grad_norm": 0.8345464468002319, + "learning_rate": 4.2575145805293855e-05, + "loss": 0.0182, + "step": 49700 + }, + { + "epoch": 14.89, + "grad_norm": 0.2443341612815857, + "learning_rate": 4.256019141618065e-05, + "loss": 0.0177, + "step": 49800 + }, + { + "epoch": 14.92, + "grad_norm": 0.697216272354126, + "learning_rate": 4.254523702706745e-05, + "loss": 0.0216, + "step": 49900 + }, + { + "epoch": 14.95, + "grad_norm": 0.5050187706947327, + "learning_rate": 4.253028263795424e-05, + "loss": 0.0166, + "step": 50000 + }, + { + "epoch": 14.95, + "eval_loss": 0.282767653465271, + "eval_precision": 0.9254008757836374, + "eval_recall": 0.9044305551279288, + "eval_runtime": 303.3682, + "eval_samples_per_second": 44.085, + "eval_steps_per_second": 1.378, + "step": 50000 + }, + { + "epoch": 14.98, + "grad_norm": 0.4018344283103943, + "learning_rate": 4.2515328248841034e-05, + "loss": 0.02, + "step": 50100 + }, + { + "epoch": 15.01, + "grad_norm": 2.2681732177734375, + "learning_rate": 4.250037385972783e-05, + "loss": 0.0169, + "step": 50200 + }, + { + "epoch": 15.04, + "grad_norm": 0.18065716326236725, + "learning_rate": 4.2485419470614626e-05, + "loss": 0.0163, + "step": 50300 + }, + { + "epoch": 15.07, + "grad_norm": 1.0265353918075562, + "learning_rate": 4.247046508150142e-05, + "loss": 0.0201, + "step": 50400 + }, + { + "epoch": 15.1, + "grad_norm": 1.7455101013183594, + "learning_rate": 4.245551069238822e-05, + "loss": 0.0174, + "step": 50500 + }, + { + "epoch": 15.13, + "grad_norm": 0.03697839379310608, + "learning_rate": 4.244055630327501e-05, + "loss": 0.021, + "step": 50600 + }, + { + "epoch": 15.16, + "grad_norm": 0.10842275619506836, + "learning_rate": 4.242560191416181e-05, + "loss": 0.0196, + "step": 50700 + }, + { + "epoch": 15.19, + "grad_norm": 0.6541497111320496, + "learning_rate": 4.2410647525048604e-05, + "loss": 0.019, + "step": 50800 + }, + { + "epoch": 15.22, + "grad_norm": 1.3006408214569092, + "learning_rate": 4.23956931359354e-05, + "loss": 0.0178, + "step": 50900 + }, + { + "epoch": 15.25, + "grad_norm": 0.6021150350570679, + "learning_rate": 4.23807387468222e-05, + "loss": 0.0199, + "step": 51000 + }, + { + "epoch": 15.25, + "eval_loss": 0.2640076279640198, + "eval_precision": 0.9301819557882123, + "eval_recall": 0.9081868284122048, + "eval_runtime": 302.9987, + "eval_samples_per_second": 44.139, + "eval_steps_per_second": 1.38, + "step": 51000 + }, + { + "epoch": 15.28, + "grad_norm": 0.8783787488937378, + "learning_rate": 4.236578435770899e-05, + "loss": 0.0175, + "step": 51100 + }, + { + "epoch": 15.31, + "grad_norm": 0.18405625224113464, + "learning_rate": 4.235082996859578e-05, + "loss": 0.0152, + "step": 51200 + }, + { + "epoch": 15.34, + "grad_norm": 0.03877532109618187, + "learning_rate": 4.233587557948258e-05, + "loss": 0.0174, + "step": 51300 + }, + { + "epoch": 15.37, + "grad_norm": 0.3079793155193329, + "learning_rate": 4.2320921190369375e-05, + "loss": 0.015, + "step": 51400 + }, + { + "epoch": 15.4, + "grad_norm": 0.9296764731407166, + "learning_rate": 4.230596680125617e-05, + "loss": 0.0177, + "step": 51500 + }, + { + "epoch": 15.43, + "grad_norm": 0.7762422561645508, + "learning_rate": 4.229101241214297e-05, + "loss": 0.0195, + "step": 51600 + }, + { + "epoch": 15.46, + "grad_norm": 2.472615957260132, + "learning_rate": 4.227605802302976e-05, + "loss": 0.0195, + "step": 51700 + }, + { + "epoch": 15.49, + "grad_norm": 2.8045852184295654, + "learning_rate": 4.226110363391655e-05, + "loss": 0.0201, + "step": 51800 + }, + { + "epoch": 15.52, + "grad_norm": 0.053874421864748, + "learning_rate": 4.224614924480335e-05, + "loss": 0.018, + "step": 51900 + }, + { + "epoch": 15.55, + "grad_norm": 0.3398553729057312, + "learning_rate": 4.2231194855690146e-05, + "loss": 0.0167, + "step": 52000 + }, + { + "epoch": 15.55, + "eval_loss": 0.2754287123680115, + "eval_precision": 0.927292017724521, + "eval_recall": 0.914929646848733, + "eval_runtime": 302.6973, + "eval_samples_per_second": 44.183, + "eval_steps_per_second": 1.381, + "step": 52000 + }, + { + "epoch": 15.58, + "grad_norm": 1.1841187477111816, + "learning_rate": 4.2216240466576945e-05, + "loss": 0.0157, + "step": 52100 + }, + { + "epoch": 15.61, + "grad_norm": 1.0184565782546997, + "learning_rate": 4.220128607746374e-05, + "loss": 0.0145, + "step": 52200 + }, + { + "epoch": 15.64, + "grad_norm": 0.6707783937454224, + "learning_rate": 4.218633168835053e-05, + "loss": 0.0215, + "step": 52300 + }, + { + "epoch": 15.67, + "grad_norm": 0.8084210157394409, + "learning_rate": 4.217137729923733e-05, + "loss": 0.0185, + "step": 52400 + }, + { + "epoch": 15.7, + "grad_norm": 0.24998579919338226, + "learning_rate": 4.2156422910124124e-05, + "loss": 0.0192, + "step": 52500 + }, + { + "epoch": 15.73, + "grad_norm": 0.11048603802919388, + "learning_rate": 4.214146852101092e-05, + "loss": 0.0177, + "step": 52600 + }, + { + "epoch": 15.76, + "grad_norm": 0.8540931940078735, + "learning_rate": 4.2126514131897716e-05, + "loss": 0.018, + "step": 52700 + }, + { + "epoch": 15.79, + "grad_norm": 0.3726775646209717, + "learning_rate": 4.211155974278451e-05, + "loss": 0.0181, + "step": 52800 + }, + { + "epoch": 15.82, + "grad_norm": 0.13543102145195007, + "learning_rate": 4.20966053536713e-05, + "loss": 0.0201, + "step": 52900 + }, + { + "epoch": 15.85, + "grad_norm": 0.3862367570400238, + "learning_rate": 4.20816509645581e-05, + "loss": 0.0184, + "step": 53000 + }, + { + "epoch": 15.85, + "eval_loss": 0.2746909558773041, + "eval_precision": 0.9290507850298093, + "eval_recall": 0.9164075248622187, + "eval_runtime": 304.9585, + "eval_samples_per_second": 43.855, + "eval_steps_per_second": 1.371, + "step": 53000 + }, + { + "epoch": 15.88, + "grad_norm": 0.5059983730316162, + "learning_rate": 4.2066696575444895e-05, + "loss": 0.0182, + "step": 53100 + }, + { + "epoch": 15.91, + "grad_norm": 0.45346036553382874, + "learning_rate": 4.205174218633169e-05, + "loss": 0.0208, + "step": 53200 + }, + { + "epoch": 15.94, + "grad_norm": 1.0658683776855469, + "learning_rate": 4.203678779721849e-05, + "loss": 0.0149, + "step": 53300 + }, + { + "epoch": 15.97, + "grad_norm": 0.2168959081172943, + "learning_rate": 4.202183340810528e-05, + "loss": 0.0191, + "step": 53400 + }, + { + "epoch": 16.0, + "grad_norm": 1.7620713710784912, + "learning_rate": 4.200687901899207e-05, + "loss": 0.0219, + "step": 53500 + }, + { + "epoch": 16.03, + "grad_norm": 0.33198004961013794, + "learning_rate": 4.199192462987887e-05, + "loss": 0.014, + "step": 53600 + }, + { + "epoch": 16.06, + "grad_norm": 3.614070415496826, + "learning_rate": 4.1976970240765665e-05, + "loss": 0.0132, + "step": 53700 + }, + { + "epoch": 16.09, + "grad_norm": 0.7846044898033142, + "learning_rate": 4.1962015851652465e-05, + "loss": 0.014, + "step": 53800 + }, + { + "epoch": 16.12, + "grad_norm": 1.2382973432540894, + "learning_rate": 4.194706146253926e-05, + "loss": 0.0198, + "step": 53900 + }, + { + "epoch": 16.15, + "grad_norm": 1.7487576007843018, + "learning_rate": 4.193210707342605e-05, + "loss": 0.0156, + "step": 54000 + }, + { + "epoch": 16.15, + "eval_loss": 0.27493321895599365, + "eval_precision": 0.926791958041958, + "eval_recall": 0.9140367622155855, + "eval_runtime": 304.8434, + "eval_samples_per_second": 43.872, + "eval_steps_per_second": 1.371, + "step": 54000 + }, + { + "epoch": 16.18, + "grad_norm": 2.473257541656494, + "learning_rate": 4.191715268431285e-05, + "loss": 0.0144, + "step": 54100 + }, + { + "epoch": 16.21, + "grad_norm": 1.7735458612442017, + "learning_rate": 4.1902198295199643e-05, + "loss": 0.0128, + "step": 54200 + }, + { + "epoch": 16.24, + "grad_norm": 0.09201900660991669, + "learning_rate": 4.1887243906086436e-05, + "loss": 0.0121, + "step": 54300 + }, + { + "epoch": 16.27, + "grad_norm": 4.265335559844971, + "learning_rate": 4.1872289516973236e-05, + "loss": 0.0193, + "step": 54400 + }, + { + "epoch": 16.3, + "grad_norm": 0.05550719425082207, + "learning_rate": 4.185733512786003e-05, + "loss": 0.0191, + "step": 54500 + }, + { + "epoch": 16.33, + "grad_norm": 1.2244312763214111, + "learning_rate": 4.184238073874682e-05, + "loss": 0.0144, + "step": 54600 + }, + { + "epoch": 16.36, + "grad_norm": 0.11609119921922684, + "learning_rate": 4.182742634963362e-05, + "loss": 0.0195, + "step": 54700 + }, + { + "epoch": 16.39, + "grad_norm": 0.7442992329597473, + "learning_rate": 4.1812471960520414e-05, + "loss": 0.0161, + "step": 54800 + }, + { + "epoch": 16.42, + "grad_norm": 1.913397192955017, + "learning_rate": 4.179751757140721e-05, + "loss": 0.017, + "step": 54900 + }, + { + "epoch": 16.45, + "grad_norm": 1.5975757837295532, + "learning_rate": 4.178256318229401e-05, + "loss": 0.0131, + "step": 55000 + }, + { + "epoch": 16.45, + "eval_loss": 0.28440138697624207, + "eval_precision": 0.9323552610821896, + "eval_recall": 0.9098494411773762, + "eval_runtime": 302.3846, + "eval_samples_per_second": 44.228, + "eval_steps_per_second": 1.382, + "step": 55000 + }, + { + "epoch": 16.48, + "grad_norm": 0.02616269886493683, + "learning_rate": 4.17676087931808e-05, + "loss": 0.0166, + "step": 55100 + }, + { + "epoch": 16.51, + "grad_norm": 0.270749032497406, + "learning_rate": 4.17526544040676e-05, + "loss": 0.0167, + "step": 55200 + }, + { + "epoch": 16.54, + "grad_norm": 0.8699542880058289, + "learning_rate": 4.173770001495439e-05, + "loss": 0.0178, + "step": 55300 + }, + { + "epoch": 16.57, + "grad_norm": 0.15558452904224396, + "learning_rate": 4.1722745625841185e-05, + "loss": 0.0155, + "step": 55400 + }, + { + "epoch": 16.6, + "grad_norm": 1.3881036043167114, + "learning_rate": 4.1707791236727985e-05, + "loss": 0.0162, + "step": 55500 + }, + { + "epoch": 16.63, + "grad_norm": 1.0590258836746216, + "learning_rate": 4.169283684761478e-05, + "loss": 0.019, + "step": 55600 + }, + { + "epoch": 16.66, + "grad_norm": 0.6527047157287598, + "learning_rate": 4.167788245850157e-05, + "loss": 0.0162, + "step": 55700 + }, + { + "epoch": 16.69, + "grad_norm": 0.7468928694725037, + "learning_rate": 4.166292806938837e-05, + "loss": 0.0187, + "step": 55800 + }, + { + "epoch": 16.72, + "grad_norm": 1.1580772399902344, + "learning_rate": 4.164797368027516e-05, + "loss": 0.0152, + "step": 55900 + }, + { + "epoch": 16.75, + "grad_norm": 0.27484288811683655, + "learning_rate": 4.1633019291161956e-05, + "loss": 0.018, + "step": 56000 + }, + { + "epoch": 16.75, + "eval_loss": 0.2911526560783386, + "eval_precision": 0.9246059786783004, + "eval_recall": 0.9265987253302134, + "eval_runtime": 304.1503, + "eval_samples_per_second": 43.972, + "eval_steps_per_second": 1.374, + "step": 56000 + }, + { + "epoch": 16.78, + "grad_norm": 0.12976956367492676, + "learning_rate": 4.1618064902048756e-05, + "loss": 0.0185, + "step": 56100 + }, + { + "epoch": 16.81, + "grad_norm": 0.37897953391075134, + "learning_rate": 4.160311051293555e-05, + "loss": 0.0152, + "step": 56200 + }, + { + "epoch": 16.84, + "grad_norm": 0.07681228220462799, + "learning_rate": 4.158815612382234e-05, + "loss": 0.0163, + "step": 56300 + }, + { + "epoch": 16.87, + "grad_norm": 0.5966798663139343, + "learning_rate": 4.157320173470914e-05, + "loss": 0.014, + "step": 56400 + }, + { + "epoch": 16.9, + "grad_norm": 0.29120373725891113, + "learning_rate": 4.1558247345595934e-05, + "loss": 0.018, + "step": 56500 + }, + { + "epoch": 16.93, + "grad_norm": 0.4325448274612427, + "learning_rate": 4.1543292956482734e-05, + "loss": 0.0145, + "step": 56600 + }, + { + "epoch": 16.96, + "grad_norm": 1.473797082901001, + "learning_rate": 4.1528338567369527e-05, + "loss": 0.0164, + "step": 56700 + }, + { + "epoch": 16.99, + "grad_norm": 0.963238537311554, + "learning_rate": 4.151338417825632e-05, + "loss": 0.0168, + "step": 56800 + }, + { + "epoch": 17.02, + "grad_norm": 1.2749171257019043, + "learning_rate": 4.149842978914312e-05, + "loss": 0.0172, + "step": 56900 + }, + { + "epoch": 17.05, + "grad_norm": 0.1201496422290802, + "learning_rate": 4.148347540002991e-05, + "loss": 0.0132, + "step": 57000 + }, + { + "epoch": 17.05, + "eval_loss": 0.2895963788032532, + "eval_precision": 0.9242246747641655, + "eval_recall": 0.9230579759229041, + "eval_runtime": 304.3955, + "eval_samples_per_second": 43.936, + "eval_steps_per_second": 1.373, + "step": 57000 + }, + { + "epoch": 17.08, + "grad_norm": 0.0923817902803421, + "learning_rate": 4.1468521010916705e-05, + "loss": 0.0155, + "step": 57100 + }, + { + "epoch": 17.11, + "grad_norm": 0.17687027156352997, + "learning_rate": 4.1453566621803505e-05, + "loss": 0.0142, + "step": 57200 + }, + { + "epoch": 17.14, + "grad_norm": 0.5095121264457703, + "learning_rate": 4.14386122326903e-05, + "loss": 0.0122, + "step": 57300 + }, + { + "epoch": 17.17, + "grad_norm": 0.14807282388210297, + "learning_rate": 4.142365784357709e-05, + "loss": 0.0122, + "step": 57400 + }, + { + "epoch": 17.19, + "grad_norm": 0.22806455194950104, + "learning_rate": 4.140870345446389e-05, + "loss": 0.0126, + "step": 57500 + }, + { + "epoch": 17.22, + "grad_norm": 0.1654992550611496, + "learning_rate": 4.139374906535068e-05, + "loss": 0.012, + "step": 57600 + }, + { + "epoch": 17.25, + "grad_norm": 1.1821808815002441, + "learning_rate": 4.1378794676237476e-05, + "loss": 0.0154, + "step": 57700 + }, + { + "epoch": 17.28, + "grad_norm": 0.33708083629608154, + "learning_rate": 4.1363840287124275e-05, + "loss": 0.0118, + "step": 57800 + }, + { + "epoch": 17.31, + "grad_norm": 0.2778627276420593, + "learning_rate": 4.134888589801107e-05, + "loss": 0.0153, + "step": 57900 + }, + { + "epoch": 17.34, + "grad_norm": 0.4350825250148773, + "learning_rate": 4.133393150889787e-05, + "loss": 0.0131, + "step": 58000 + }, + { + "epoch": 17.34, + "eval_loss": 0.2985839247703552, + "eval_precision": 0.9294326572576876, + "eval_recall": 0.9185011853813233, + "eval_runtime": 303.6403, + "eval_samples_per_second": 44.046, + "eval_steps_per_second": 1.377, + "step": 58000 + }, + { + "epoch": 17.37, + "grad_norm": 1.0241811275482178, + "learning_rate": 4.131897711978466e-05, + "loss": 0.0152, + "step": 58100 + }, + { + "epoch": 17.4, + "grad_norm": 0.705042839050293, + "learning_rate": 4.1304022730671454e-05, + "loss": 0.0165, + "step": 58200 + }, + { + "epoch": 17.43, + "grad_norm": 0.9130484461784363, + "learning_rate": 4.128906834155825e-05, + "loss": 0.0143, + "step": 58300 + }, + { + "epoch": 17.46, + "grad_norm": 0.0633108988404274, + "learning_rate": 4.127411395244504e-05, + "loss": 0.0147, + "step": 58400 + }, + { + "epoch": 17.49, + "grad_norm": 1.2173391580581665, + "learning_rate": 4.125915956333184e-05, + "loss": 0.0134, + "step": 58500 + }, + { + "epoch": 17.52, + "grad_norm": 2.9922380447387695, + "learning_rate": 4.124420517421864e-05, + "loss": 0.0145, + "step": 58600 + }, + { + "epoch": 17.55, + "grad_norm": 0.015288499183952808, + "learning_rate": 4.1229250785105425e-05, + "loss": 0.0169, + "step": 58700 + }, + { + "epoch": 17.58, + "grad_norm": 1.87058424949646, + "learning_rate": 4.1214296395992225e-05, + "loss": 0.0158, + "step": 58800 + }, + { + "epoch": 17.61, + "grad_norm": 0.31113335490226746, + "learning_rate": 4.1199342006879024e-05, + "loss": 0.0151, + "step": 58900 + }, + { + "epoch": 17.64, + "grad_norm": 0.8044542670249939, + "learning_rate": 4.118438761776582e-05, + "loss": 0.0143, + "step": 59000 + }, + { + "epoch": 17.64, + "eval_loss": 0.2973649501800537, + "eval_precision": 0.9298240060774879, + "eval_recall": 0.9044305551279288, + "eval_runtime": 302.1441, + "eval_samples_per_second": 44.264, + "eval_steps_per_second": 1.383, + "step": 59000 + }, + { + "epoch": 17.67, + "grad_norm": 0.08827254921197891, + "learning_rate": 4.116943322865261e-05, + "loss": 0.0157, + "step": 59100 + }, + { + "epoch": 17.7, + "grad_norm": 1.8845312595367432, + "learning_rate": 4.115447883953941e-05, + "loss": 0.0155, + "step": 59200 + }, + { + "epoch": 17.73, + "grad_norm": 0.49602124094963074, + "learning_rate": 4.11395244504262e-05, + "loss": 0.0162, + "step": 59300 + }, + { + "epoch": 17.76, + "grad_norm": 0.3592805564403534, + "learning_rate": 4.1124570061312995e-05, + "loss": 0.0149, + "step": 59400 + }, + { + "epoch": 17.79, + "grad_norm": 1.320101261138916, + "learning_rate": 4.110961567219979e-05, + "loss": 0.0156, + "step": 59500 + }, + { + "epoch": 17.82, + "grad_norm": 0.4389740526676178, + "learning_rate": 4.109466128308659e-05, + "loss": 0.0151, + "step": 59600 + }, + { + "epoch": 17.85, + "grad_norm": 1.6578569412231445, + "learning_rate": 4.107970689397339e-05, + "loss": 0.0166, + "step": 59700 + }, + { + "epoch": 17.88, + "grad_norm": 1.7992475032806396, + "learning_rate": 4.1064752504860174e-05, + "loss": 0.0148, + "step": 59800 + }, + { + "epoch": 17.91, + "grad_norm": 0.026478100568056107, + "learning_rate": 4.1049798115746973e-05, + "loss": 0.0158, + "step": 59900 + }, + { + "epoch": 17.94, + "grad_norm": 2.8473379611968994, + "learning_rate": 4.103484372663377e-05, + "loss": 0.0159, + "step": 60000 + }, + { + "epoch": 17.94, + "eval_loss": 0.2935677468776703, + "eval_precision": 0.9302795129030222, + "eval_recall": 0.9079097262846763, + "eval_runtime": 302.5843, + "eval_samples_per_second": 44.199, + "eval_steps_per_second": 1.381, + "step": 60000 + }, + { + "epoch": 17.97, + "grad_norm": 2.1734695434570312, + "learning_rate": 4.101988933752056e-05, + "loss": 0.0183, + "step": 60100 + }, + { + "epoch": 18.0, + "grad_norm": 0.14518772065639496, + "learning_rate": 4.100493494840736e-05, + "loss": 0.0172, + "step": 60200 + }, + { + "epoch": 18.03, + "grad_norm": 0.3986850380897522, + "learning_rate": 4.098998055929416e-05, + "loss": 0.0101, + "step": 60300 + }, + { + "epoch": 18.06, + "grad_norm": 1.78749680519104, + "learning_rate": 4.097502617018095e-05, + "loss": 0.0123, + "step": 60400 + }, + { + "epoch": 18.09, + "grad_norm": 0.43207836151123047, + "learning_rate": 4.0960071781067744e-05, + "loss": 0.0132, + "step": 60500 + }, + { + "epoch": 18.12, + "grad_norm": 0.11268942803144455, + "learning_rate": 4.0945117391954544e-05, + "loss": 0.0131, + "step": 60600 + }, + { + "epoch": 18.15, + "grad_norm": 0.5929433107376099, + "learning_rate": 4.093016300284134e-05, + "loss": 0.0118, + "step": 60700 + }, + { + "epoch": 18.18, + "grad_norm": 0.012462102808058262, + "learning_rate": 4.091520861372813e-05, + "loss": 0.0114, + "step": 60800 + }, + { + "epoch": 18.21, + "grad_norm": 0.03992025554180145, + "learning_rate": 4.090025422461492e-05, + "loss": 0.0123, + "step": 60900 + }, + { + "epoch": 18.24, + "grad_norm": 0.2556318938732147, + "learning_rate": 4.088529983550172e-05, + "loss": 0.0163, + "step": 61000 + }, + { + "epoch": 18.24, + "eval_loss": 0.3005661070346832, + "eval_precision": 0.930046845034112, + "eval_recall": 0.9108039040610856, + "eval_runtime": 303.0262, + "eval_samples_per_second": 44.135, + "eval_steps_per_second": 1.379, + "step": 61000 + }, + { + "epoch": 18.27, + "grad_norm": 0.0933234691619873, + "learning_rate": 4.087034544638852e-05, + "loss": 0.0139, + "step": 61100 + }, + { + "epoch": 18.3, + "grad_norm": 4.561667442321777, + "learning_rate": 4.085539105727531e-05, + "loss": 0.015, + "step": 61200 + }, + { + "epoch": 18.33, + "grad_norm": 1.8393715620040894, + "learning_rate": 4.084043666816211e-05, + "loss": 0.0113, + "step": 61300 + }, + { + "epoch": 18.36, + "grad_norm": 0.5815320611000061, + "learning_rate": 4.082548227904891e-05, + "loss": 0.0158, + "step": 61400 + }, + { + "epoch": 18.39, + "grad_norm": 0.9265565872192383, + "learning_rate": 4.0810527889935693e-05, + "loss": 0.0165, + "step": 61500 + }, + { + "epoch": 18.42, + "grad_norm": 0.029577825218439102, + "learning_rate": 4.079557350082249e-05, + "loss": 0.0151, + "step": 61600 + }, + { + "epoch": 18.45, + "grad_norm": 0.13609355688095093, + "learning_rate": 4.078061911170929e-05, + "loss": 0.0147, + "step": 61700 + }, + { + "epoch": 18.48, + "grad_norm": 0.2505282461643219, + "learning_rate": 4.0765664722596086e-05, + "loss": 0.0117, + "step": 61800 + }, + { + "epoch": 18.51, + "grad_norm": 0.49616509675979614, + "learning_rate": 4.075071033348288e-05, + "loss": 0.0136, + "step": 61900 + }, + { + "epoch": 18.54, + "grad_norm": 1.4143670797348022, + "learning_rate": 4.073575594436967e-05, + "loss": 0.0199, + "step": 62000 + }, + { + "epoch": 18.54, + "eval_loss": 0.28239989280700684, + "eval_precision": 0.9322552865754473, + "eval_recall": 0.89993534283691, + "eval_runtime": 303.1737, + "eval_samples_per_second": 44.113, + "eval_steps_per_second": 1.379, + "step": 62000 + }, + { + "epoch": 18.57, + "grad_norm": 2.5461013317108154, + "learning_rate": 4.072080155525647e-05, + "loss": 0.0122, + "step": 62100 + }, + { + "epoch": 18.6, + "grad_norm": 0.3786807358264923, + "learning_rate": 4.0705847166143264e-05, + "loss": 0.0122, + "step": 62200 + }, + { + "epoch": 18.63, + "grad_norm": 1.546884536743164, + "learning_rate": 4.069089277703006e-05, + "loss": 0.0133, + "step": 62300 + }, + { + "epoch": 18.66, + "grad_norm": 0.04791215434670448, + "learning_rate": 4.0675938387916856e-05, + "loss": 0.0118, + "step": 62400 + }, + { + "epoch": 18.69, + "grad_norm": 0.12534143030643463, + "learning_rate": 4.0660983998803656e-05, + "loss": 0.0145, + "step": 62500 + }, + { + "epoch": 18.72, + "grad_norm": 1.358917474746704, + "learning_rate": 4.064602960969044e-05, + "loss": 0.0152, + "step": 62600 + }, + { + "epoch": 18.75, + "grad_norm": 0.10757000744342804, + "learning_rate": 4.063107522057724e-05, + "loss": 0.0155, + "step": 62700 + }, + { + "epoch": 18.78, + "grad_norm": 2.365614652633667, + "learning_rate": 4.061612083146404e-05, + "loss": 0.0156, + "step": 62800 + }, + { + "epoch": 18.81, + "grad_norm": 0.4936872124671936, + "learning_rate": 4.060116644235083e-05, + "loss": 0.0132, + "step": 62900 + }, + { + "epoch": 18.84, + "grad_norm": 0.022019200026988983, + "learning_rate": 4.058621205323763e-05, + "loss": 0.0124, + "step": 63000 + }, + { + "epoch": 18.84, + "eval_loss": 0.30277740955352783, + "eval_precision": 0.930499515185637, + "eval_recall": 0.9159456879830044, + "eval_runtime": 304.0566, + "eval_samples_per_second": 43.985, + "eval_steps_per_second": 1.375, + "step": 63000 + }, + { + "epoch": 18.87, + "grad_norm": 0.3624964654445648, + "learning_rate": 4.057125766412442e-05, + "loss": 0.0155, + "step": 63100 + }, + { + "epoch": 18.9, + "grad_norm": 1.7629303932189941, + "learning_rate": 4.055630327501122e-05, + "loss": 0.0139, + "step": 63200 + }, + { + "epoch": 18.93, + "grad_norm": 0.18042436242103577, + "learning_rate": 4.054134888589801e-05, + "loss": 0.0179, + "step": 63300 + }, + { + "epoch": 18.96, + "grad_norm": 0.20951129496097565, + "learning_rate": 4.0526394496784806e-05, + "loss": 0.0172, + "step": 63400 + }, + { + "epoch": 18.99, + "grad_norm": 0.8891457915306091, + "learning_rate": 4.0511440107671605e-05, + "loss": 0.0126, + "step": 63500 + }, + { + "epoch": 19.02, + "grad_norm": 0.22427305579185486, + "learning_rate": 4.04964857185584e-05, + "loss": 0.0112, + "step": 63600 + }, + { + "epoch": 19.05, + "grad_norm": 0.25893327593803406, + "learning_rate": 4.048153132944519e-05, + "loss": 0.0123, + "step": 63700 + }, + { + "epoch": 19.08, + "grad_norm": 1.579196810722351, + "learning_rate": 4.046657694033199e-05, + "loss": 0.0117, + "step": 63800 + }, + { + "epoch": 19.11, + "grad_norm": 1.801465630531311, + "learning_rate": 4.045162255121879e-05, + "loss": 0.0113, + "step": 63900 + }, + { + "epoch": 19.14, + "grad_norm": 3.969907522201538, + "learning_rate": 4.0436668162105577e-05, + "loss": 0.0132, + "step": 64000 + }, + { + "epoch": 19.14, + "eval_loss": 0.3150152266025543, + "eval_precision": 0.9289555972482801, + "eval_recall": 0.9146833338464854, + "eval_runtime": 304.0309, + "eval_samples_per_second": 43.989, + "eval_steps_per_second": 1.375, + "step": 64000 + }, + { + "epoch": 19.17, + "grad_norm": 1.5782831907272339, + "learning_rate": 4.0421713772992376e-05, + "loss": 0.0106, + "step": 64100 + }, + { + "epoch": 19.2, + "grad_norm": 1.0305448770523071, + "learning_rate": 4.0406759383879176e-05, + "loss": 0.0115, + "step": 64200 + }, + { + "epoch": 19.23, + "grad_norm": 0.8879725337028503, + "learning_rate": 4.039180499476596e-05, + "loss": 0.0108, + "step": 64300 + }, + { + "epoch": 19.26, + "grad_norm": 1.0525989532470703, + "learning_rate": 4.037685060565276e-05, + "loss": 0.0113, + "step": 64400 + }, + { + "epoch": 19.29, + "grad_norm": 0.19859521090984344, + "learning_rate": 4.0361896216539554e-05, + "loss": 0.011, + "step": 64500 + }, + { + "epoch": 19.32, + "grad_norm": 1.628808856010437, + "learning_rate": 4.034694182742635e-05, + "loss": 0.0126, + "step": 64600 + }, + { + "epoch": 19.35, + "grad_norm": 0.45845118165016174, + "learning_rate": 4.033198743831315e-05, + "loss": 0.0117, + "step": 64700 + }, + { + "epoch": 19.38, + "grad_norm": 0.02105000615119934, + "learning_rate": 4.031703304919994e-05, + "loss": 0.0103, + "step": 64800 + }, + { + "epoch": 19.41, + "grad_norm": 1.2173235416412354, + "learning_rate": 4.030207866008674e-05, + "loss": 0.013, + "step": 64900 + }, + { + "epoch": 19.44, + "grad_norm": 1.0716986656188965, + "learning_rate": 4.028712427097353e-05, + "loss": 0.0136, + "step": 65000 + }, + { + "epoch": 19.44, + "eval_loss": 0.30169057846069336, + "eval_precision": 0.9307780320366132, + "eval_recall": 0.9016903229779242, + "eval_runtime": 303.9363, + "eval_samples_per_second": 44.003, + "eval_steps_per_second": 1.375, + "step": 65000 + }, + { + "epoch": 19.47, + "grad_norm": 0.060731422156095505, + "learning_rate": 4.0272169881860325e-05, + "loss": 0.0103, + "step": 65100 + }, + { + "epoch": 19.5, + "grad_norm": 1.8369615077972412, + "learning_rate": 4.0257215492747125e-05, + "loss": 0.0149, + "step": 65200 + }, + { + "epoch": 19.53, + "grad_norm": 0.5922613143920898, + "learning_rate": 4.024226110363392e-05, + "loss": 0.0137, + "step": 65300 + }, + { + "epoch": 19.56, + "grad_norm": 1.1230493783950806, + "learning_rate": 4.022730671452071e-05, + "loss": 0.016, + "step": 65400 + }, + { + "epoch": 19.59, + "grad_norm": 0.9484757781028748, + "learning_rate": 4.021235232540751e-05, + "loss": 0.0126, + "step": 65500 + }, + { + "epoch": 19.62, + "grad_norm": 0.40328437089920044, + "learning_rate": 4.01973979362943e-05, + "loss": 0.014, + "step": 65600 + }, + { + "epoch": 19.65, + "grad_norm": 1.251897931098938, + "learning_rate": 4.0182443547181096e-05, + "loss": 0.0152, + "step": 65700 + }, + { + "epoch": 19.68, + "grad_norm": 0.06640147417783737, + "learning_rate": 4.0167489158067896e-05, + "loss": 0.0119, + "step": 65800 + }, + { + "epoch": 19.71, + "grad_norm": 0.08419325947761536, + "learning_rate": 4.015253476895469e-05, + "loss": 0.0104, + "step": 65900 + }, + { + "epoch": 19.74, + "grad_norm": 0.8898499011993408, + "learning_rate": 4.013758037984148e-05, + "loss": 0.013, + "step": 66000 + }, + { + "epoch": 19.74, + "eval_loss": 0.30586904287338257, + "eval_precision": 0.9286385564814235, + "eval_recall": 0.9127128298285045, + "eval_runtime": 303.8354, + "eval_samples_per_second": 44.017, + "eval_steps_per_second": 1.376, + "step": 66000 + }, + { + "epoch": 19.77, + "grad_norm": 0.8399672508239746, + "learning_rate": 4.012262599072828e-05, + "loss": 0.0156, + "step": 66100 + }, + { + "epoch": 19.8, + "grad_norm": 1.188772201538086, + "learning_rate": 4.0107671601615074e-05, + "loss": 0.0133, + "step": 66200 + }, + { + "epoch": 19.83, + "grad_norm": 0.3390734791755676, + "learning_rate": 4.0092717212501874e-05, + "loss": 0.011, + "step": 66300 + }, + { + "epoch": 19.86, + "grad_norm": 2.0773940086364746, + "learning_rate": 4.007776282338867e-05, + "loss": 0.0109, + "step": 66400 + }, + { + "epoch": 19.89, + "grad_norm": 1.667506456375122, + "learning_rate": 4.006280843427546e-05, + "loss": 0.0121, + "step": 66500 + }, + { + "epoch": 19.92, + "grad_norm": 0.036488935351371765, + "learning_rate": 4.004785404516226e-05, + "loss": 0.0121, + "step": 66600 + }, + { + "epoch": 19.95, + "grad_norm": 0.9762794375419617, + "learning_rate": 4.003289965604905e-05, + "loss": 0.0138, + "step": 66700 + }, + { + "epoch": 19.98, + "grad_norm": 1.04608952999115, + "learning_rate": 4.0017945266935845e-05, + "loss": 0.0117, + "step": 66800 + }, + { + "epoch": 20.01, + "grad_norm": 5.332238674163818, + "learning_rate": 4.0002990877822645e-05, + "loss": 0.0137, + "step": 66900 + }, + { + "epoch": 20.04, + "grad_norm": 0.01725686341524124, + "learning_rate": 3.998803648870944e-05, + "loss": 0.0131, + "step": 67000 + }, + { + "epoch": 20.04, + "eval_loss": 0.2912316620349884, + "eval_precision": 0.9311961240797836, + "eval_recall": 0.9113273191908617, + "eval_runtime": 303.1004, + "eval_samples_per_second": 44.124, + "eval_steps_per_second": 1.379, + "step": 67000 + }, + { + "epoch": 20.07, + "grad_norm": 0.0427197702229023, + "learning_rate": 3.997308209959623e-05, + "loss": 0.0077, + "step": 67100 + }, + { + "epoch": 20.1, + "grad_norm": 0.017879147082567215, + "learning_rate": 3.995812771048303e-05, + "loss": 0.0104, + "step": 67200 + }, + { + "epoch": 20.13, + "grad_norm": 0.07891906797885895, + "learning_rate": 3.994317332136982e-05, + "loss": 0.0141, + "step": 67300 + }, + { + "epoch": 20.16, + "grad_norm": 0.16812817752361298, + "learning_rate": 3.9928218932256616e-05, + "loss": 0.0097, + "step": 67400 + }, + { + "epoch": 20.19, + "grad_norm": 3.0790505409240723, + "learning_rate": 3.9913264543143416e-05, + "loss": 0.0106, + "step": 67500 + }, + { + "epoch": 20.22, + "grad_norm": 0.41399437189102173, + "learning_rate": 3.989831015403021e-05, + "loss": 0.0089, + "step": 67600 + }, + { + "epoch": 20.25, + "grad_norm": 0.4379628300666809, + "learning_rate": 3.988335576491701e-05, + "loss": 0.0086, + "step": 67700 + }, + { + "epoch": 20.28, + "grad_norm": 0.011956513859331608, + "learning_rate": 3.98684013758038e-05, + "loss": 0.0133, + "step": 67800 + }, + { + "epoch": 20.31, + "grad_norm": 2.477144718170166, + "learning_rate": 3.9853446986690594e-05, + "loss": 0.0091, + "step": 67900 + }, + { + "epoch": 20.33, + "grad_norm": 2.790292739868164, + "learning_rate": 3.9838492597577394e-05, + "loss": 0.0128, + "step": 68000 + }, + { + "epoch": 20.33, + "eval_loss": 0.3076106309890747, + "eval_precision": 0.9304780813715294, + "eval_recall": 0.9090489239200714, + "eval_runtime": 303.9942, + "eval_samples_per_second": 43.994, + "eval_steps_per_second": 1.375, + "step": 68000 + }, + { + "epoch": 20.36, + "grad_norm": 1.441587209701538, + "learning_rate": 3.9823538208464186e-05, + "loss": 0.0159, + "step": 68100 + }, + { + "epoch": 20.39, + "grad_norm": 1.7005335092544556, + "learning_rate": 3.980858381935098e-05, + "loss": 0.01, + "step": 68200 + }, + { + "epoch": 20.42, + "grad_norm": 0.30774638056755066, + "learning_rate": 3.979362943023778e-05, + "loss": 0.0124, + "step": 68300 + }, + { + "epoch": 20.45, + "grad_norm": 0.04803008586168289, + "learning_rate": 3.977867504112457e-05, + "loss": 0.0112, + "step": 68400 + }, + { + "epoch": 20.48, + "grad_norm": 3.551407814025879, + "learning_rate": 3.9763720652011365e-05, + "loss": 0.012, + "step": 68500 + }, + { + "epoch": 20.51, + "grad_norm": 0.037427909672260284, + "learning_rate": 3.9748766262898164e-05, + "loss": 0.0138, + "step": 68600 + }, + { + "epoch": 20.54, + "grad_norm": 0.0066105336882174015, + "learning_rate": 3.973381187378496e-05, + "loss": 0.0114, + "step": 68700 + }, + { + "epoch": 20.57, + "grad_norm": 0.05352969095110893, + "learning_rate": 3.971885748467175e-05, + "loss": 0.0106, + "step": 68800 + }, + { + "epoch": 20.6, + "grad_norm": 1.097419023513794, + "learning_rate": 3.970390309555855e-05, + "loss": 0.0113, + "step": 68900 + }, + { + "epoch": 20.63, + "grad_norm": 2.4684622287750244, + "learning_rate": 3.968894870644534e-05, + "loss": 0.0104, + "step": 69000 + }, + { + "epoch": 20.63, + "eval_loss": 0.3140137493610382, + "eval_precision": 0.9268018018018018, + "eval_recall": 0.9122202038240094, + "eval_runtime": 304.685, + "eval_samples_per_second": 43.895, + "eval_steps_per_second": 1.372, + "step": 69000 + }, + { + "epoch": 20.66, + "grad_norm": 0.03651382029056549, + "learning_rate": 3.967399431733214e-05, + "loss": 0.0086, + "step": 69100 + }, + { + "epoch": 20.69, + "grad_norm": 0.35381224751472473, + "learning_rate": 3.9659039928218935e-05, + "loss": 0.013, + "step": 69200 + }, + { + "epoch": 20.72, + "grad_norm": 0.06933160871267319, + "learning_rate": 3.964408553910573e-05, + "loss": 0.0106, + "step": 69300 + }, + { + "epoch": 20.75, + "grad_norm": 0.4022979140281677, + "learning_rate": 3.962913114999253e-05, + "loss": 0.013, + "step": 69400 + }, + { + "epoch": 20.78, + "grad_norm": 0.03529789671301842, + "learning_rate": 3.961417676087932e-05, + "loss": 0.0156, + "step": 69500 + }, + { + "epoch": 20.81, + "grad_norm": 0.7010594606399536, + "learning_rate": 3.9599222371766114e-05, + "loss": 0.0144, + "step": 69600 + }, + { + "epoch": 20.84, + "grad_norm": 0.37523359060287476, + "learning_rate": 3.958426798265291e-05, + "loss": 0.0127, + "step": 69700 + }, + { + "epoch": 20.87, + "grad_norm": 0.1500304788351059, + "learning_rate": 3.9569313593539706e-05, + "loss": 0.0151, + "step": 69800 + }, + { + "epoch": 20.9, + "grad_norm": 1.1849136352539062, + "learning_rate": 3.95543592044265e-05, + "loss": 0.0092, + "step": 69900 + }, + { + "epoch": 20.93, + "grad_norm": 0.37061455845832825, + "learning_rate": 3.95394048153133e-05, + "loss": 0.0125, + "step": 70000 + }, + { + "epoch": 20.93, + "eval_loss": 0.2996491491794586, + "eval_precision": 0.9277798530693563, + "eval_recall": 0.9176390898734567, + "eval_runtime": 305.225, + "eval_samples_per_second": 43.817, + "eval_steps_per_second": 1.369, + "step": 70000 + }, + { + "epoch": 20.96, + "grad_norm": 1.1082910299301147, + "learning_rate": 3.952445042620009e-05, + "loss": 0.0135, + "step": 70100 + }, + { + "epoch": 20.99, + "grad_norm": 0.21670883893966675, + "learning_rate": 3.9509496037086884e-05, + "loss": 0.0147, + "step": 70200 + }, + { + "epoch": 21.02, + "grad_norm": 1.7163949012756348, + "learning_rate": 3.9494541647973684e-05, + "loss": 0.0074, + "step": 70300 + }, + { + "epoch": 21.05, + "grad_norm": 0.49197930097579956, + "learning_rate": 3.947958725886048e-05, + "loss": 0.009, + "step": 70400 + }, + { + "epoch": 21.08, + "grad_norm": 0.20454080402851105, + "learning_rate": 3.946463286974727e-05, + "loss": 0.0106, + "step": 70500 + }, + { + "epoch": 21.11, + "grad_norm": 1.1480427980422974, + "learning_rate": 3.944967848063407e-05, + "loss": 0.0082, + "step": 70600 + }, + { + "epoch": 21.14, + "grad_norm": 0.012445613741874695, + "learning_rate": 3.943472409152086e-05, + "loss": 0.0124, + "step": 70700 + }, + { + "epoch": 21.17, + "grad_norm": 1.2859218120574951, + "learning_rate": 3.941976970240766e-05, + "loss": 0.0114, + "step": 70800 + }, + { + "epoch": 21.2, + "grad_norm": 1.9639800786972046, + "learning_rate": 3.9404815313294455e-05, + "loss": 0.0094, + "step": 70900 + }, + { + "epoch": 21.23, + "grad_norm": 0.5322540402412415, + "learning_rate": 3.938986092418125e-05, + "loss": 0.0127, + "step": 71000 + }, + { + "epoch": 21.23, + "eval_loss": 0.31439679861068726, + "eval_precision": 0.9300875853255618, + "eval_recall": 0.918747498383571, + "eval_runtime": 305.1026, + "eval_samples_per_second": 43.834, + "eval_steps_per_second": 1.37, + "step": 71000 + }, + { + "epoch": 21.26, + "grad_norm": 0.7698822021484375, + "learning_rate": 3.937490653506805e-05, + "loss": 0.0091, + "step": 71100 + }, + { + "epoch": 21.29, + "grad_norm": 0.058869846165180206, + "learning_rate": 3.935995214595484e-05, + "loss": 0.0116, + "step": 71200 + }, + { + "epoch": 21.32, + "grad_norm": 0.040317438542842865, + "learning_rate": 3.934499775684163e-05, + "loss": 0.0082, + "step": 71300 + }, + { + "epoch": 21.35, + "grad_norm": 0.3180629014968872, + "learning_rate": 3.933004336772843e-05, + "loss": 0.0086, + "step": 71400 + }, + { + "epoch": 21.38, + "grad_norm": 0.14002850651741028, + "learning_rate": 3.9315088978615226e-05, + "loss": 0.0083, + "step": 71500 + }, + { + "epoch": 21.41, + "grad_norm": 0.535882830619812, + "learning_rate": 3.930013458950202e-05, + "loss": 0.0083, + "step": 71600 + }, + { + "epoch": 21.44, + "grad_norm": 0.8898109793663025, + "learning_rate": 3.928518020038882e-05, + "loss": 0.0111, + "step": 71700 + }, + { + "epoch": 21.47, + "grad_norm": 7.178394317626953, + "learning_rate": 3.927022581127561e-05, + "loss": 0.0111, + "step": 71800 + }, + { + "epoch": 21.5, + "grad_norm": 0.03290112316608429, + "learning_rate": 3.9255271422162404e-05, + "loss": 0.0102, + "step": 71900 + }, + { + "epoch": 21.53, + "grad_norm": 0.013704554177820683, + "learning_rate": 3.9240317033049204e-05, + "loss": 0.0131, + "step": 72000 + }, + { + "epoch": 21.53, + "eval_loss": 0.30643701553344727, + "eval_precision": 0.9271496444430644, + "eval_recall": 0.9192709135133471, + "eval_runtime": 304.1697, + "eval_samples_per_second": 43.969, + "eval_steps_per_second": 1.374, + "step": 72000 + }, + { + "epoch": 21.56, + "grad_norm": 0.8118484020233154, + "learning_rate": 3.9225362643936e-05, + "loss": 0.0109, + "step": 72100 + }, + { + "epoch": 21.59, + "grad_norm": 0.8789449334144592, + "learning_rate": 3.9210408254822796e-05, + "loss": 0.0111, + "step": 72200 + }, + { + "epoch": 21.62, + "grad_norm": 1.8666021823883057, + "learning_rate": 3.919545386570959e-05, + "loss": 0.0112, + "step": 72300 + }, + { + "epoch": 21.65, + "grad_norm": 0.33622369170188904, + "learning_rate": 3.918049947659638e-05, + "loss": 0.0121, + "step": 72400 + }, + { + "epoch": 21.68, + "grad_norm": 1.5097126960754395, + "learning_rate": 3.916554508748318e-05, + "loss": 0.0104, + "step": 72500 + }, + { + "epoch": 21.71, + "grad_norm": 1.3149192333221436, + "learning_rate": 3.915059069836997e-05, + "loss": 0.01, + "step": 72600 + }, + { + "epoch": 21.74, + "grad_norm": 1.1172950267791748, + "learning_rate": 3.913563630925677e-05, + "loss": 0.0159, + "step": 72700 + }, + { + "epoch": 21.77, + "grad_norm": 0.7861026525497437, + "learning_rate": 3.912068192014357e-05, + "loss": 0.0102, + "step": 72800 + }, + { + "epoch": 21.8, + "grad_norm": 0.9385488033294678, + "learning_rate": 3.910572753103036e-05, + "loss": 0.0103, + "step": 72900 + }, + { + "epoch": 21.83, + "grad_norm": 0.2858407199382782, + "learning_rate": 3.909077314191715e-05, + "loss": 0.0095, + "step": 73000 + }, + { + "epoch": 21.83, + "eval_loss": 0.3220088481903076, + "eval_precision": 0.9313063063063063, + "eval_recall": 0.89119123125712, + "eval_runtime": 301.1978, + "eval_samples_per_second": 44.403, + "eval_steps_per_second": 1.388, + "step": 73000 + }, + { + "epoch": 21.86, + "grad_norm": 2.1585566997528076, + "learning_rate": 3.907581875280395e-05, + "loss": 0.0107, + "step": 73100 + }, + { + "epoch": 21.89, + "grad_norm": 0.21467708051204681, + "learning_rate": 3.9060864363690745e-05, + "loss": 0.0092, + "step": 73200 + }, + { + "epoch": 21.92, + "grad_norm": 0.0250945333391428, + "learning_rate": 3.904590997457754e-05, + "loss": 0.0095, + "step": 73300 + }, + { + "epoch": 21.95, + "grad_norm": 0.08200676739215851, + "learning_rate": 3.903095558546434e-05, + "loss": 0.0127, + "step": 73400 + }, + { + "epoch": 21.98, + "grad_norm": 7.951723098754883, + "learning_rate": 3.901600119635113e-05, + "loss": 0.0118, + "step": 73500 + }, + { + "epoch": 22.01, + "grad_norm": 0.042703770101070404, + "learning_rate": 3.900104680723793e-05, + "loss": 0.0086, + "step": 73600 + }, + { + "epoch": 22.04, + "grad_norm": 0.13317295908927917, + "learning_rate": 3.898609241812472e-05, + "loss": 0.0117, + "step": 73700 + }, + { + "epoch": 22.07, + "grad_norm": 0.09529834240674973, + "learning_rate": 3.8971138029011516e-05, + "loss": 0.0077, + "step": 73800 + }, + { + "epoch": 22.1, + "grad_norm": 1.2312837839126587, + "learning_rate": 3.8956183639898316e-05, + "loss": 0.01, + "step": 73900 + }, + { + "epoch": 22.13, + "grad_norm": 0.20264630019664764, + "learning_rate": 3.89412292507851e-05, + "loss": 0.0079, + "step": 74000 + }, + { + "epoch": 22.13, + "eval_loss": 0.3207722306251526, + "eval_precision": 0.9257851445663011, + "eval_recall": 0.9148680685981712, + "eval_runtime": 304.4363, + "eval_samples_per_second": 43.93, + "eval_steps_per_second": 1.373, + "step": 74000 + }, + { + "epoch": 22.16, + "grad_norm": 0.007298531476408243, + "learning_rate": 3.89262748616719e-05, + "loss": 0.0083, + "step": 74100 + }, + { + "epoch": 22.19, + "grad_norm": 0.030803361907601357, + "learning_rate": 3.89113204725587e-05, + "loss": 0.0128, + "step": 74200 + }, + { + "epoch": 22.22, + "grad_norm": 0.04404568299651146, + "learning_rate": 3.8896366083445494e-05, + "loss": 0.0094, + "step": 74300 + }, + { + "epoch": 22.25, + "grad_norm": 0.14884673058986664, + "learning_rate": 3.888141169433229e-05, + "loss": 0.0081, + "step": 74400 + }, + { + "epoch": 22.28, + "grad_norm": 0.07467024773359299, + "learning_rate": 3.886645730521909e-05, + "loss": 0.0144, + "step": 74500 + }, + { + "epoch": 22.31, + "grad_norm": 0.6713554859161377, + "learning_rate": 3.885150291610588e-05, + "loss": 0.0136, + "step": 74600 + }, + { + "epoch": 22.34, + "grad_norm": 0.16354040801525116, + "learning_rate": 3.883654852699267e-05, + "loss": 0.0109, + "step": 74700 + }, + { + "epoch": 22.37, + "grad_norm": 1.4964691400527954, + "learning_rate": 3.882159413787947e-05, + "loss": 0.0116, + "step": 74800 + }, + { + "epoch": 22.4, + "grad_norm": 1.4973292350769043, + "learning_rate": 3.8806639748766265e-05, + "loss": 0.008, + "step": 74900 + }, + { + "epoch": 22.43, + "grad_norm": 0.17059992253780365, + "learning_rate": 3.8791685359653065e-05, + "loss": 0.0111, + "step": 75000 + }, + { + "epoch": 22.43, + "eval_loss": 0.30246666073799133, + "eval_precision": 0.9313384217417686, + "eval_recall": 0.8979032605683672, + "eval_runtime": 301.8023, + "eval_samples_per_second": 44.314, + "eval_steps_per_second": 1.385, + "step": 75000 + }, + { + "epoch": 22.46, + "grad_norm": 0.05614122748374939, + "learning_rate": 3.877673097053985e-05, + "loss": 0.0101, + "step": 75100 + }, + { + "epoch": 22.49, + "grad_norm": 0.23737676441669464, + "learning_rate": 3.876177658142665e-05, + "loss": 0.0111, + "step": 75200 + }, + { + "epoch": 22.52, + "grad_norm": 0.11609382182359695, + "learning_rate": 3.874682219231345e-05, + "loss": 0.0129, + "step": 75300 + }, + { + "epoch": 22.55, + "grad_norm": 0.006964783184230328, + "learning_rate": 3.8731867803200236e-05, + "loss": 0.014, + "step": 75400 + }, + { + "epoch": 22.58, + "grad_norm": 0.6018117070198059, + "learning_rate": 3.8716913414087036e-05, + "loss": 0.0092, + "step": 75500 + }, + { + "epoch": 22.61, + "grad_norm": 1.5463790893554688, + "learning_rate": 3.8701959024973836e-05, + "loss": 0.0129, + "step": 75600 + }, + { + "epoch": 22.64, + "grad_norm": 0.3491170108318329, + "learning_rate": 3.868700463586062e-05, + "loss": 0.0124, + "step": 75700 + }, + { + "epoch": 22.67, + "grad_norm": 0.3379780650138855, + "learning_rate": 3.867205024674742e-05, + "loss": 0.0105, + "step": 75800 + }, + { + "epoch": 22.7, + "grad_norm": 0.6625536680221558, + "learning_rate": 3.865709585763422e-05, + "loss": 0.0101, + "step": 75900 + }, + { + "epoch": 22.73, + "grad_norm": 0.5047014951705933, + "learning_rate": 3.8642141468521014e-05, + "loss": 0.0116, + "step": 76000 + }, + { + "epoch": 22.73, + "eval_loss": 0.309579074382782, + "eval_precision": 0.9289195145420119, + "eval_recall": 0.9214261522830136, + "eval_runtime": 306.5207, + "eval_samples_per_second": 43.632, + "eval_steps_per_second": 1.364, + "step": 76000 + }, + { + "epoch": 22.76, + "grad_norm": 2.8879668712615967, + "learning_rate": 3.862718707940781e-05, + "loss": 0.0084, + "step": 76100 + }, + { + "epoch": 22.79, + "grad_norm": 1.4628148078918457, + "learning_rate": 3.86122326902946e-05, + "loss": 0.0091, + "step": 76200 + }, + { + "epoch": 22.82, + "grad_norm": 0.01455759722739458, + "learning_rate": 3.85972783011814e-05, + "loss": 0.0087, + "step": 76300 + }, + { + "epoch": 22.85, + "grad_norm": 0.005665886681526899, + "learning_rate": 3.858232391206819e-05, + "loss": 0.0117, + "step": 76400 + }, + { + "epoch": 22.88, + "grad_norm": 0.5273276567459106, + "learning_rate": 3.8567369522954985e-05, + "loss": 0.009, + "step": 76500 + }, + { + "epoch": 22.91, + "grad_norm": 0.06718481332063675, + "learning_rate": 3.8552415133841785e-05, + "loss": 0.0118, + "step": 76600 + }, + { + "epoch": 22.94, + "grad_norm": 0.30258700251579285, + "learning_rate": 3.8537460744728585e-05, + "loss": 0.0109, + "step": 76700 + }, + { + "epoch": 22.97, + "grad_norm": 2.678166627883911, + "learning_rate": 3.852250635561537e-05, + "loss": 0.015, + "step": 76800 + }, + { + "epoch": 23.0, + "grad_norm": 0.15017007291316986, + "learning_rate": 3.850755196650217e-05, + "loss": 0.0104, + "step": 76900 + }, + { + "epoch": 23.03, + "grad_norm": 0.3501853048801422, + "learning_rate": 3.849259757738897e-05, + "loss": 0.0096, + "step": 77000 + }, + { + "epoch": 23.03, + "eval_loss": 0.2935163080692291, + "eval_precision": 0.9276991482965932, + "eval_recall": 0.9121894146987284, + "eval_runtime": 303.8246, + "eval_samples_per_second": 44.019, + "eval_steps_per_second": 1.376, + "step": 77000 + }, + { + "epoch": 23.06, + "grad_norm": 0.729576587677002, + "learning_rate": 3.8477643188275756e-05, + "loss": 0.0076, + "step": 77100 + }, + { + "epoch": 23.09, + "grad_norm": 0.03431198373436928, + "learning_rate": 3.8462688799162556e-05, + "loss": 0.0068, + "step": 77200 + }, + { + "epoch": 23.12, + "grad_norm": 0.022281186655163765, + "learning_rate": 3.844773441004935e-05, + "loss": 0.0099, + "step": 77300 + }, + { + "epoch": 23.15, + "grad_norm": 0.06289653480052948, + "learning_rate": 3.843278002093615e-05, + "loss": 0.0088, + "step": 77400 + }, + { + "epoch": 23.18, + "grad_norm": 1.1686757802963257, + "learning_rate": 3.841782563182294e-05, + "loss": 0.0113, + "step": 77500 + }, + { + "epoch": 23.21, + "grad_norm": 0.6460024118423462, + "learning_rate": 3.8402871242709734e-05, + "loss": 0.0098, + "step": 77600 + }, + { + "epoch": 23.24, + "grad_norm": 0.04333605244755745, + "learning_rate": 3.8387916853596534e-05, + "loss": 0.0078, + "step": 77700 + }, + { + "epoch": 23.27, + "grad_norm": 1.6560355424880981, + "learning_rate": 3.8372962464483327e-05, + "loss": 0.0069, + "step": 77800 + }, + { + "epoch": 23.3, + "grad_norm": 1.7110439538955688, + "learning_rate": 3.835800807537012e-05, + "loss": 0.0079, + "step": 77900 + }, + { + "epoch": 23.33, + "grad_norm": 0.34755662083625793, + "learning_rate": 3.834305368625692e-05, + "loss": 0.0117, + "step": 78000 + }, + { + "epoch": 23.33, + "eval_loss": 0.31362003087997437, + "eval_precision": 0.9317794739166089, + "eval_recall": 0.9096031281751286, + "eval_runtime": 302.9137, + "eval_samples_per_second": 44.151, + "eval_steps_per_second": 1.38, + "step": 78000 + }, + { + "epoch": 23.36, + "grad_norm": 0.07322967052459717, + "learning_rate": 3.832809929714372e-05, + "loss": 0.0086, + "step": 78100 + }, + { + "epoch": 23.39, + "grad_norm": 0.1620834916830063, + "learning_rate": 3.8313144908030505e-05, + "loss": 0.0105, + "step": 78200 + }, + { + "epoch": 23.42, + "grad_norm": 1.0541850328445435, + "learning_rate": 3.8298190518917305e-05, + "loss": 0.011, + "step": 78300 + }, + { + "epoch": 23.44, + "grad_norm": 0.008509721606969833, + "learning_rate": 3.8283236129804104e-05, + "loss": 0.009, + "step": 78400 + }, + { + "epoch": 23.47, + "grad_norm": 0.2723921537399292, + "learning_rate": 3.826828174069089e-05, + "loss": 0.0089, + "step": 78500 + }, + { + "epoch": 23.5, + "grad_norm": 0.7700883150100708, + "learning_rate": 3.825332735157769e-05, + "loss": 0.0084, + "step": 78600 + }, + { + "epoch": 23.53, + "grad_norm": 0.7245194911956787, + "learning_rate": 3.823837296246448e-05, + "loss": 0.0068, + "step": 78700 + }, + { + "epoch": 23.56, + "grad_norm": 1.283056378364563, + "learning_rate": 3.822341857335128e-05, + "loss": 0.0108, + "step": 78800 + }, + { + "epoch": 23.59, + "grad_norm": 0.016398323699831963, + "learning_rate": 3.8208464184238075e-05, + "loss": 0.0104, + "step": 78900 + }, + { + "epoch": 23.62, + "grad_norm": 0.32268649339675903, + "learning_rate": 3.819350979512487e-05, + "loss": 0.0085, + "step": 79000 + }, + { + "epoch": 23.62, + "eval_loss": 0.30707934498786926, + "eval_precision": 0.9256538985992314, + "eval_recall": 0.9196403830167185, + "eval_runtime": 304.8987, + "eval_samples_per_second": 43.864, + "eval_steps_per_second": 1.371, + "step": 79000 + }, + { + "epoch": 23.65, + "grad_norm": 0.1340191662311554, + "learning_rate": 3.817855540601167e-05, + "loss": 0.0132, + "step": 79100 + }, + { + "epoch": 23.68, + "grad_norm": 1.2741714715957642, + "learning_rate": 3.816360101689846e-05, + "loss": 0.0086, + "step": 79200 + }, + { + "epoch": 23.71, + "grad_norm": 3.2270684242248535, + "learning_rate": 3.8148646627785254e-05, + "loss": 0.012, + "step": 79300 + }, + { + "epoch": 23.74, + "grad_norm": 0.0873398706316948, + "learning_rate": 3.813369223867205e-05, + "loss": 0.0071, + "step": 79400 + }, + { + "epoch": 23.77, + "grad_norm": 0.36740046739578247, + "learning_rate": 3.811873784955885e-05, + "loss": 0.0082, + "step": 79500 + }, + { + "epoch": 23.8, + "grad_norm": 0.7461920976638794, + "learning_rate": 3.810378346044564e-05, + "loss": 0.0133, + "step": 79600 + }, + { + "epoch": 23.83, + "grad_norm": 1.0577598810195923, + "learning_rate": 3.808882907133244e-05, + "loss": 0.0118, + "step": 79700 + }, + { + "epoch": 23.86, + "grad_norm": 1.9472182989120483, + "learning_rate": 3.807387468221923e-05, + "loss": 0.0116, + "step": 79800 + }, + { + "epoch": 23.89, + "grad_norm": 1.6104402542114258, + "learning_rate": 3.8058920293106025e-05, + "loss": 0.0114, + "step": 79900 + }, + { + "epoch": 23.92, + "grad_norm": 0.03251710161566734, + "learning_rate": 3.8043965903992824e-05, + "loss": 0.0091, + "step": 80000 + }, + { + "epoch": 23.92, + "eval_loss": 0.3046566843986511, + "eval_precision": 0.9268397735663303, + "eval_recall": 0.9275531882139229, + "eval_runtime": 305.7377, + "eval_samples_per_second": 43.743, + "eval_steps_per_second": 1.367, + "step": 80000 + }, + { + "epoch": 23.95, + "grad_norm": 0.8245527744293213, + "learning_rate": 3.802901151487962e-05, + "loss": 0.0067, + "step": 80100 + }, + { + "epoch": 23.98, + "grad_norm": 2.3082966804504395, + "learning_rate": 3.801405712576642e-05, + "loss": 0.0103, + "step": 80200 + }, + { + "epoch": 24.01, + "grad_norm": 0.05168503150343895, + "learning_rate": 3.799910273665321e-05, + "loss": 0.0086, + "step": 80300 + }, + { + "epoch": 24.04, + "grad_norm": 0.3247091770172119, + "learning_rate": 3.798414834754e-05, + "loss": 0.0082, + "step": 80400 + }, + { + "epoch": 24.07, + "grad_norm": 0.30284127593040466, + "learning_rate": 3.79691939584268e-05, + "loss": 0.0065, + "step": 80500 + }, + { + "epoch": 24.1, + "grad_norm": 0.041343070566654205, + "learning_rate": 3.7954239569313595e-05, + "loss": 0.0072, + "step": 80600 + }, + { + "epoch": 24.13, + "grad_norm": 0.5980477929115295, + "learning_rate": 3.793928518020039e-05, + "loss": 0.0088, + "step": 80700 + }, + { + "epoch": 24.16, + "grad_norm": 0.0064304666593670845, + "learning_rate": 3.792433079108719e-05, + "loss": 0.0094, + "step": 80800 + }, + { + "epoch": 24.19, + "grad_norm": 0.6040250062942505, + "learning_rate": 3.790937640197398e-05, + "loss": 0.0079, + "step": 80900 + }, + { + "epoch": 24.22, + "grad_norm": 0.3337300419807434, + "learning_rate": 3.7894422012860773e-05, + "loss": 0.0086, + "step": 81000 + }, + { + "epoch": 24.22, + "eval_loss": 0.3350207209587097, + "eval_precision": 0.9268361054008597, + "eval_recall": 0.916192000985252, + "eval_runtime": 304.7162, + "eval_samples_per_second": 43.89, + "eval_steps_per_second": 1.372, + "step": 81000 + }, + { + "epoch": 24.25, + "grad_norm": 0.710114061832428, + "learning_rate": 3.787946762374757e-05, + "loss": 0.008, + "step": 81100 + }, + { + "epoch": 24.28, + "grad_norm": 0.03623099625110626, + "learning_rate": 3.7864513234634366e-05, + "loss": 0.0131, + "step": 81200 + }, + { + "epoch": 24.31, + "grad_norm": 0.09887418150901794, + "learning_rate": 3.784955884552116e-05, + "loss": 0.0086, + "step": 81300 + }, + { + "epoch": 24.34, + "grad_norm": 0.6916789412498474, + "learning_rate": 3.783460445640796e-05, + "loss": 0.0101, + "step": 81400 + }, + { + "epoch": 24.37, + "grad_norm": 1.4278247356414795, + "learning_rate": 3.781965006729475e-05, + "loss": 0.0107, + "step": 81500 + }, + { + "epoch": 24.4, + "grad_norm": 0.16397880017757416, + "learning_rate": 3.7804695678181544e-05, + "loss": 0.008, + "step": 81600 + }, + { + "epoch": 24.43, + "grad_norm": 0.08632964640855789, + "learning_rate": 3.7789741289068344e-05, + "loss": 0.0078, + "step": 81700 + }, + { + "epoch": 24.46, + "grad_norm": 2.2472782135009766, + "learning_rate": 3.777478689995514e-05, + "loss": 0.011, + "step": 81800 + }, + { + "epoch": 24.49, + "grad_norm": 0.14701958000659943, + "learning_rate": 3.7759832510841936e-05, + "loss": 0.0096, + "step": 81900 + }, + { + "epoch": 24.52, + "grad_norm": 0.051196735352277756, + "learning_rate": 3.774487812172873e-05, + "loss": 0.0111, + "step": 82000 + }, + { + "epoch": 24.52, + "eval_loss": 0.30252349376678467, + "eval_precision": 0.928390712570056, + "eval_recall": 0.8925459527694818, + "eval_runtime": 302.8814, + "eval_samples_per_second": 44.156, + "eval_steps_per_second": 1.38, + "step": 82000 + }, + { + "epoch": 24.55, + "grad_norm": 0.013324776664376259, + "learning_rate": 3.772992373261552e-05, + "loss": 0.0075, + "step": 82100 + }, + { + "epoch": 24.58, + "grad_norm": 0.10291430354118347, + "learning_rate": 3.771496934350232e-05, + "loss": 0.0099, + "step": 82200 + }, + { + "epoch": 24.61, + "grad_norm": 0.07137342542409897, + "learning_rate": 3.7700014954389115e-05, + "loss": 0.012, + "step": 82300 + }, + { + "epoch": 24.64, + "grad_norm": 0.3020240068435669, + "learning_rate": 3.768506056527591e-05, + "loss": 0.0087, + "step": 82400 + }, + { + "epoch": 24.67, + "grad_norm": 1.067194938659668, + "learning_rate": 3.767010617616271e-05, + "loss": 0.0096, + "step": 82500 + }, + { + "epoch": 24.7, + "grad_norm": 0.014255263842642307, + "learning_rate": 3.76551517870495e-05, + "loss": 0.007, + "step": 82600 + }, + { + "epoch": 24.73, + "grad_norm": 0.02688017673790455, + "learning_rate": 3.764019739793629e-05, + "loss": 0.0089, + "step": 82700 + }, + { + "epoch": 24.76, + "grad_norm": 0.3376453220844269, + "learning_rate": 3.762524300882309e-05, + "loss": 0.0066, + "step": 82800 + }, + { + "epoch": 24.79, + "grad_norm": 0.10389913618564606, + "learning_rate": 3.7610288619709886e-05, + "loss": 0.0066, + "step": 82900 + }, + { + "epoch": 24.82, + "grad_norm": 0.7046878337860107, + "learning_rate": 3.759533423059668e-05, + "loss": 0.01, + "step": 83000 + }, + { + "epoch": 24.82, + "eval_loss": 0.3185621201992035, + "eval_precision": 0.9291735873891379, + "eval_recall": 0.9128667754549094, + "eval_runtime": 303.4192, + "eval_samples_per_second": 44.078, + "eval_steps_per_second": 1.378, + "step": 83000 + }, + { + "epoch": 24.85, + "grad_norm": 0.4447859227657318, + "learning_rate": 3.758037984148348e-05, + "loss": 0.0085, + "step": 83100 + }, + { + "epoch": 24.88, + "grad_norm": 2.2701525688171387, + "learning_rate": 3.756542545237027e-05, + "loss": 0.0114, + "step": 83200 + }, + { + "epoch": 24.91, + "grad_norm": 0.05526027828454971, + "learning_rate": 3.755047106325707e-05, + "loss": 0.012, + "step": 83300 + }, + { + "epoch": 24.94, + "grad_norm": 0.8909191489219666, + "learning_rate": 3.7535516674143864e-05, + "loss": 0.0097, + "step": 83400 + }, + { + "epoch": 24.97, + "grad_norm": 0.004659523721784353, + "learning_rate": 3.7520562285030656e-05, + "loss": 0.0085, + "step": 83500 + }, + { + "epoch": 25.0, + "grad_norm": 0.05222604423761368, + "learning_rate": 3.7505607895917456e-05, + "loss": 0.0088, + "step": 83600 + }, + { + "epoch": 25.03, + "grad_norm": 0.014093970879912376, + "learning_rate": 3.749065350680425e-05, + "loss": 0.0085, + "step": 83700 + }, + { + "epoch": 25.06, + "grad_norm": 0.0026446671690791845, + "learning_rate": 3.747569911769104e-05, + "loss": 0.005, + "step": 83800 + }, + { + "epoch": 25.09, + "grad_norm": 0.1448344588279724, + "learning_rate": 3.746074472857784e-05, + "loss": 0.0064, + "step": 83900 + }, + { + "epoch": 25.12, + "grad_norm": 0.295718789100647, + "learning_rate": 3.7445790339464634e-05, + "loss": 0.0067, + "step": 84000 + }, + { + "epoch": 25.12, + "eval_loss": 0.32626327872276306, + "eval_precision": 0.9313109964567663, + "eval_recall": 0.9225653499184088, + "eval_runtime": 304.7239, + "eval_samples_per_second": 43.889, + "eval_steps_per_second": 1.372, + "step": 84000 + }, + { + "epoch": 25.15, + "grad_norm": 0.028157589957118034, + "learning_rate": 3.743083595035143e-05, + "loss": 0.0094, + "step": 84100 + }, + { + "epoch": 25.18, + "grad_norm": 0.002226242097094655, + "learning_rate": 3.741588156123823e-05, + "loss": 0.0072, + "step": 84200 + }, + { + "epoch": 25.21, + "grad_norm": 0.7868858575820923, + "learning_rate": 3.740092717212502e-05, + "loss": 0.0103, + "step": 84300 + }, + { + "epoch": 25.24, + "grad_norm": 0.031047280877828598, + "learning_rate": 3.738597278301181e-05, + "loss": 0.01, + "step": 84400 + }, + { + "epoch": 25.27, + "grad_norm": 0.30554434657096863, + "learning_rate": 3.737101839389861e-05, + "loss": 0.0076, + "step": 84500 + }, + { + "epoch": 25.3, + "grad_norm": 1.2695821523666382, + "learning_rate": 3.7356064004785405e-05, + "loss": 0.0092, + "step": 84600 + }, + { + "epoch": 25.33, + "grad_norm": 0.039061836898326874, + "learning_rate": 3.7341109615672205e-05, + "loss": 0.0129, + "step": 84700 + }, + { + "epoch": 25.36, + "grad_norm": 1.0094258785247803, + "learning_rate": 3.7326155226559e-05, + "loss": 0.012, + "step": 84800 + }, + { + "epoch": 25.39, + "grad_norm": 0.16602523624897003, + "learning_rate": 3.731120083744579e-05, + "loss": 0.0072, + "step": 84900 + }, + { + "epoch": 25.42, + "grad_norm": 0.6232153177261353, + "learning_rate": 3.729624644833259e-05, + "loss": 0.0094, + "step": 85000 + }, + { + "epoch": 25.42, + "eval_loss": 0.32043251395225525, + "eval_precision": 0.9310592123725484, + "eval_recall": 0.91936328088919, + "eval_runtime": 304.0822, + "eval_samples_per_second": 43.982, + "eval_steps_per_second": 1.375, + "step": 85000 + }, + { + "epoch": 25.45, + "grad_norm": 1.6009403467178345, + "learning_rate": 3.728129205921938e-05, + "loss": 0.0103, + "step": 85100 + }, + { + "epoch": 25.48, + "grad_norm": 0.6107264757156372, + "learning_rate": 3.7266337670106176e-05, + "loss": 0.0079, + "step": 85200 + }, + { + "epoch": 25.51, + "grad_norm": 0.44173404574394226, + "learning_rate": 3.7251383280992976e-05, + "loss": 0.0065, + "step": 85300 + }, + { + "epoch": 25.54, + "grad_norm": 0.9073717594146729, + "learning_rate": 3.723642889187977e-05, + "loss": 0.0071, + "step": 85400 + }, + { + "epoch": 25.57, + "grad_norm": 0.3392820656299591, + "learning_rate": 3.722147450276656e-05, + "loss": 0.0101, + "step": 85500 + }, + { + "epoch": 25.6, + "grad_norm": 0.07929588109254837, + "learning_rate": 3.720652011365336e-05, + "loss": 0.0083, + "step": 85600 + }, + { + "epoch": 25.63, + "grad_norm": 0.35071372985839844, + "learning_rate": 3.7191565724540154e-05, + "loss": 0.0121, + "step": 85700 + }, + { + "epoch": 25.66, + "grad_norm": 0.20559339225292206, + "learning_rate": 3.717661133542695e-05, + "loss": 0.0073, + "step": 85800 + }, + { + "epoch": 25.69, + "grad_norm": 0.045159224420785904, + "learning_rate": 3.716165694631375e-05, + "loss": 0.0087, + "step": 85900 + }, + { + "epoch": 25.72, + "grad_norm": 0.10148915648460388, + "learning_rate": 3.714670255720054e-05, + "loss": 0.0119, + "step": 86000 + }, + { + "epoch": 25.72, + "eval_loss": 0.31306663155555725, + "eval_precision": 0.9333648989898989, + "eval_recall": 0.9104036454324332, + "eval_runtime": 304.164, + "eval_samples_per_second": 43.97, + "eval_steps_per_second": 1.374, + "step": 86000 + }, + { + "epoch": 25.75, + "grad_norm": 0.18669423460960388, + "learning_rate": 3.713174816808734e-05, + "loss": 0.0063, + "step": 86100 + }, + { + "epoch": 25.78, + "grad_norm": 0.10197019577026367, + "learning_rate": 3.711679377897413e-05, + "loss": 0.0083, + "step": 86200 + }, + { + "epoch": 25.81, + "grad_norm": 0.0219405684620142, + "learning_rate": 3.7101839389860925e-05, + "loss": 0.0088, + "step": 86300 + }, + { + "epoch": 25.84, + "grad_norm": 0.941899836063385, + "learning_rate": 3.7086885000747725e-05, + "loss": 0.006, + "step": 86400 + }, + { + "epoch": 25.87, + "grad_norm": 0.042357202619314194, + "learning_rate": 3.707193061163452e-05, + "loss": 0.0107, + "step": 86500 + }, + { + "epoch": 25.9, + "grad_norm": 0.04090040549635887, + "learning_rate": 3.705697622252131e-05, + "loss": 0.0076, + "step": 86600 + }, + { + "epoch": 25.93, + "grad_norm": 1.0006482601165771, + "learning_rate": 3.704202183340811e-05, + "loss": 0.0081, + "step": 86700 + }, + { + "epoch": 25.96, + "grad_norm": 0.01344706118106842, + "learning_rate": 3.70270674442949e-05, + "loss": 0.0061, + "step": 86800 + }, + { + "epoch": 25.99, + "grad_norm": 0.039950937032699585, + "learning_rate": 3.7012113055181696e-05, + "loss": 0.0095, + "step": 86900 + }, + { + "epoch": 26.02, + "grad_norm": 0.007412883453071117, + "learning_rate": 3.6997158666068496e-05, + "loss": 0.0061, + "step": 87000 + }, + { + "epoch": 26.02, + "eval_loss": 0.3440411686897278, + "eval_precision": 0.9280669958127618, + "eval_recall": 0.9144370208442378, + "eval_runtime": 304.1449, + "eval_samples_per_second": 43.972, + "eval_steps_per_second": 1.374, + "step": 87000 + }, + { + "epoch": 26.05, + "grad_norm": 0.045031215995550156, + "learning_rate": 3.698220427695529e-05, + "loss": 0.0083, + "step": 87100 + }, + { + "epoch": 26.08, + "grad_norm": 0.5366631150245667, + "learning_rate": 3.696724988784208e-05, + "loss": 0.0069, + "step": 87200 + }, + { + "epoch": 26.11, + "grad_norm": 0.24467185139656067, + "learning_rate": 3.695229549872888e-05, + "loss": 0.0065, + "step": 87300 + }, + { + "epoch": 26.14, + "grad_norm": 0.7528616786003113, + "learning_rate": 3.6937341109615674e-05, + "loss": 0.0087, + "step": 87400 + }, + { + "epoch": 26.17, + "grad_norm": 0.15506117045879364, + "learning_rate": 3.692238672050247e-05, + "loss": 0.0072, + "step": 87500 + }, + { + "epoch": 26.2, + "grad_norm": 0.2464226335287094, + "learning_rate": 3.6907432331389266e-05, + "loss": 0.0053, + "step": 87600 + }, + { + "epoch": 26.23, + "grad_norm": 0.15138311684131622, + "learning_rate": 3.689247794227606e-05, + "loss": 0.0063, + "step": 87700 + }, + { + "epoch": 26.26, + "grad_norm": 0.07477385550737381, + "learning_rate": 3.687752355316286e-05, + "loss": 0.0076, + "step": 87800 + }, + { + "epoch": 26.29, + "grad_norm": 0.661697268486023, + "learning_rate": 3.686256916404965e-05, + "loss": 0.0078, + "step": 87900 + }, + { + "epoch": 26.32, + "grad_norm": 0.16399236023426056, + "learning_rate": 3.6847614774936445e-05, + "loss": 0.0085, + "step": 88000 + }, + { + "epoch": 26.32, + "eval_loss": 0.326471209526062, + "eval_precision": 0.9298322483725588, + "eval_recall": 0.9147449120970473, + "eval_runtime": 305.1957, + "eval_samples_per_second": 43.821, + "eval_steps_per_second": 1.37, + "step": 88000 + }, + { + "epoch": 26.35, + "grad_norm": 0.5788341164588928, + "learning_rate": 3.6832660385823244e-05, + "loss": 0.0097, + "step": 88100 + }, + { + "epoch": 26.38, + "grad_norm": 0.38478532433509827, + "learning_rate": 3.681770599671003e-05, + "loss": 0.0083, + "step": 88200 + }, + { + "epoch": 26.41, + "grad_norm": 1.8616811037063599, + "learning_rate": 3.680275160759683e-05, + "loss": 0.0082, + "step": 88300 + }, + { + "epoch": 26.44, + "grad_norm": 0.005648652091622353, + "learning_rate": 3.678779721848363e-05, + "loss": 0.0074, + "step": 88400 + }, + { + "epoch": 26.47, + "grad_norm": 0.013662021607160568, + "learning_rate": 3.677284282937042e-05, + "loss": 0.0054, + "step": 88500 + }, + { + "epoch": 26.5, + "grad_norm": 0.21754692494869232, + "learning_rate": 3.6757888440257216e-05, + "loss": 0.0115, + "step": 88600 + }, + { + "epoch": 26.53, + "grad_norm": 0.0358903631567955, + "learning_rate": 3.6742934051144015e-05, + "loss": 0.0097, + "step": 88700 + }, + { + "epoch": 26.56, + "grad_norm": 0.9966431856155396, + "learning_rate": 3.672797966203081e-05, + "loss": 0.0074, + "step": 88800 + }, + { + "epoch": 26.58, + "grad_norm": 0.7227293848991394, + "learning_rate": 3.67130252729176e-05, + "loss": 0.0088, + "step": 88900 + }, + { + "epoch": 26.61, + "grad_norm": 1.3261148929595947, + "learning_rate": 3.66980708838044e-05, + "loss": 0.0072, + "step": 89000 + }, + { + "epoch": 26.61, + "eval_loss": 0.3263101279735565, + "eval_precision": 0.9263782601905357, + "eval_recall": 0.9131438775824379, + "eval_runtime": 306.4472, + "eval_samples_per_second": 43.642, + "eval_steps_per_second": 1.364, + "step": 89000 + }, + { + "epoch": 26.64, + "grad_norm": 0.11170350760221481, + "learning_rate": 3.6683116494691194e-05, + "loss": 0.0092, + "step": 89100 + }, + { + "epoch": 26.67, + "grad_norm": 1.529340147972107, + "learning_rate": 3.666816210557799e-05, + "loss": 0.0089, + "step": 89200 + }, + { + "epoch": 26.7, + "grad_norm": 0.01682981289923191, + "learning_rate": 3.665320771646478e-05, + "loss": 0.0093, + "step": 89300 + }, + { + "epoch": 26.73, + "grad_norm": 0.3299085199832916, + "learning_rate": 3.663825332735158e-05, + "loss": 0.0063, + "step": 89400 + }, + { + "epoch": 26.76, + "grad_norm": 1.9823254346847534, + "learning_rate": 3.662329893823838e-05, + "loss": 0.0091, + "step": 89500 + }, + { + "epoch": 26.79, + "grad_norm": 0.07487453520298004, + "learning_rate": 3.6608344549125165e-05, + "loss": 0.009, + "step": 89600 + }, + { + "epoch": 26.82, + "grad_norm": 0.015319288708269596, + "learning_rate": 3.6593390160011964e-05, + "loss": 0.0078, + "step": 89700 + }, + { + "epoch": 26.85, + "grad_norm": 0.004087815526872873, + "learning_rate": 3.6578435770898764e-05, + "loss": 0.0069, + "step": 89800 + }, + { + "epoch": 26.88, + "grad_norm": 0.00753753213211894, + "learning_rate": 3.656348138178556e-05, + "loss": 0.0057, + "step": 89900 + }, + { + "epoch": 26.91, + "grad_norm": 0.012257667258381844, + "learning_rate": 3.654852699267235e-05, + "loss": 0.0095, + "step": 90000 + }, + { + "epoch": 26.91, + "eval_loss": 0.3233014643192291, + "eval_precision": 0.9329517062525696, + "eval_recall": 0.9082484066627667, + "eval_runtime": 304.4964, + "eval_samples_per_second": 43.922, + "eval_steps_per_second": 1.373, + "step": 90000 + }, + { + "epoch": 26.94, + "grad_norm": 0.030741436406970024, + "learning_rate": 3.653357260355915e-05, + "loss": 0.0067, + "step": 90100 + }, + { + "epoch": 26.97, + "grad_norm": 0.429049551486969, + "learning_rate": 3.651861821444594e-05, + "loss": 0.012, + "step": 90200 + }, + { + "epoch": 27.0, + "grad_norm": 0.002479678951203823, + "learning_rate": 3.6503663825332735e-05, + "loss": 0.005, + "step": 90300 + }, + { + "epoch": 27.03, + "grad_norm": 0.12390375137329102, + "learning_rate": 3.648870943621953e-05, + "loss": 0.0083, + "step": 90400 + }, + { + "epoch": 27.06, + "grad_norm": 0.044969938695430756, + "learning_rate": 3.647375504710633e-05, + "loss": 0.0073, + "step": 90500 + }, + { + "epoch": 27.09, + "grad_norm": 0.06378799676895142, + "learning_rate": 3.645880065799313e-05, + "loss": 0.0073, + "step": 90600 + }, + { + "epoch": 27.12, + "grad_norm": 0.323734849691391, + "learning_rate": 3.6443846268879914e-05, + "loss": 0.0078, + "step": 90700 + }, + { + "epoch": 27.15, + "grad_norm": 1.6457269191741943, + "learning_rate": 3.642889187976671e-05, + "loss": 0.0055, + "step": 90800 + }, + { + "epoch": 27.18, + "grad_norm": 0.007004741113632917, + "learning_rate": 3.641393749065351e-05, + "loss": 0.0065, + "step": 90900 + }, + { + "epoch": 27.21, + "grad_norm": 0.06395163387060165, + "learning_rate": 3.63989831015403e-05, + "loss": 0.0062, + "step": 91000 + }, + { + "epoch": 27.21, + "eval_loss": 0.32764899730682373, + "eval_precision": 0.9317584480600751, + "eval_recall": 0.916869361741433, + "eval_runtime": 309.1631, + "eval_samples_per_second": 43.259, + "eval_steps_per_second": 1.352, + "step": 91000 + }, + { + "epoch": 27.24, + "grad_norm": 0.005486265290528536, + "learning_rate": 3.63840287124271e-05, + "loss": 0.0082, + "step": 91100 + }, + { + "epoch": 27.27, + "grad_norm": 2.3132262229919434, + "learning_rate": 3.63690743233139e-05, + "loss": 0.0067, + "step": 91200 + }, + { + "epoch": 27.3, + "grad_norm": 0.07687461376190186, + "learning_rate": 3.635411993420069e-05, + "loss": 0.0051, + "step": 91300 + }, + { + "epoch": 27.33, + "grad_norm": 0.05096305161714554, + "learning_rate": 3.6339165545087484e-05, + "loss": 0.0061, + "step": 91400 + }, + { + "epoch": 27.36, + "grad_norm": 0.21200311183929443, + "learning_rate": 3.6324211155974284e-05, + "loss": 0.0072, + "step": 91500 + }, + { + "epoch": 27.39, + "grad_norm": 0.07336900383234024, + "learning_rate": 3.630925676686108e-05, + "loss": 0.008, + "step": 91600 + }, + { + "epoch": 27.42, + "grad_norm": 0.026788916438817978, + "learning_rate": 3.629430237774787e-05, + "loss": 0.0068, + "step": 91700 + }, + { + "epoch": 27.45, + "grad_norm": 0.03046250529587269, + "learning_rate": 3.627934798863466e-05, + "loss": 0.0081, + "step": 91800 + }, + { + "epoch": 27.48, + "grad_norm": 0.32240158319473267, + "learning_rate": 3.626439359952146e-05, + "loss": 0.0091, + "step": 91900 + }, + { + "epoch": 27.51, + "grad_norm": 0.1428656429052353, + "learning_rate": 3.624943921040826e-05, + "loss": 0.007, + "step": 92000 + }, + { + "epoch": 27.51, + "eval_loss": 0.3499869704246521, + "eval_precision": 0.9278612426685068, + "eval_recall": 0.9108346931863666, + "eval_runtime": 310.2456, + "eval_samples_per_second": 43.108, + "eval_steps_per_second": 1.347, + "step": 92000 + } + ], + "logging_steps": 100, + "max_steps": 334400, + "num_input_tokens_seen": 0, + "num_train_epochs": 100, + "save_steps": 500, + "total_flos": 4.8090441780412416e+17, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}