{
  "best_metric": 2.129138708114624,
  "best_model_checkpoint": "/home/sunggeunan/data/ICL/outputs/lora/SKIML-ICL_mrqa_nq_v3/Meta-Llama-3-8B-Instruct-unanswerable-2Q-1U-0C-qa_first/checkpoint-402",
  "epoch": 0.9996891513832763,
  "eval_steps": 500,
  "global_step": 402,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.0024867889337892445, "grad_norm": 0.4153629243373871, "learning_rate": 8.19672131147541e-09, "loss": 2.121, "step": 1 },
    { "epoch": 0.004973577867578489, "grad_norm": 0.4008300006389618, "learning_rate": 1.639344262295082e-08, "loss": 2.1542, "step": 2 },
    { "epoch": 0.007460366801367734, "grad_norm": 0.42680642008781433, "learning_rate": 2.459016393442623e-08, "loss": 2.1439, "step": 3 },
    { "epoch": 0.009947155735156978, "grad_norm": 0.3831591308116913, "learning_rate": 3.278688524590164e-08, "loss": 2.1025, "step": 4 },
    { "epoch": 0.012433944668946224, "grad_norm": 0.40637049078941345, "learning_rate": 4.0983606557377046e-08, "loss": 2.1374, "step": 5 },
    { "epoch": 0.014920733602735468, "grad_norm": 0.3883218467235565, "learning_rate": 4.918032786885246e-08, "loss": 2.1412, "step": 6 },
    { "epoch": 0.017407522536524712, "grad_norm": 0.42491665482521057, "learning_rate": 5.7377049180327866e-08, "loss": 2.205, "step": 7 },
    { "epoch": 0.019894311470313956, "grad_norm": 0.40381714701652527, "learning_rate": 6.557377049180328e-08, "loss": 2.1575, "step": 8 },
    { "epoch": 0.022381100404103203, "grad_norm": 0.3807780146598816, "learning_rate": 7.377049180327868e-08, "loss": 2.2206, "step": 9 },
    { "epoch": 0.024867889337892447, "grad_norm": 0.3880959451198578, "learning_rate": 8.196721311475409e-08, "loss": 2.1772, "step": 10 },
    { "epoch": 0.02735467827168169, "grad_norm": 0.36500561237335205, "learning_rate": 9.01639344262295e-08, "loss": 2.1232, "step": 11 },
    { "epoch": 0.029841467205470935, "grad_norm": 0.3805394172668457, "learning_rate": 9.836065573770492e-08, "loss": 2.1446, "step": 12 },
    { "epoch": 0.03232825613926018, "grad_norm": 0.38014543056488037, "learning_rate": 1.0655737704918032e-07, "loss": 2.1277, "step": 13 },
    { "epoch": 0.034815045073049423, "grad_norm": 0.3875851333141327, "learning_rate": 1.1475409836065573e-07, "loss": 2.1064, "step": 14 },
    { "epoch": 0.03730183400683867, "grad_norm": 0.39165419340133667, "learning_rate": 1.2295081967213113e-07, "loss": 2.1352, "step": 15 },
    { "epoch": 0.03978862294062791, "grad_norm": 0.4067535102367401, "learning_rate": 1.3114754098360656e-07, "loss": 2.1601, "step": 16 },
    { "epoch": 0.042275411874417156, "grad_norm": 0.41718506813049316, "learning_rate": 1.3934426229508196e-07, "loss": 2.1471, "step": 17 },
    { "epoch": 0.04476220080820641, "grad_norm": 0.4221360981464386, "learning_rate": 1.4754098360655736e-07, "loss": 2.1499, "step": 18 },
    { "epoch": 0.04724898974199565, "grad_norm": 0.39923396706581116, "learning_rate": 1.5573770491803278e-07, "loss": 2.1003, "step": 19 },
    { "epoch": 0.049735778675784895, "grad_norm": 0.3728751242160797, "learning_rate": 1.6393442622950818e-07, "loss": 2.0842, "step": 20 },
    { "epoch": 0.05222256760957414, "grad_norm": 0.3873041868209839, "learning_rate": 1.7213114754098358e-07, "loss": 2.1152, "step": 21 },
    { "epoch": 0.05470935654336338, "grad_norm": 0.3714573383331299, "learning_rate": 1.80327868852459e-07, "loss": 2.1215, "step": 22 },
    { "epoch": 0.05719614547715263, "grad_norm": 0.4204677939414978, "learning_rate": 1.885245901639344e-07, "loss": 2.1859, "step": 23 },
    { "epoch": 0.05968293441094187, "grad_norm": 0.4137566089630127, "learning_rate": 1.9672131147540984e-07, "loss": 2.1317, "step": 24 },
    { "epoch": 0.062169723344731115, "grad_norm": 0.3629921078681946, "learning_rate": 2.0491803278688524e-07, "loss": 2.0563, "step": 25 },
    { "epoch": 0.06465651227852036, "grad_norm": 0.39072492718696594, "learning_rate": 2.1311475409836064e-07, "loss": 2.164, "step": 26 },
    { "epoch": 0.0671433012123096, "grad_norm": 0.37331125140190125, "learning_rate": 2.2131147540983606e-07, "loss": 2.2048, "step": 27 },
    { "epoch": 0.06963009014609885, "grad_norm": 0.3944483697414398, "learning_rate": 2.2950819672131146e-07, "loss": 2.1835, "step": 28 },
    { "epoch": 0.07211687907988809, "grad_norm": 0.39379164576530457, "learning_rate": 2.3770491803278686e-07, "loss": 2.1465, "step": 29 },
    { "epoch": 0.07460366801367734, "grad_norm": 0.3914564549922943, "learning_rate": 2.4590163934426226e-07, "loss": 2.091, "step": 30 },
    { "epoch": 0.07709045694746658, "grad_norm": 0.4301564395427704, "learning_rate": 2.540983606557377e-07, "loss": 2.1183, "step": 31 },
    { "epoch": 0.07957724588125582, "grad_norm": 0.40827327966690063, "learning_rate": 2.622950819672131e-07, "loss": 2.1588, "step": 32 },
    { "epoch": 0.08206403481504507, "grad_norm": 0.3868783116340637, "learning_rate": 2.704918032786885e-07, "loss": 2.1667, "step": 33 },
    { "epoch": 0.08455082374883431, "grad_norm": 0.40489786863327026, "learning_rate": 2.786885245901639e-07, "loss": 2.1857, "step": 34 },
    { "epoch": 0.08703761268262357, "grad_norm": 0.3836217224597931, "learning_rate": 2.868852459016393e-07, "loss": 2.1313, "step": 35 },
    { "epoch": 0.08952440161641281, "grad_norm": 0.4212404787540436, "learning_rate": 2.950819672131147e-07, "loss": 2.2024, "step": 36 },
    { "epoch": 0.09201119055020206, "grad_norm": 0.395867258310318, "learning_rate": 3.0327868852459017e-07, "loss": 2.1355, "step": 37 },
    { "epoch": 0.0944979794839913, "grad_norm": 0.3836336135864258, "learning_rate": 3.1147540983606557e-07, "loss": 2.1254, "step": 38 },
    { "epoch": 0.09698476841778055, "grad_norm": 0.42212599515914917, "learning_rate": 3.1967213114754097e-07, "loss": 2.1262, "step": 39 },
    { "epoch": 0.09947155735156979, "grad_norm": 0.43291711807250977, "learning_rate": 3.2786885245901637e-07, "loss": 2.1664, "step": 40 },
    { "epoch": 0.10195834628535903, "grad_norm": 0.3876365125179291, "learning_rate": 3.3606557377049177e-07, "loss": 2.147, "step": 41 },
    { "epoch": 0.10444513521914828, "grad_norm": 0.39074528217315674, "learning_rate": 3.4426229508196717e-07, "loss": 2.1113, "step": 42 },
    { "epoch": 0.10693192415293752, "grad_norm": 0.4135940670967102, "learning_rate": 3.524590163934426e-07, "loss": 2.1834, "step": 43 },
    { "epoch": 0.10941871308672677, "grad_norm": 0.4124310612678528, "learning_rate": 3.60655737704918e-07, "loss": 2.1019, "step": 44 },
    { "epoch": 0.11190550202051601, "grad_norm": 0.3812576234340668, "learning_rate": 3.6885245901639347e-07, "loss": 2.1368, "step": 45 },
    { "epoch": 0.11439229095430525, "grad_norm": 0.3919021189212799, "learning_rate": 3.770491803278688e-07, "loss": 2.1226, "step": 46 },
    { "epoch": 0.1168790798880945, "grad_norm": 0.37712955474853516, "learning_rate": 3.852459016393442e-07, "loss": 2.1723, "step": 47 },
    { "epoch": 0.11936586882188374, "grad_norm": 0.40433424711227417, "learning_rate": 3.9344262295081967e-07, "loss": 2.1469, "step": 48 },
    { "epoch": 0.12185265775567299, "grad_norm": 0.4323996603488922, "learning_rate": 4.0163934426229507e-07, "loss": 2.1625, "step": 49 },
    { "epoch": 0.12433944668946223, "grad_norm": 0.37467238306999207, "learning_rate": 4.0983606557377047e-07, "loss": 2.0877, "step": 50 },
    { "epoch": 0.1268262356232515, "grad_norm": 0.3842613399028778, "learning_rate": 4.180327868852459e-07, "loss": 2.1367, "step": 51 },
    { "epoch": 0.12931302455704072, "grad_norm": 0.41727927327156067, "learning_rate": 4.2622950819672127e-07, "loss": 2.158, "step": 52 },
    { "epoch": 0.13179981349082998, "grad_norm": 0.427172988653183, "learning_rate": 4.3442622950819667e-07, "loss": 2.185, "step": 53 },
    { "epoch": 0.1342866024246192, "grad_norm": 0.3944658041000366, "learning_rate": 4.426229508196721e-07, "loss": 2.1537, "step": 54 },
    { "epoch": 0.13677339135840846, "grad_norm": 0.3892759382724762, "learning_rate": 4.508196721311475e-07, "loss": 2.1187, "step": 55 },
    { "epoch": 0.1392601802921977, "grad_norm": 0.40089288353919983, "learning_rate": 4.590163934426229e-07, "loss": 2.1299, "step": 56 },
    { "epoch": 0.14174696922598695, "grad_norm": 0.4039812982082367, "learning_rate": 4.672131147540984e-07, "loss": 2.1734, "step": 57 },
    { "epoch": 0.14423375815977618, "grad_norm": 0.43650051951408386, "learning_rate": 4.754098360655737e-07, "loss": 2.1511, "step": 58 },
    { "epoch": 0.14672054709356544, "grad_norm": 0.40934914350509644, "learning_rate": 4.836065573770492e-07, "loss": 2.1244, "step": 59 },
    { "epoch": 0.14920733602735467, "grad_norm": 0.40043023228645325, "learning_rate": 4.918032786885245e-07, "loss": 2.1584, "step": 60 },
    { "epoch": 0.15169412496114393, "grad_norm": 0.4129016697406769, "learning_rate": 5e-07, "loss": 2.158, "step": 61 },
    { "epoch": 0.15418091389493316, "grad_norm": 0.38239961862564087, "learning_rate": 4.995633187772926e-07, "loss": 2.1423, "step": 62 },
    { "epoch": 0.15666770282872242, "grad_norm": 0.4175527095794678, "learning_rate": 4.991266375545852e-07, "loss": 2.1224, "step": 63 },
    { "epoch": 0.15915449176251165, "grad_norm": 0.4162661135196686, "learning_rate": 4.986899563318778e-07, "loss": 2.1816, "step": 64 },
    { "epoch": 0.1616412806963009, "grad_norm": 0.4039028584957123, "learning_rate": 4.982532751091702e-07, "loss": 2.1734, "step": 65 },
    { "epoch": 0.16412806963009013, "grad_norm": 0.4020048677921295, "learning_rate": 4.978165938864628e-07, "loss": 2.1546, "step": 66 },
    { "epoch": 0.1666148585638794, "grad_norm": 0.4302126169204712, "learning_rate": 4.973799126637554e-07, "loss": 2.1906, "step": 67 },
    { "epoch": 0.16910164749766862, "grad_norm": 0.4021979570388794, "learning_rate": 4.96943231441048e-07, "loss": 2.1297, "step": 68 },
    { "epoch": 0.17158843643145788, "grad_norm": 0.3942105174064636, "learning_rate": 4.965065502183406e-07, "loss": 2.1292, "step": 69 },
    { "epoch": 0.17407522536524714, "grad_norm": 0.4106265902519226, "learning_rate": 4.960698689956332e-07, "loss": 2.1454, "step": 70 },
    { "epoch": 0.17656201429903637, "grad_norm": 0.4014648199081421, "learning_rate": 4.956331877729257e-07, "loss": 2.0864, "step": 71 },
    { "epoch": 0.17904880323282563, "grad_norm": 0.41134366393089294, "learning_rate": 4.951965065502184e-07, "loss": 2.1021, "step": 72 },
    { "epoch": 0.18153559216661486, "grad_norm": 0.40096017718315125, "learning_rate": 4.947598253275109e-07, "loss": 2.1465, "step": 73 },
    { "epoch": 0.18402238110040411, "grad_norm": 0.41377922892570496, "learning_rate": 4.943231441048035e-07, "loss": 2.1694, "step": 74 },
    { "epoch": 0.18650917003419334, "grad_norm": 0.39552953839302063, "learning_rate": 4.93886462882096e-07, "loss": 2.1748, "step": 75 },
    { "epoch": 0.1889959589679826, "grad_norm": 0.44786471128463745, "learning_rate": 4.934497816593886e-07, "loss": 2.1785, "step": 76 },
    { "epoch": 0.19148274790177183, "grad_norm": 0.42525768280029297, "learning_rate": 4.930131004366812e-07, "loss": 2.1825, "step": 77 },
    { "epoch": 0.1939695368355611, "grad_norm": 0.427071750164032, "learning_rate": 4.925764192139738e-07, "loss": 2.1463, "step": 78 },
    { "epoch": 0.19645632576935032, "grad_norm": 0.41076913475990295, "learning_rate": 4.921397379912663e-07, "loss": 2.124, "step": 79 },
    { "epoch": 0.19894311470313958, "grad_norm": 0.4056430160999298, "learning_rate": 4.917030567685589e-07, "loss": 2.143, "step": 80 },
    { "epoch": 0.2014299036369288, "grad_norm": 0.4058414101600647, "learning_rate": 4.912663755458515e-07, "loss": 2.1337, "step": 81 },
    { "epoch": 0.20391669257071807, "grad_norm": 0.4427083730697632, "learning_rate": 4.908296943231441e-07, "loss": 2.1631, "step": 82 },
    { "epoch": 0.2064034815045073, "grad_norm": 0.4002906084060669, "learning_rate": 4.903930131004367e-07, "loss": 2.0538, "step": 83 },
    { "epoch": 0.20889027043829655, "grad_norm": 0.40065857768058777, "learning_rate": 4.899563318777293e-07, "loss": 2.1129, "step": 84 },
    { "epoch": 0.21137705937208578, "grad_norm": 0.42688536643981934, "learning_rate": 4.895196506550219e-07, "loss": 2.139, "step": 85 },
    { "epoch": 0.21386384830587504, "grad_norm": 0.4278879165649414, "learning_rate": 4.890829694323143e-07, "loss": 2.1395, "step": 86 },
    { "epoch": 0.21635063723966427, "grad_norm": 0.43649378418922424, "learning_rate": 4.886462882096069e-07, "loss": 2.0751, "step": 87 },
    { "epoch": 0.21883742617345353, "grad_norm": 0.3865818977355957, "learning_rate": 4.882096069868995e-07, "loss": 2.0862, "step": 88 },
    { "epoch": 0.22132421510724276, "grad_norm": 0.42509347200393677, "learning_rate": 4.877729257641921e-07, "loss": 2.1478, "step": 89 },
    { "epoch": 0.22381100404103202, "grad_norm": 0.4220832884311676, "learning_rate": 4.873362445414847e-07, "loss": 2.1609, "step": 90 },
    { "epoch": 0.22629779297482125, "grad_norm": 0.40812230110168457, "learning_rate": 4.868995633187773e-07, "loss": 2.1551, "step": 91 },
    { "epoch": 0.2287845819086105, "grad_norm": 0.4381932020187378, "learning_rate": 4.864628820960698e-07, "loss": 2.1594, "step": 92 },
    { "epoch": 0.23127137084239976, "grad_norm": 0.4095819890499115, "learning_rate": 4.860262008733625e-07, "loss": 2.1396, "step": 93 },
    { "epoch": 0.233758159776189, "grad_norm": 0.42679563164711, "learning_rate": 4.85589519650655e-07, "loss": 2.1771, "step": 94 },
    { "epoch": 0.23624494870997825, "grad_norm": 0.44320186972618103, "learning_rate": 4.851528384279476e-07, "loss": 2.1842, "step": 95 },
    { "epoch": 0.23873173764376748, "grad_norm": 0.39184531569480896, "learning_rate": 4.847161572052402e-07, "loss": 2.1378, "step": 96 },
    { "epoch": 0.24121852657755674, "grad_norm": 0.4558006823062897, "learning_rate": 4.842794759825327e-07, "loss": 2.1706, "step": 97 },
    { "epoch": 0.24370531551134597, "grad_norm": 0.4423806667327881, "learning_rate": 4.838427947598253e-07, "loss": 2.1841, "step": 98 },
    { "epoch": 0.24619210444513523, "grad_norm": 0.4314688444137573, "learning_rate": 4.834061135371178e-07, "loss": 2.1154, "step": 99 },
    { "epoch": 0.24867889337892446, "grad_norm": 0.44223618507385254, "learning_rate": 4.829694323144104e-07, "loss": 2.1124, "step": 100 },
    { "epoch": 0.2511656823127137, "grad_norm": 0.44006800651550293, "learning_rate": 4.82532751091703e-07, "loss": 2.1937, "step": 101 },
    { "epoch": 0.253652471246503, "grad_norm": 0.4089645445346832, "learning_rate": 4.820960698689956e-07, "loss": 2.1236, "step": 102 },
    { "epoch": 0.2561392601802922, "grad_norm": 0.4407235383987427, "learning_rate": 4.816593886462882e-07, "loss": 2.1116, "step": 103 },
    { "epoch": 0.25862604911408144, "grad_norm": 0.4155865013599396, "learning_rate": 4.812227074235808e-07, "loss": 2.1259, "step": 104 },
    { "epoch": 0.26111283804787067, "grad_norm": 0.4286578893661499, "learning_rate": 4.807860262008734e-07, "loss": 2.1831, "step": 105 },
    { "epoch": 0.26359962698165995, "grad_norm": 0.44605061411857605, "learning_rate": 4.80349344978166e-07, "loss": 2.1373, "step": 106 },
    { "epoch": 0.2660864159154492, "grad_norm": 0.4532274007797241, "learning_rate": 4.799126637554585e-07, "loss": 2.1249, "step": 107 },
    { "epoch": 0.2685732048492384, "grad_norm": 0.4327315092086792, "learning_rate": 4.79475982532751e-07, "loss": 2.1414, "step": 108 },
    { "epoch": 0.27105999378302764, "grad_norm": 0.4438115358352661, "learning_rate": 4.790393013100436e-07, "loss": 2.0787, "step": 109 },
    { "epoch": 0.2735467827168169, "grad_norm": 0.4239655137062073, "learning_rate": 4.786026200873362e-07, "loss": 2.1234, "step": 110 },
    { "epoch": 0.27603357165060616, "grad_norm": 0.4541226327419281, "learning_rate": 4.781659388646288e-07, "loss": 2.2023, "step": 111 },
    { "epoch": 0.2785203605843954, "grad_norm": 0.4275488555431366, "learning_rate": 4.777292576419214e-07, "loss": 2.1872, "step": 112 },
    { "epoch": 0.2810071495181846, "grad_norm": 0.4427001476287842, "learning_rate": 4.772925764192139e-07, "loss": 2.1646, "step": 113 },
    { "epoch": 0.2834939384519739, "grad_norm": 0.43924546241760254, "learning_rate": 4.768558951965065e-07, "loss": 2.1175, "step": 114 },
    { "epoch": 0.28598072738576313, "grad_norm": 0.42863723635673523, "learning_rate": 4.764192139737991e-07, "loss": 2.1269, "step": 115 },
    { "epoch": 0.28846751631955236, "grad_norm": 0.40726035833358765, "learning_rate": 4.759825327510917e-07, "loss": 2.071, "step": 116 },
    { "epoch": 0.29095430525334165, "grad_norm": 0.4581323564052582, "learning_rate": 4.7554585152838427e-07, "loss": 2.1981, "step": 117 },
    { "epoch": 0.2934410941871309, "grad_norm": 0.46849963068962097, "learning_rate": 4.751091703056768e-07, "loss": 2.1689, "step": 118 },
    { "epoch": 0.2959278831209201, "grad_norm": 0.45309266448020935, "learning_rate": 4.746724890829694e-07, "loss": 2.1122, "step": 119 },
    { "epoch": 0.29841467205470934, "grad_norm": 0.4709586203098297, "learning_rate": 4.7423580786026193e-07, "loss": 2.1774, "step": 120 },
    { "epoch": 0.3009014609884986, "grad_norm": 0.4502153992652893, "learning_rate": 4.737991266375546e-07, "loss": 2.1357, "step": 121 },
    { "epoch": 0.30338824992228786, "grad_norm": 0.4565674066543579, "learning_rate": 4.7336244541484717e-07, "loss": 2.1982, "step": 122 },
    { "epoch": 0.3058750388560771, "grad_norm": 0.4817062020301819, "learning_rate": 4.729257641921397e-07, "loss": 2.1124, "step": 123 },
    { "epoch": 0.3083618277898663, "grad_norm": 0.43636277318000793, "learning_rate": 4.724890829694323e-07, "loss": 2.1345, "step": 124 },
    { "epoch": 0.3108486167236556, "grad_norm": 0.4348713159561157, "learning_rate": 4.720524017467249e-07, "loss": 2.1382, "step": 125 },
    { "epoch": 0.31333540565744483, "grad_norm": 0.43462586402893066, "learning_rate": 4.7161572052401743e-07, "loss": 2.1708, "step": 126 },
    { "epoch": 0.31582219459123406, "grad_norm": 0.4370459318161011, "learning_rate": 4.7117903930131e-07, "loss": 2.128, "step": 127 },
    { "epoch": 0.3183089835250233, "grad_norm": 0.43912699818611145, "learning_rate": 4.7074235807860256e-07, "loss": 2.1016, "step": 128 },
    { "epoch": 0.3207957724588126, "grad_norm": 0.4648686647415161, "learning_rate": 4.7030567685589515e-07, "loss": 2.073, "step": 129 },
    { "epoch": 0.3232825613926018, "grad_norm": 0.45214056968688965, "learning_rate": 4.6986899563318775e-07, "loss": 2.1608, "step": 130 },
    { "epoch": 0.32576935032639104, "grad_norm": 0.4569113850593567, "learning_rate": 4.6943231441048034e-07, "loss": 2.1456, "step": 131 },
    { "epoch": 0.32825613926018027, "grad_norm": 0.4468495547771454, "learning_rate": 4.6899563318777293e-07, "loss": 2.104, "step": 132 },
    { "epoch": 0.33074292819396955, "grad_norm": 0.4579126834869385, "learning_rate": 4.685589519650655e-07, "loss": 2.1415, "step": 133 },
    { "epoch": 0.3332297171277588, "grad_norm": 0.41133925318717957, "learning_rate": 4.6812227074235806e-07, "loss": 2.0556, "step": 134 },
    { "epoch": 0.335716506061548, "grad_norm": 0.45494189858436584, "learning_rate": 4.6768558951965065e-07, "loss": 2.1353, "step": 135 },
    { "epoch": 0.33820329499533724, "grad_norm": 0.45598360896110535, "learning_rate": 4.672489082969432e-07, "loss": 2.1608, "step": 136 },
    { "epoch": 0.34069008392912653, "grad_norm": 0.4804225564002991, "learning_rate": 4.668122270742358e-07, "loss": 2.125, "step": 137 },
    { "epoch": 0.34317687286291576, "grad_norm": 0.45764321088790894, "learning_rate": 4.6637554585152837e-07, "loss": 2.1547, "step": 138 },
    { "epoch": 0.345663661796705, "grad_norm": 0.469670832157135, "learning_rate": 4.659388646288209e-07, "loss": 2.1538, "step": 139 },
    { "epoch": 0.3481504507304943, "grad_norm": 0.47154125571250916, "learning_rate": 4.655021834061135e-07, "loss": 2.1144, "step": 140 },
    { "epoch": 0.3506372396642835, "grad_norm": 0.4631963074207306, "learning_rate": 4.6506550218340604e-07, "loss": 2.194, "step": 141 },
    { "epoch": 0.35312402859807274, "grad_norm": 0.4393676519393921, "learning_rate": 4.646288209606987e-07, "loss": 2.0945, "step": 142 },
    { "epoch": 0.35561081753186197, "grad_norm": 0.444672554731369, "learning_rate": 4.641921397379913e-07, "loss": 2.111, "step": 143 },
    { "epoch": 0.35809760646565125, "grad_norm": 0.46494928002357483, "learning_rate": 4.637554585152838e-07, "loss": 2.1777, "step": 144 },
    { "epoch": 0.3605843953994405, "grad_norm": 0.4453777074813843, "learning_rate": 4.633187772925764e-07, "loss": 2.1631, "step": 145 },
    { "epoch": 0.3630711843332297, "grad_norm": 0.4813487231731415, "learning_rate": 4.62882096069869e-07, "loss": 2.1595, "step": 146 },
    { "epoch": 0.36555797326701894, "grad_norm": 0.4742406904697418, "learning_rate": 4.6244541484716154e-07, "loss": 2.1576, "step": 147 },
    { "epoch": 0.36804476220080823, "grad_norm": 0.45844781398773193, "learning_rate": 4.6200873362445413e-07, "loss": 2.1756, "step": 148 },
    { "epoch": 0.37053155113459746, "grad_norm": 0.44323110580444336, "learning_rate": 4.6157205240174667e-07, "loss": 2.1483, "step": 149 },
    { "epoch": 0.3730183400683867, "grad_norm": 0.49860548973083496, "learning_rate": 4.6113537117903926e-07, "loss": 2.2144, "step": 150 },
    { "epoch": 0.3755051290021759, "grad_norm": 0.43856751918792725, "learning_rate": 4.6069868995633185e-07, "loss": 2.0581, "step": 151 },
    { "epoch": 0.3779919179359652, "grad_norm": 0.4472333788871765, "learning_rate": 4.602620087336244e-07, "loss": 2.117, "step": 152 },
    { "epoch": 0.38047870686975443, "grad_norm": 0.4927634298801422, "learning_rate": 4.5982532751091704e-07, "loss": 2.153, "step": 153 },
    { "epoch": 0.38296549580354367, "grad_norm": 0.4599962532520294, "learning_rate": 4.5938864628820963e-07, "loss": 2.2226, "step": 154 },
    { "epoch": 0.3854522847373329, "grad_norm": 0.45448487997055054, "learning_rate": 4.5895196506550217e-07, "loss": 2.1189, "step": 155 },
    { "epoch": 0.3879390736711222, "grad_norm": 0.4686853587627411, "learning_rate": 4.5851528384279476e-07, "loss": 2.1257, "step": 156 },
    { "epoch": 0.3904258626049114, "grad_norm": 0.5059552192687988, "learning_rate": 4.580786026200873e-07, "loss": 2.1586, "step": 157 },
    { "epoch": 0.39291265153870064, "grad_norm": 0.4529350996017456, "learning_rate": 4.576419213973799e-07, "loss": 2.1436, "step": 158 },
    { "epoch": 0.39539944047248987, "grad_norm": 0.44359931349754333, "learning_rate": 4.572052401746725e-07, "loss": 2.1086, "step": 159 },
    { "epoch": 0.39788622940627916, "grad_norm": 0.4643580913543701, "learning_rate": 4.56768558951965e-07, "loss": 2.1566, "step": 160 },
    { "epoch": 0.4003730183400684, "grad_norm": 0.47713202238082886, "learning_rate": 4.563318777292576e-07, "loss": 2.1494, "step": 161 },
    { "epoch": 0.4028598072738576, "grad_norm": 0.4480564296245575, "learning_rate": 4.5589519650655015e-07, "loss": 2.1188, "step": 162 },
    { "epoch": 0.4053465962076469, "grad_norm": 0.4450179636478424, "learning_rate": 4.554585152838428e-07, "loss": 2.1035, "step": 163 },
    { "epoch": 0.40783338514143613, "grad_norm": 0.4772661328315735, "learning_rate": 4.550218340611354e-07, "loss": 2.1598, "step": 164 },
    { "epoch": 0.41032017407522536, "grad_norm": 0.47646352648735046, "learning_rate": 4.545851528384279e-07, "loss": 2.1995, "step": 165 },
    { "epoch": 0.4128069630090146, "grad_norm": 0.4821939468383789, "learning_rate": 4.541484716157205e-07, "loss": 2.1441, "step": 166 },
    { "epoch": 0.4152937519428039, "grad_norm": 0.4545115828514099, "learning_rate": 4.537117903930131e-07, "loss": 2.1598, "step": 167 },
    { "epoch": 0.4177805408765931, "grad_norm": 0.45019111037254333, "learning_rate": 4.5327510917030565e-07, "loss": 2.1148, "step": 168 },
    { "epoch": 0.42026732981038234, "grad_norm": 0.4689873456954956, "learning_rate": 4.5283842794759824e-07, "loss": 2.1509, "step": 169 },
    { "epoch": 0.42275411874417157, "grad_norm": 0.47073739767074585, "learning_rate": 4.524017467248908e-07, "loss": 2.1425, "step": 170 },
    { "epoch": 0.42524090767796086, "grad_norm": 0.4607613980770111, "learning_rate": 4.5196506550218337e-07, "loss": 2.1226, "step": 171 },
    { "epoch": 0.4277276966117501, "grad_norm": 0.46717172861099243, "learning_rate": 4.5152838427947596e-07, "loss": 2.1222, "step": 172 },
    { "epoch": 0.4302144855455393, "grad_norm": 0.5047352313995361, "learning_rate": 4.510917030567685e-07, "loss": 2.2121, "step": 173 },
    { "epoch": 0.43270127447932855, "grad_norm": 0.49366188049316406, "learning_rate": 4.5065502183406115e-07, "loss": 2.1806, "step": 174 },
    { "epoch": 0.43518806341311783, "grad_norm": 0.4741223454475403, "learning_rate": 4.502183406113537e-07, "loss": 2.0808, "step": 175 },
    { "epoch": 0.43767485234690706, "grad_norm": 0.4672994911670685, "learning_rate": 4.497816593886463e-07, "loss": 2.1021, "step": 176 },
    { "epoch": 0.4401616412806963, "grad_norm": 0.4813832640647888, "learning_rate": 4.4934497816593887e-07, "loss": 2.139, "step": 177 },
    { "epoch": 0.4426484302144855, "grad_norm": 0.4757406413555145, "learning_rate": 4.489082969432314e-07, "loss": 2.1245, "step": 178 },
    { "epoch": 0.4451352191482748, "grad_norm": 0.4674074351787567, "learning_rate": 4.48471615720524e-07, "loss": 2.1619, "step": 179 },
    { "epoch": 0.44762200808206404, "grad_norm": 0.4354044795036316, "learning_rate": 4.480349344978166e-07, "loss": 2.0624, "step": 180 },
    { "epoch": 0.45010879701585327, "grad_norm": 0.4484567940235138, "learning_rate": 4.4759825327510913e-07, "loss": 2.1071, "step": 181 },
    { "epoch": 0.4525955859496425, "grad_norm": 0.4580535292625427, "learning_rate": 4.471615720524017e-07, "loss": 2.1039, "step": 182 },
    { "epoch": 0.4550823748834318, "grad_norm": 0.504393994808197, "learning_rate": 4.4672489082969426e-07, "loss": 2.1172, "step": 183 },
    { "epoch": 0.457569163817221, "grad_norm": 0.4727741777896881, "learning_rate": 4.4628820960698685e-07, "loss": 2.1343, "step": 184 },
    { "epoch": 0.46005595275101024, "grad_norm": 0.4549051523208618, "learning_rate": 4.458515283842795e-07, "loss": 2.1586, "step": 185 },
    { "epoch": 0.46254274168479953, "grad_norm": 0.4877924919128418, "learning_rate": 4.4541484716157203e-07, "loss": 2.2136, "step": 186 },
    { "epoch": 0.46502953061858876, "grad_norm": 0.47328630089759827, "learning_rate": 4.449781659388646e-07, "loss": 2.1065, "step": 187 },
    { "epoch": 0.467516319552378, "grad_norm": 0.46814873814582825, "learning_rate": 4.445414847161572e-07, "loss": 2.0879, "step": 188 },
    { "epoch": 0.4700031084861672, "grad_norm": 0.46883970499038696, "learning_rate": 4.4410480349344976e-07, "loss": 2.1495, "step": 189 },
    { "epoch": 0.4724898974199565, "grad_norm": 0.5020297169685364, "learning_rate": 4.4366812227074235e-07, "loss": 2.161, "step": 190 },
    { "epoch": 0.47497668635374574, "grad_norm": 0.47197675704956055, "learning_rate": 4.432314410480349e-07, "loss": 2.1354, "step": 191 },
    { "epoch": 0.47746347528753497, "grad_norm": 0.47488582134246826, "learning_rate": 4.427947598253275e-07, "loss": 2.0913, "step": 192 },
    { "epoch": 0.4799502642213242, "grad_norm": 0.49505242705345154, "learning_rate": 4.4235807860262007e-07, "loss": 2.1499, "step": 193 },
    { "epoch": 0.4824370531551135, "grad_norm": 0.48239609599113464, "learning_rate": 4.419213973799126e-07, "loss": 2.1432, "step": 194 },
    { "epoch": 0.4849238420889027, "grad_norm": 0.46357694268226624, "learning_rate": 4.4148471615720525e-07, "loss": 2.1352, "step": 195 },
    { "epoch": 0.48741063102269194, "grad_norm": 0.4855436086654663, "learning_rate": 4.410480349344978e-07, "loss": 2.0876, "step": 196 },
    { "epoch": 0.4898974199564812, "grad_norm": 0.5182051658630371, "learning_rate": 4.406113537117904e-07, "loss": 2.1872, "step": 197 },
    { "epoch": 0.49238420889027046, "grad_norm": 0.4700855016708374, "learning_rate": 4.40174672489083e-07, "loss": 2.0742, "step": 198 },
    { "epoch": 0.4948709978240597, "grad_norm": 0.4741506278514862, "learning_rate": 4.397379912663755e-07, "loss": 2.0981, "step": 199 },
    { "epoch": 0.4973577867578489, "grad_norm": 0.4737417697906494, "learning_rate": 4.393013100436681e-07, "loss": 2.1206, "step": 200 },
    { "epoch": 0.49984457569163815, "grad_norm": 0.4860036075115204, "learning_rate": 4.388646288209607e-07, "loss": 2.0539, "step": 201 },
    { "epoch": 0.5023313646254274, "grad_norm": 0.4895828068256378, "learning_rate": 4.3842794759825324e-07, "loss": 2.2017, "step": 202 },
    { "epoch": 0.5048181535592167, "grad_norm": 0.5107592940330505, "learning_rate": 4.3799126637554583e-07, "loss": 2.2033, "step": 203 },
    { "epoch": 0.507304942493006, "grad_norm": 0.49359220266342163, "learning_rate": 4.3755458515283837e-07, "loss": 2.1245, "step": 204 },
    { "epoch": 0.5097917314267951, "grad_norm": 0.43286237120628357, "learning_rate": 4.3711790393013096e-07, "loss": 2.1112, "step": 205 },
    { "epoch": 0.5122785203605844, "grad_norm": 0.4412092864513397, "learning_rate": 4.366812227074236e-07, "loss": 2.032, "step": 206 },
    { "epoch": 0.5147653092943736, "grad_norm": 0.5042114853858948, "learning_rate": 4.3624454148471614e-07, "loss": 2.1303, "step": 207 },
    { "epoch": 0.5172520982281629, "grad_norm": 0.4746697247028351, "learning_rate": 4.3580786026200873e-07, "loss": 2.1337, "step": 208 },
    { "epoch": 0.5197388871619522, "grad_norm": 0.4542432427406311, "learning_rate": 4.353711790393013e-07, "loss": 2.1181, "step": 209 },
    { "epoch": 0.5222256760957413, "grad_norm": 0.5013236403465271, "learning_rate": 4.3493449781659386e-07, "loss": 2.1283, "step": 210 },
    { "epoch": 0.5247124650295306, "grad_norm": 0.44694000482559204, "learning_rate": 4.3449781659388646e-07, "loss": 2.1317, "step": 211 },
    { "epoch": 0.5271992539633199, "grad_norm": 0.5043014287948608, "learning_rate": 4.34061135371179e-07, "loss": 2.1751, "step": 212 },
    { "epoch": 0.5296860428971091, "grad_norm": 0.45841050148010254, "learning_rate": 4.336244541484716e-07, "loss": 2.1164, "step": 213 },
    { "epoch": 0.5321728318308984, "grad_norm": 0.4801214635372162, "learning_rate": 4.331877729257642e-07, "loss": 2.0855, "step": 214 },
    { "epoch": 0.5346596207646876, "grad_norm": 0.5102494955062866, "learning_rate": 4.327510917030567e-07, "loss": 2.1819, "step": 215 },
    { "epoch": 0.5371464096984768, "grad_norm": 0.5200817584991455, "learning_rate": 4.323144104803493e-07, "loss": 2.175, "step": 216 },
    { "epoch": 0.5396331986322661, "grad_norm": 0.49951592087745667, "learning_rate": 4.318777292576419e-07, "loss": 2.1599, "step": 217 },
    { "epoch": 0.5421199875660553, "grad_norm": 0.5104175209999084, "learning_rate": 4.314410480349345e-07, "loss": 2.1793, "step": 218 },
    { "epoch": 0.5446067764998446, "grad_norm": 0.49184200167655945, "learning_rate": 4.310043668122271e-07, "loss": 2.1548, "step": 219 },
    { "epoch": 0.5470935654336339, "grad_norm": 0.48827120661735535, "learning_rate": 4.305676855895196e-07, "loss": 2.1406, "step": 220 },
    { "epoch": 0.549580354367423, "grad_norm": 0.47386690974235535, "learning_rate": 4.301310043668122e-07, "loss": 2.1014, "step": 221 },
    { "epoch": 0.5520671433012123, "grad_norm": 0.47124195098876953, "learning_rate": 4.296943231441048e-07, "loss": 2.086, "step": 222 },
    { "epoch": 0.5545539322350016, "grad_norm": 0.5211581587791443, "learning_rate": 4.2925764192139734e-07, "loss": 2.0998, "step": 223 },
    { "epoch": 0.5570407211687908, "grad_norm": 0.4680314064025879, "learning_rate": 4.2882096069868994e-07, "loss": 2.0983, "step": 224 },
    { "epoch": 0.5595275101025801, "grad_norm": 0.4839833080768585, "learning_rate": 4.283842794759825e-07, "loss": 2.1144, "step": 225 },
    { "epoch": 0.5620142990363692, "grad_norm": 0.4539274275302887, "learning_rate": 4.2794759825327507e-07, "loss": 2.0693, "step": 226 },
    { "epoch": 0.5645010879701585, "grad_norm": 0.4671003818511963, "learning_rate": 4.275109170305677e-07, "loss": 2.0779, "step": 227 },
    { "epoch": 0.5669878769039478, "grad_norm": 0.5179879069328308, "learning_rate": 4.2707423580786025e-07, "loss": 2.1674, "step": 228 },
    { "epoch": 0.569474665837737, "grad_norm": 0.4587318003177643, "learning_rate": 4.2663755458515284e-07, "loss": 2.1075, "step": 229 },
    { "epoch": 0.5719614547715263, "grad_norm": 0.5107843279838562, "learning_rate": 4.262008733624454e-07, "loss": 2.1966, "step": 230 },
    { "epoch": 0.5744482437053156, "grad_norm": 0.48125070333480835, "learning_rate": 4.2576419213973797e-07, "loss": 2.111, "step": 231 },
    { "epoch": 0.5769350326391047, "grad_norm": 0.5291087031364441, "learning_rate": 4.2532751091703056e-07, "loss": 2.1401, "step": 232 },
    { "epoch": 0.579421821572894, "grad_norm": 0.5241518020629883, "learning_rate": 4.248908296943231e-07, "loss": 2.1662, "step": 233 },
    { "epoch": 0.5819086105066833, "grad_norm": 0.5210862755775452, "learning_rate": 4.244541484716157e-07, "loss": 2.2149, "step": 234 },
    { "epoch": 0.5843953994404725, "grad_norm": 0.5254886150360107, "learning_rate": 4.240174672489083e-07, "loss": 2.2019, "step": 235 },
    { "epoch": 0.5868821883742618, "grad_norm": 0.49172264337539673, "learning_rate": 4.235807860262008e-07, "loss": 2.1671, "step": 236 },
    { "epoch": 0.5893689773080509, "grad_norm": 0.47738420963287354, "learning_rate": 4.231441048034934e-07, "loss": 2.0794, "step": 237 },
    { "epoch": 0.5918557662418402, "grad_norm": 0.5100018978118896, "learning_rate": 4.22707423580786e-07, "loss": 2.1181, "step": 238 },
    { "epoch": 0.5943425551756295, "grad_norm": 0.5403950810432434, "learning_rate": 4.222707423580786e-07, "loss": 2.1559, "step": 239 },
    { "epoch": 0.5968293441094187, "grad_norm": 0.5101498961448669, "learning_rate": 4.218340611353712e-07, "loss": 2.2091, "step": 240 },
    { "epoch": 0.599316133043208, "grad_norm": 0.5162122845649719, "learning_rate": 4.2139737991266373e-07, "loss": 2.1471, "step": 241 },
    { "epoch": 0.6018029219769973, "grad_norm": 0.5074677467346191, "learning_rate": 4.209606986899563e-07, "loss": 2.0849, "step": 242 },
    { "epoch": 0.6042897109107864, "grad_norm": 0.5043840408325195, "learning_rate": 4.205240174672489e-07, "loss": 2.109, "step": 243 },
    { "epoch": 0.6067764998445757, "grad_norm": 0.49023503065109253, "learning_rate": 4.2008733624454145e-07, "loss": 2.157, "step": 244 },
    { "epoch": 0.6092632887783649, "grad_norm": 0.5031821131706238, "learning_rate": 4.1965065502183404e-07, "loss": 2.1597, "step": 245 },
    { "epoch": 0.6117500777121542, "grad_norm": 0.5147417783737183, "learning_rate": 4.192139737991266e-07, "loss": 2.1502, "step": 246 },
    { "epoch": 0.6142368666459435, "grad_norm": 0.5135524272918701, "learning_rate": 4.187772925764192e-07, "loss": 2.1204, "step": 247 },
    { "epoch": 0.6167236555797326, "grad_norm": 0.516242265701294, "learning_rate": 4.1834061135371177e-07, "loss": 2.1121, "step": 248 },
    { "epoch": 0.6192104445135219, "grad_norm": 0.5270472764968872, "learning_rate": 4.1790393013100436e-07, "loss": 2.1296, "step": 249 },
    { "epoch": 0.6216972334473112, "grad_norm": 0.5023481249809265, "learning_rate": 4.1746724890829695e-07, "loss": 2.1138, "step": 250 },
    { "epoch": 0.6241840223811004, "grad_norm": 0.5072234869003296, "learning_rate": 4.170305676855895e-07, "loss": 2.1852, "step": 251 },
    { "epoch": 0.6266708113148897, "grad_norm": 0.5261276364326477, "learning_rate": 4.165938864628821e-07, "loss": 2.1707, "step": 252 },
    { "epoch": 0.6291576002486788, "grad_norm": 0.524861216545105, "learning_rate": 4.1615720524017467e-07, "loss": 2.1885, "step": 253 },
    { "epoch": 0.6316443891824681, "grad_norm": 0.5122174620628357, "learning_rate": 4.157205240174672e-07, "loss": 2.1095, "step": 254 },
    { "epoch": 0.6341311781162574, "grad_norm": 0.4976103603839874, "learning_rate": 4.152838427947598e-07, "loss": 2.1686, "step": 255 },
    { "epoch": 0.6366179670500466, "grad_norm": 0.48984527587890625, "learning_rate": 4.148471615720524e-07, "loss": 2.0661, "step": 256 },
    { "epoch": 0.6391047559838359, "grad_norm": 0.5184794664382935, "learning_rate": 4.1441048034934493e-07, "loss": 2.1323, "step": 257 },
    { "epoch": 0.6415915449176252, "grad_norm": 0.4991200566291809, "learning_rate": 4.139737991266375e-07, "loss": 2.1294, "step": 258 },
    { "epoch": 0.6440783338514143, "grad_norm": 0.5302152037620544, "learning_rate": 4.1353711790393006e-07, "loss": 2.0922, "step": 259 },
    { "epoch": 0.6465651227852036, "grad_norm": 0.5143322348594666, "learning_rate": 4.131004366812227e-07, "loss": 2.1694, "step": 260 },
    { "epoch": 0.6490519117189929, "grad_norm": 0.5043548941612244, "learning_rate": 4.126637554585153e-07, "loss": 2.0899, "step": 261 },
    { "epoch": 0.6515387006527821, "grad_norm": 0.5160046815872192, "learning_rate": 4.1222707423580784e-07, "loss": 2.1185, "step": 262 },
    { "epoch": 0.6540254895865714, "grad_norm": 0.5054792761802673, "learning_rate": 4.1179039301310043e-07, "loss": 2.1503, "step": 263 },
    { "epoch": 0.6565122785203605, "grad_norm": 0.5056222677230835, "learning_rate": 4.11353711790393e-07, "loss": 2.1089, "step": 264 },
    { "epoch": 0.6589990674541498, "grad_norm": 0.5285047292709351, "learning_rate": 4.1091703056768556e-07, "loss": 2.1556, "step": 265 },
    { "epoch": 0.6614858563879391, "grad_norm": 0.48898041248321533, "learning_rate": 4.1048034934497815e-07, "loss": 2.1107, "step": 266 },
    { "epoch": 0.6639726453217283, "grad_norm": 0.525590717792511, "learning_rate": 4.100436681222707e-07, "loss": 2.1374, "step": 267 },
    { "epoch": 0.6664594342555176, "grad_norm": 0.5363737344741821, "learning_rate": 4.096069868995633e-07, "loss": 2.1198, "step": 268 },
    { "epoch": 0.6689462231893069, "grad_norm": 0.5182633399963379, "learning_rate": 4.091703056768559e-07, "loss": 2.126, "step": 269 },
    { "epoch": 0.671433012123096, "grad_norm": 0.4978923201560974, "learning_rate": 4.0873362445414847e-07, "loss": 2.0764, "step": 270 },
    { "epoch": 0.6739198010568853, "grad_norm": 0.5094720125198364, "learning_rate": 4.0829694323144106e-07, "loss": 2.145, "step": 271 },
    { "epoch": 0.6764065899906745, "grad_norm": 0.540023148059845, "learning_rate": 4.078602620087336e-07, "loss": 2.1327, "step": 272 },
    { "epoch": 0.6788933789244638, "grad_norm": 0.5420276522636414, "learning_rate": 4.074235807860262e-07, "loss": 2.1707, "step": 273 },
    { "epoch": 0.6813801678582531, "grad_norm": 0.5282043218612671, "learning_rate": 4.069868995633188e-07, "loss": 2.1473, "step": 274 },
    { "epoch": 0.6838669567920422, "grad_norm": 0.5049037933349609, "learning_rate": 4.065502183406113e-07, "loss": 2.1335, "step": 275 },
    { "epoch": 0.6863537457258315, "grad_norm": 0.5107303261756897, "learning_rate": 4.061135371179039e-07, "loss": 2.1349, "step": 276 },
    { "epoch": 0.6888405346596208, "grad_norm": 0.4959608018398285, "learning_rate": 4.056768558951965e-07, "loss": 2.1044, "step": 277 },
    { "epoch": 0.69132732359341, "grad_norm": 0.5125852227210999, "learning_rate": 4.0524017467248904e-07, "loss": 2.1428, "step": 278 },
    { "epoch": 0.6938141125271993, "grad_norm": 0.511873185634613, "learning_rate": 4.0480349344978163e-07, "loss": 2.0763, "step": 279 },
    { "epoch": 0.6963009014609886, "grad_norm": 0.5032888054847717, "learning_rate": 4.0436681222707417e-07, "loss": 2.122, "step": 280 },
    { "epoch": 0.6987876903947777, "grad_norm": 0.5102598667144775, "learning_rate": 4.039301310043668e-07, "loss": 2.0693, "step": 281 },
    { "epoch": 0.701274479328567, "grad_norm": 0.5118304491043091, "learning_rate": 4.034934497816594e-07, "loss": 2.11, "step": 282 },
    { "epoch": 0.7037612682623562, "grad_norm": 0.5202342867851257, "learning_rate": 4.0305676855895195e-07, "loss": 2.1582, "step": 283 },
    { "epoch": 0.7062480571961455, "grad_norm": 0.48433917760849, "learning_rate": 4.0262008733624454e-07, "loss": 2.103, "step": 284 },
    { "epoch": 0.7087348461299348, "grad_norm": 0.4986036717891693, "learning_rate": 4.0218340611353713e-07, "loss": 2.1203, "step": 285 },
    { "epoch": 0.7112216350637239, "grad_norm": 0.5467602014541626, "learning_rate": 4.0174672489082967e-07, "loss": 2.1204, "step": 286 },
    { "epoch": 0.7137084239975132, "grad_norm": 0.5108657479286194, "learning_rate": 4.0131004366812226e-07, "loss": 2.1478, "step": 287 },
    { "epoch": 0.7161952129313025, "grad_norm": 0.5145993232727051, "learning_rate": 4.008733624454148e-07, "loss": 2.1563, "step": 288 },
    { "epoch": 0.7186820018650917, "grad_norm": 0.5134692788124084, "learning_rate": 4.004366812227074e-07, "loss": 2.1067, "step": 289 },
    { "epoch": 0.721168790798881, "grad_norm": 0.5436774492263794, "learning_rate": 4e-07, "loss": 2.1369, "step": 290 },
    { "epoch": 0.7236555797326701, "grad_norm": 0.5296205282211304, "learning_rate": 3.995633187772925e-07, "loss": 2.1452, "step": 291 },
    { "epoch": 0.7261423686664594, "grad_norm": 0.4911108911037445, "learning_rate": 3.9912663755458517e-07, "loss": 2.1279, "step": 292 },
    { "epoch": 0.7286291576002487, "grad_norm": 0.5625902414321899, "learning_rate": 3.986899563318777e-07, "loss": 2.169, "step": 293 },
    { "epoch": 0.7311159465340379, "grad_norm": 0.5042857527732849, "learning_rate": 3.982532751091703e-07, "loss": 2.0692, "step": 294 },
    { "epoch": 0.7336027354678272, "grad_norm": 0.5251498222351074, "learning_rate": 3.978165938864629e-07, "loss": 2.1261, "step": 295 },
    { "epoch": 0.7360895244016165, "grad_norm": 0.5093502402305603, "learning_rate": 3.973799126637554e-07, "loss": 2.1175, "step": 296 },
    { "epoch": 0.7385763133354056, "grad_norm": 0.49675241112709045, "learning_rate": 3.96943231441048e-07, "loss": 2.1679, "step": 297 },
    { "epoch": 0.7410631022691949, "grad_norm": 0.523313045501709, "learning_rate": 3.965065502183406e-07, "loss": 2.1195, "step": 298 },
    { "epoch": 0.7435498912029841, "grad_norm": 0.5194100737571716, "learning_rate": 3.9606986899563315e-07, "loss": 2.1431, "step": 299 },
    { "epoch": 0.7460366801367734, "grad_norm": 0.5145063996315002, "learning_rate": 3.9563318777292574e-07, "loss": 2.0968, "step": 300 },
    { "epoch": 0.7485234690705627, "grad_norm": 0.5165944695472717, "learning_rate": 3.951965065502183e-07, "loss": 2.1316, "step": 301 },
    { "epoch": 0.7510102580043518, "grad_norm": 0.5502745509147644, "learning_rate": 3.947598253275109e-07, "loss": 2.1215, "step": 302 },
    { "epoch": 0.7534970469381411, "grad_norm": 0.5752532482147217, "learning_rate": 3.943231441048035e-07, "loss": 2.165, "step": 303 },
    { "epoch": 0.7559838358719304, "grad_norm": 0.5388760566711426, "learning_rate": 3.9388646288209605e-07, "loss": 2.136, "step": 304 },
    { "epoch": 0.7584706248057196, "grad_norm": 0.5256951451301575, "learning_rate": 3.9344978165938865e-07, "loss": 2.0978, "step": 305 },
    { "epoch": 0.7609574137395089, "grad_norm": 0.5247829556465149, "learning_rate": 3.930131004366812e-07, "loss": 2.1591, "step": 306 },
    { "epoch": 0.7634442026732982, "grad_norm": 0.508574366569519, "learning_rate": 3.925764192139738e-07, "loss": 2.1542, "step": 307 },
    { "epoch": 0.7659309916070873, "grad_norm": 0.49671751260757446, "learning_rate": 3.9213973799126637e-07, "loss": 2.1469, "step": 308 },
    { "epoch": 0.7684177805408766, "grad_norm": 0.5673956274986267, "learning_rate": 3.917030567685589e-07, "loss": 2.2209, "step": 309 },
    { "epoch": 0.7709045694746658, "grad_norm": 0.4881182610988617, "learning_rate": 3.912663755458515e-07, "loss": 2.1353, "step": 310 },
    { "epoch": 0.7733913584084551, "grad_norm": 0.5374391078948975, "learning_rate": 3.908296943231441e-07, "loss": 2.2102, "step": 311 },
    { "epoch": 0.7758781473422444, "grad_norm": 0.520723283290863, "learning_rate": 3.9039301310043663e-07, "loss": 2.1716, "step": 312 },
    { "epoch": 0.7783649362760335, "grad_norm": 0.5542478561401367, "learning_rate": 3.8995633187772927e-07, "loss": 2.2128, "step": 313 },
    { "epoch": 0.7808517252098228, "grad_norm": 0.5180374979972839, "learning_rate": 3.895196506550218e-07, "loss": 2.1245, "step": 314 },
    { "epoch": 0.7833385141436121, "grad_norm": 0.5454829931259155, "learning_rate": 3.890829694323144e-07, "loss": 2.1609, "step": 315 },
    { "epoch": 0.7858253030774013, "grad_norm": 0.568573534488678, "learning_rate": 3.88646288209607e-07, "loss": 2.175, "step": 316 },
    { "epoch": 0.7883120920111906, "grad_norm": 0.5162298679351807, "learning_rate": 3.8820960698689953e-07, "loss": 2.1474, "step": 317 },
    { "epoch": 0.7907988809449797, "grad_norm": 0.5148350596427917, "learning_rate": 3.877729257641921e-07, "loss": 2.0697, "step": 318 },
    { "epoch": 0.793285669878769, "grad_norm": 0.5210283994674683, "learning_rate": 3.873362445414847e-07, "loss": 2.1587, "step": 319 },
    { "epoch": 0.7957724588125583, "grad_norm": 0.4845898151397705, "learning_rate": 3.8689956331877726e-07, "loss": 2.0581, "step": 320 },
    { "epoch": 0.7982592477463475, "grad_norm": 0.5222198367118835, "learning_rate": 3.8646288209606985e-07, "loss": 2.0504, "step": 321 },
    { "epoch": 0.8007460366801368, "grad_norm": 0.5040515065193176, "learning_rate": 3.860262008733624e-07, "loss": 2.0898, "step": 322 },
    { "epoch": 0.8032328256139261, "grad_norm": 0.501930296421051, "learning_rate": 3.85589519650655e-07, "loss": 2.1335, "step": 323 },
    { "epoch": 0.8057196145477152, "grad_norm": 0.5378695726394653, "learning_rate": 3.851528384279476e-07, "loss": 2.0953, "step": 324 },
    { "epoch": 0.8082064034815045, "grad_norm": 0.49689194560050964, "learning_rate": 3.8471615720524016e-07, "loss": 2.0837, "step": 325 },
    { "epoch": 0.8106931924152938, "grad_norm": 0.5431040525436401, "learning_rate": 3.8427947598253275e-07, "loss": 2.1151, "step": 326 },
    { "epoch": 0.813179981349083, "grad_norm": 0.510339081287384, "learning_rate": 3.838427947598253e-07, "loss": 2.1399, "step": 327 },
    { "epoch": 0.8156667702828723, "grad_norm": 0.5451592206954956, "learning_rate": 3.834061135371179e-07, "loss": 2.1182, "step": 328 },
    { "epoch": 0.8181535592166614, "grad_norm": 0.5272311568260193, "learning_rate": 3.829694323144105e-07, "loss": 2.139, "step": 329 },
    { "epoch": 0.8206403481504507, "grad_norm": 0.5389718413352966, "learning_rate": 3.82532751091703e-07, "loss": 2.1086, "step": 330 },
    { "epoch": 0.82312713708424, "grad_norm": 0.5228806138038635, "learning_rate": 3.820960698689956e-07, "loss": 2.1435, "step": 331 },
    { "epoch": 0.8256139260180292, "grad_norm": 0.5186501741409302, "learning_rate": 3.816593886462882e-07, "loss": 2.095, "step": 332 },
    { "epoch": 0.8281007149518185, "grad_norm": 0.5484049320220947, "learning_rate": 3.8122270742358074e-07, "loss": 2.1666, "step": 333 },
    { "epoch": 0.8305875038856078, "grad_norm": 0.5375939607620239, "learning_rate": 3.807860262008734e-07, "loss": 2.125, "step": 334 },
    { "epoch": 0.8330742928193969, "grad_norm": 0.5139255523681641, "learning_rate": 3.803493449781659e-07, "loss": 2.075, "step": 335 },
    { "epoch": 0.8355610817531862, "grad_norm": 0.5235688090324402, "learning_rate": 3.799126637554585e-07, "loss": 2.0731, "step": 336 },
    { "epoch": 0.8380478706869754, "grad_norm": 0.5630027651786804, "learning_rate": 3.794759825327511e-07, "loss": 2.1261, "step": 337 },
    { "epoch": 0.8405346596207647, "grad_norm": 0.547572910785675, "learning_rate": 3.7903930131004364e-07, "loss": 2.1641, "step": 338 },
    { "epoch": 0.843021448554554, "grad_norm": 0.5533425211906433, "learning_rate": 3.7860262008733623e-07, "loss": 2.1293, "step": 339 },
    { "epoch": 0.8455082374883431, "grad_norm": 0.5256425738334656, "learning_rate": 3.781659388646288e-07, "loss": 2.1397, "step": 340 },
    { "epoch": 0.8479950264221324, "grad_norm": 0.5411325693130493, "learning_rate": 3.7772925764192136e-07, "loss": 2.1631, "step": 341 },
    { "epoch": 0.8504818153559217, "grad_norm": 0.5244682431221008, "learning_rate": 3.7729257641921396e-07, "loss": 2.1655, "step": 342 },
    { "epoch": 0.8529686042897109, "grad_norm": 0.5120859742164612, "learning_rate": 3.768558951965065e-07, "loss": 2.0929, "step": 343 },
    { "epoch": 0.8554553932235002, "grad_norm": 0.5486117601394653, "learning_rate": 3.764192139737991e-07, "loss": 2.1333, "step": 344 },
    { "epoch": 0.8579421821572893, "grad_norm": 0.5485012531280518, "learning_rate": 3.7598253275109173e-07, "loss": 2.1497, "step": 345 },
    { "epoch": 0.8604289710910786, "grad_norm": 0.5423093438148499, "learning_rate": 3.7554585152838427e-07, "loss": 2.2169, "step": 346 },
    { "epoch": 0.8629157600248679, "grad_norm": 0.5003622770309448, "learning_rate": 3.7510917030567686e-07, "loss": 2.1818, "step": 347 },
    { "epoch": 0.8654025489586571, "grad_norm": 0.5931081771850586, "learning_rate": 3.746724890829694e-07, "loss": 2.1631, "step": 348 },
    { "epoch": 0.8678893378924464, "grad_norm": 0.5221492052078247, "learning_rate": 3.74235807860262e-07, "loss": 2.1087, "step": 349 },
    { "epoch": 0.8703761268262357, "grad_norm": 0.5065641403198242, "learning_rate": 3.737991266375546e-07, "loss": 2.0622, "step": 350 },
    { "epoch": 0.8728629157600248, "grad_norm": 0.5329532027244568, "learning_rate": 3.733624454148471e-07, "loss": 2.1275, "step": 351 },
    { "epoch": 0.8753497046938141, "grad_norm": 0.5383079648017883, "learning_rate": 3.729257641921397e-07, "loss": 2.1342, "step": 352 },
    { "epoch": 0.8778364936276034, "grad_norm": 0.49477216601371765, "learning_rate": 3.724890829694323e-07, "loss": 2.0404, "step": 353 },
    { "epoch": 0.8803232825613926, "grad_norm": 0.5197799205780029, "learning_rate": 3.7205240174672484e-07, "loss": 2.1228, "step": 354 },
    { "epoch": 0.8828100714951819, "grad_norm": 0.5122123956680298, "learning_rate": 3.7161572052401744e-07, "loss": 2.1329, "step": 355 },
    { "epoch": 0.885296860428971, "grad_norm": 0.5379232168197632, "learning_rate": 3.7117903930131003e-07, "loss": 2.0743, "step": 356 },
    { "epoch": 0.8877836493627603, "grad_norm": 0.5164668560028076, "learning_rate": 3.707423580786026e-07, "loss": 2.1474, "step": 357 },
    { "epoch": 0.8902704382965496, "grad_norm": 0.518368661403656, "learning_rate": 3.703056768558952e-07, "loss": 2.1987, "step": 358 },
    { "epoch": 0.8927572272303388, "grad_norm": 0.5662968754768372, "learning_rate": 3.6986899563318775e-07, "loss": 2.1301, "step": 359 },
    { "epoch": 0.8952440161641281, "grad_norm": 0.5161558389663696, "learning_rate": 3.6943231441048034e-07, "loss": 2.1033, "step": 360 },
    { "epoch": 0.8977308050979174, "grad_norm": 0.5516855120658875, "learning_rate": 3.6899563318777293e-07, "loss": 2.1003, "step": 361 },
    { "epoch": 0.9002175940317065, "grad_norm": 0.5291304588317871, "learning_rate": 3.6855895196506547e-07, "loss": 2.0533, "step": 362 },
    { "epoch": 0.9027043829654958, "grad_norm": 0.5586827397346497, "learning_rate": 3.6812227074235806e-07, "loss": 2.1052, "step": 363 },
    { "epoch": 0.905191171899285, "grad_norm": 0.5328514575958252, "learning_rate": 3.676855895196506e-07, "loss": 2.1548, "step": 364 },
    { "epoch": 0.9076779608330743, "grad_norm": 0.5259972810745239, "learning_rate": 3.672489082969432e-07, "loss": 2.1101, "step": 365 },
    { "epoch": 0.9101647497668636, "grad_norm": 0.5482295751571655, "learning_rate": 3.6681222707423584e-07, "loss": 2.1678, "step": 366 },
    { "epoch": 0.9126515387006527, "grad_norm": 0.5381218194961548, "learning_rate": 3.663755458515284e-07, "loss": 2.2098, "step": 367 },
    { "epoch": 0.915138327634442, "grad_norm": 0.5494764447212219, "learning_rate": 3.6593886462882097e-07
|
"loss": 2.1338, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.9176251165682313, |
|
"grad_norm": 0.5393621921539307, |
|
"learning_rate": 3.655021834061135e-07, |
|
"loss": 2.0952, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.9201119055020205, |
|
"grad_norm": 0.5395556092262268, |
|
"learning_rate": 3.650655021834061e-07, |
|
"loss": 2.1402, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.9225986944358098, |
|
"grad_norm": 0.5069707632064819, |
|
"learning_rate": 3.646288209606987e-07, |
|
"loss": 2.0925, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.9250854833695991, |
|
"grad_norm": 0.5580669641494751, |
|
"learning_rate": 3.6419213973799123e-07, |
|
"loss": 2.1585, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.9275722723033882, |
|
"grad_norm": 0.5407446026802063, |
|
"learning_rate": 3.637554585152838e-07, |
|
"loss": 2.1448, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.9300590612371775, |
|
"grad_norm": 0.5261268019676208, |
|
"learning_rate": 3.633187772925764e-07, |
|
"loss": 2.1687, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.9325458501709667, |
|
"grad_norm": 0.5728645920753479, |
|
"learning_rate": 3.6288209606986895e-07, |
|
"loss": 2.0929, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.935032639104756, |
|
"grad_norm": 0.536983072757721, |
|
"learning_rate": 3.6244541484716154e-07, |
|
"loss": 2.1669, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.9375194280385453, |
|
"grad_norm": 0.5492017269134521, |
|
"learning_rate": 3.6200873362445414e-07, |
|
"loss": 2.1449, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.9400062169723344, |
|
"grad_norm": 0.5745022296905518, |
|
"learning_rate": 3.6157205240174673e-07, |
|
"loss": 2.1315, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.9424930059061237, |
|
"grad_norm": 0.5852669477462769, |
|
"learning_rate": 3.611353711790393e-07, |
|
"loss": 2.1405, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.944979794839913, |
|
"grad_norm": 0.5169341564178467, |
|
"learning_rate": 3.6069868995633186e-07, |
|
"loss": 2.0788, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9474665837737022, |
|
"grad_norm": 0.5499164462089539, |
|
"learning_rate": 3.6026200873362445e-07, |
|
"loss": 2.1843, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.9499533727074915, |
|
"grad_norm": 0.5195809006690979, |
|
"learning_rate": 3.59825327510917e-07, |
|
"loss": 2.1045, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.9524401616412806, |
|
"grad_norm": 0.5368107557296753, |
|
"learning_rate": 3.593886462882096e-07, |
|
"loss": 2.1261, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.9549269505750699, |
|
"grad_norm": 0.5721762776374817, |
|
"learning_rate": 3.5895196506550217e-07, |
|
"loss": 2.1323, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.9574137395088592, |
|
"grad_norm": 0.5255040526390076, |
|
"learning_rate": 3.585152838427947e-07, |
|
"loss": 2.13, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.9599005284426484, |
|
"grad_norm": 0.5373786687850952, |
|
"learning_rate": 3.580786026200873e-07, |
|
"loss": 2.0763, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.9623873173764377, |
|
"grad_norm": 0.5432249307632446, |
|
"learning_rate": 3.576419213973799e-07, |
|
"loss": 2.1305, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.964874106310227, |
|
"grad_norm": 0.5505443811416626, |
|
"learning_rate": 3.572052401746725e-07, |
|
"loss": 2.1311, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.9673608952440161, |
|
"grad_norm": 0.5119839906692505, |
|
"learning_rate": 3.567685589519651e-07, |
|
"loss": 2.1121, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.9698476841778054, |
|
"grad_norm": 0.5414577126502991, |
|
"learning_rate": 3.563318777292576e-07, |
|
"loss": 2.1076, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.9723344731115946, |
|
"grad_norm": 0.5283794403076172, |
|
"learning_rate": 3.558951965065502e-07, |
|
"loss": 2.1293, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.9748212620453839, |
|
"grad_norm": 0.5475645065307617, |
|
"learning_rate": 3.554585152838428e-07, |
|
"loss": 2.1373, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.9773080509791732, |
|
"grad_norm": 0.5172975063323975, |
|
"learning_rate": 3.5502183406113534e-07, |
|
"loss": 2.1378, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.9797948399129623, |
|
"grad_norm": 0.5674493312835693, |
|
"learning_rate": 3.5458515283842793e-07, |
|
"loss": 2.0797, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.9822816288467516, |
|
"grad_norm": 0.510979950428009, |
|
"learning_rate": 3.541484716157205e-07, |
|
"loss": 2.091, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.9847684177805409, |
|
"grad_norm": 0.5517850518226624, |
|
"learning_rate": 3.5371179039301306e-07, |
|
"loss": 2.1703, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.9872552067143301, |
|
"grad_norm": 0.5487313270568848, |
|
"learning_rate": 3.5327510917030565e-07, |
|
"loss": 2.1213, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.9897419956481194, |
|
"grad_norm": 0.5256079435348511, |
|
"learning_rate": 3.528384279475982e-07, |
|
"loss": 2.1052, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.9922287845819087, |
|
"grad_norm": 0.5553068518638611, |
|
"learning_rate": 3.5240174672489084e-07, |
|
"loss": 2.0818, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.9947155735156978, |
|
"grad_norm": 0.5434982180595398, |
|
"learning_rate": 3.5196506550218343e-07, |
|
"loss": 2.0612, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.9972023624494871, |
|
"grad_norm": 0.5237376689910889, |
|
"learning_rate": 3.5152838427947597e-07, |
|
"loss": 2.1489, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.9996891513832763, |
|
"grad_norm": 0.5455615520477295, |
|
"learning_rate": 3.5109170305676856e-07, |
|
"loss": 2.0467, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.9996891513832763, |
|
"eval_loss": 2.129138708114624, |
|
"eval_runtime": 458.2461, |
|
"eval_samples_per_second": 1.065, |
|
"eval_steps_per_second": 0.266, |
|
"step": 402 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 1206, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.1872324383890473e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |