{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0006379585326954,
  "eval_steps": 294,
  "global_step": 1176,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0008506113769271664,
      "grad_norm": 0.7357208728790283,
      "learning_rate": 2e-05,
      "loss": 2.8145,
      "step": 1
    },
    {
      "epoch": 0.0017012227538543328,
      "grad_norm": 0.771299421787262,
      "learning_rate": 4e-05,
      "loss": 3.195,
      "step": 2
    },
    {
      "epoch": 0.002551834130781499,
      "grad_norm": 0.7344720363616943,
      "learning_rate": 6e-05,
      "loss": 2.8861,
      "step": 3
    },
    {
      "epoch": 0.0034024455077086655,
      "grad_norm": 0.7500324845314026,
      "learning_rate": 8e-05,
      "loss": 2.7421,
      "step": 4
    },
    {
      "epoch": 0.004253056884635832,
      "grad_norm": 0.9078495502471924,
      "learning_rate": 0.0001,
      "loss": 2.9622,
      "step": 5
    },
    {
      "epoch": 0.005103668261562998,
      "grad_norm": 1.0794708728790283,
      "learning_rate": 0.00012,
      "loss": 3.1124,
      "step": 6
    },
    {
      "epoch": 0.005954279638490165,
      "grad_norm": 1.0218361616134644,
      "learning_rate": 0.00014,
      "loss": 2.7233,
      "step": 7
    },
    {
      "epoch": 0.006804891015417331,
      "grad_norm": 1.059141755104065,
      "learning_rate": 0.00016,
      "loss": 2.8784,
      "step": 8
    },
    {
      "epoch": 0.007655502392344498,
      "grad_norm": 0.4901650547981262,
      "learning_rate": 0.00018,
      "loss": 2.6192,
      "step": 9
    },
    {
      "epoch": 0.008506113769271665,
      "grad_norm": 0.8344933390617371,
      "learning_rate": 0.0002,
      "loss": 2.6448,
      "step": 10
    },
    {
      "epoch": 0.00935672514619883,
      "grad_norm": 1.5278894901275635,
      "learning_rate": 0.00019999963702861705,
      "loss": 2.7457,
      "step": 11
    },
    {
      "epoch": 0.010207336523125997,
      "grad_norm": 1.2650033235549927,
      "learning_rate": 0.00019999854811710317,
      "loss": 2.7532,
      "step": 12
    },
    {
      "epoch": 0.011057947900053162,
      "grad_norm": 0.740222156047821,
      "learning_rate": 0.0001999967332733632,
      "loss": 2.6836,
      "step": 13
    },
    {
      "epoch": 0.01190855927698033,
      "grad_norm": 0.49257639050483704,
      "learning_rate": 0.0001999941925105719,
      "loss": 2.6658,
      "step": 14
    },
    {
      "epoch": 0.012759170653907496,
      "grad_norm": 0.3310573399066925,
      "learning_rate": 0.00019999092584717374,
      "loss": 2.5043,
      "step": 15
    },
    {
      "epoch": 0.013609782030834662,
      "grad_norm": 0.33361560106277466,
      "learning_rate": 0.00019998693330688282,
      "loss": 2.6252,
      "step": 16
    },
    {
      "epoch": 0.014460393407761828,
      "grad_norm": 0.4449865221977234,
      "learning_rate": 0.00019998221491868273,
      "loss": 2.648,
      "step": 17
    },
    {
      "epoch": 0.015311004784688996,
      "grad_norm": 0.4820970892906189,
      "learning_rate": 0.0001999767707168262,
      "loss": 2.7337,
      "step": 18
    },
    {
      "epoch": 0.01616161616161616,
      "grad_norm": 0.5144203901290894,
      "learning_rate": 0.0001999706007408351,
      "loss": 2.6967,
      "step": 19
    },
    {
      "epoch": 0.01701222753854333,
      "grad_norm": 0.501557469367981,
      "learning_rate": 0.0001999637050354999,
      "loss": 2.7318,
      "step": 20
    },
    {
      "epoch": 0.017862838915470493,
      "grad_norm": 0.4480394423007965,
      "learning_rate": 0.00019995608365087946,
      "loss": 2.4126,
      "step": 21
    },
    {
      "epoch": 0.01871345029239766,
      "grad_norm": 0.4459284842014313,
      "learning_rate": 0.00019994773664230064,
      "loss": 2.7072,
      "step": 22
    },
    {
      "epoch": 0.01956406166932483,
      "grad_norm": 0.39909827709198,
      "learning_rate": 0.00019993866407035798,
      "loss": 2.6358,
      "step": 23
    },
    {
      "epoch": 0.020414673046251993,
      "grad_norm": 0.36802783608436584,
      "learning_rate": 0.0001999288660009132,
      "loss": 2.6751,
      "step": 24
    },
    {
      "epoch": 0.02126528442317916,
      "grad_norm": 0.43287962675094604,
      "learning_rate": 0.0001999183425050946,
      "loss": 2.7518,
      "step": 25
    },
    {
      "epoch": 0.022115895800106325,
      "grad_norm": 0.4289425313472748,
      "learning_rate": 0.00019990709365929677,
      "loss": 2.7535,
      "step": 26
    },
    {
      "epoch": 0.022966507177033493,
      "grad_norm": 0.4627043604850769,
      "learning_rate": 0.00019989511954517992,
      "loss": 2.8111,
      "step": 27
    },
    {
      "epoch": 0.02381711855396066,
      "grad_norm": 0.4823961853981018,
      "learning_rate": 0.00019988242024966923,
      "loss": 2.9493,
      "step": 28
    },
    {
      "epoch": 0.024667729930887825,
      "grad_norm": 0.4622437059879303,
      "learning_rate": 0.00019986899586495432,
      "loss": 2.788,
      "step": 29
    },
    {
      "epoch": 0.025518341307814992,
      "grad_norm": 0.4963669776916504,
      "learning_rate": 0.00019985484648848853,
      "loss": 2.8304,
      "step": 30
    },
    {
      "epoch": 0.02636895268474216,
      "grad_norm": 0.47957557439804077,
      "learning_rate": 0.00019983997222298828,
      "loss": 2.7323,
      "step": 31
    },
    {
      "epoch": 0.027219564061669324,
      "grad_norm": 0.445528507232666,
      "learning_rate": 0.00019982437317643217,
      "loss": 3.015,
      "step": 32
    },
    {
      "epoch": 0.028070175438596492,
      "grad_norm": 0.46085312962532043,
      "learning_rate": 0.00019980804946206036,
      "loss": 2.8556,
      "step": 33
    },
    {
      "epoch": 0.028920786815523656,
      "grad_norm": 0.5078282356262207,
      "learning_rate": 0.0001997910011983737,
      "loss": 2.8472,
      "step": 34
    },
    {
      "epoch": 0.029771398192450824,
      "grad_norm": 0.4612430930137634,
      "learning_rate": 0.00019977322850913283,
      "loss": 2.6399,
      "step": 35
    },
    {
      "epoch": 0.03062200956937799,
      "grad_norm": 0.499965101480484,
      "learning_rate": 0.00019975473152335726,
      "loss": 2.9121,
      "step": 36
    },
    {
      "epoch": 0.03147262094630516,
      "grad_norm": 0.5101069808006287,
      "learning_rate": 0.0001997355103753246,
      "loss": 2.8488,
      "step": 37
    },
    {
      "epoch": 0.03232323232323232,
      "grad_norm": 0.5065872669219971,
      "learning_rate": 0.00019971556520456929,
      "loss": 2.8311,
      "step": 38
    },
    {
      "epoch": 0.03317384370015949,
      "grad_norm": 0.5324426889419556,
      "learning_rate": 0.00019969489615588189,
      "loss": 2.7454,
      "step": 39
    },
    {
      "epoch": 0.03402445507708666,
      "grad_norm": 0.5128815770149231,
      "learning_rate": 0.0001996735033793079,
      "loss": 2.8116,
      "step": 40
    },
    {
      "epoch": 0.03487506645401382,
      "grad_norm": 0.5330538153648376,
      "learning_rate": 0.00019965138703014655,
      "loss": 2.7584,
      "step": 41
    },
    {
      "epoch": 0.03572567783094099,
      "grad_norm": 0.556816577911377,
      "learning_rate": 0.00019962854726894997,
      "loss": 2.8902,
      "step": 42
    },
    {
      "epoch": 0.03657628920786816,
      "grad_norm": 0.5452866554260254,
      "learning_rate": 0.0001996049842615217,
      "loss": 2.7984,
      "step": 43
    },
    {
      "epoch": 0.03742690058479532,
      "grad_norm": 0.5836021304130554,
      "learning_rate": 0.0001995806981789157,
      "loss": 2.803,
      "step": 44
    },
    {
      "epoch": 0.03827751196172249,
      "grad_norm": 0.5968561172485352,
      "learning_rate": 0.00019955568919743507,
      "loss": 2.8592,
      "step": 45
    },
    {
      "epoch": 0.03912812333864966,
      "grad_norm": 0.6416970491409302,
      "learning_rate": 0.0001995299574986306,
      "loss": 2.7488,
      "step": 46
    },
    {
      "epoch": 0.03997873471557682,
      "grad_norm": 0.704325795173645,
      "learning_rate": 0.0001995035032692998,
      "loss": 2.6983,
      "step": 47
    },
    {
      "epoch": 0.040829346092503986,
      "grad_norm": 0.7766572833061218,
      "learning_rate": 0.00019947632670148517,
      "loss": 2.9677,
      "step": 48
    },
    {
      "epoch": 0.04167995746943115,
      "grad_norm": 0.7186003923416138,
      "learning_rate": 0.00019944842799247308,
      "loss": 3.0728,
      "step": 49
    },
    {
      "epoch": 0.04253056884635832,
      "grad_norm": 0.7572959065437317,
      "learning_rate": 0.00019941980734479214,
      "loss": 3.0345,
      "step": 50
    },
    {
      "epoch": 0.043381180223285486,
      "grad_norm": 0.48461732268333435,
      "learning_rate": 0.00019939046496621194,
      "loss": 2.6307,
      "step": 51
    },
    {
      "epoch": 0.04423179160021265,
      "grad_norm": 0.468675434589386,
      "learning_rate": 0.0001993604010697413,
      "loss": 2.4616,
      "step": 52
    },
    {
      "epoch": 0.04508240297713982,
      "grad_norm": 0.3815957009792328,
      "learning_rate": 0.0001993296158736269,
      "loss": 2.7479,
      "step": 53
    },
    {
      "epoch": 0.045933014354066985,
      "grad_norm": 0.3313361704349518,
      "learning_rate": 0.00019929810960135172,
      "loss": 2.4983,
      "step": 54
    },
    {
      "epoch": 0.04678362573099415,
      "grad_norm": 0.32521429657936096,
      "learning_rate": 0.00019926588248163316,
      "loss": 2.5446,
      "step": 55
    },
    {
      "epoch": 0.04763423710792132,
      "grad_norm": 0.2972453236579895,
      "learning_rate": 0.00019923293474842174,
      "loss": 2.5472,
      "step": 56
    },
    {
      "epoch": 0.048484848484848485,
      "grad_norm": 0.2972238063812256,
      "learning_rate": 0.00019919926664089909,
      "loss": 2.5389,
      "step": 57
    },
    {
      "epoch": 0.04933545986177565,
      "grad_norm": 0.27498453855514526,
      "learning_rate": 0.00019916487840347644,
      "loss": 2.571,
      "step": 58
    },
    {
      "epoch": 0.05018607123870282,
      "grad_norm": 0.2938655614852905,
      "learning_rate": 0.00019912977028579268,
      "loss": 2.7134,
      "step": 59
    },
    {
      "epoch": 0.051036682615629984,
      "grad_norm": 0.26742392778396606,
      "learning_rate": 0.0001990939425427127,
      "loss": 2.5632,
      "step": 60
    },
    {
      "epoch": 0.05188729399255715,
      "grad_norm": 0.28117692470550537,
      "learning_rate": 0.00019905739543432536,
      "loss": 2.5297,
      "step": 61
    },
    {
      "epoch": 0.05273790536948432,
      "grad_norm": 0.28916725516319275,
      "learning_rate": 0.00019902012922594177,
      "loss": 2.7096,
      "step": 62
    },
    {
      "epoch": 0.053588516746411484,
      "grad_norm": 0.32468459010124207,
      "learning_rate": 0.0001989821441880933,
      "loss": 2.6192,
      "step": 63
    },
    {
      "epoch": 0.05443912812333865,
      "grad_norm": 0.2806537449359894,
      "learning_rate": 0.0001989434405965295,
      "loss": 2.6747,
      "step": 64
    },
    {
      "epoch": 0.05528973950026582,
      "grad_norm": 0.2876998782157898,
      "learning_rate": 0.0001989040187322164,
      "loss": 2.7443,
      "step": 65
    },
    {
      "epoch": 0.056140350877192984,
      "grad_norm": 0.27619123458862305,
      "learning_rate": 0.00019886387888133413,
      "loss": 2.7379,
      "step": 66
    },
    {
      "epoch": 0.05699096225412015,
      "grad_norm": 0.31479549407958984,
      "learning_rate": 0.000198823021335275,
      "loss": 2.4039,
      "step": 67
    },
    {
      "epoch": 0.05784157363104731,
      "grad_norm": 0.300857812166214,
      "learning_rate": 0.00019878144639064144,
      "loss": 2.5705,
      "step": 68
    },
    {
      "epoch": 0.05869218500797448,
      "grad_norm": 0.3776433765888214,
      "learning_rate": 0.00019873915434924375,
      "loss": 2.863,
      "step": 69
    },
    {
      "epoch": 0.05954279638490165,
      "grad_norm": 0.30585938692092896,
      "learning_rate": 0.00019869614551809795,
      "loss": 2.5312,
      "step": 70
    },
    {
      "epoch": 0.06039340776182881,
      "grad_norm": 0.3163856267929077,
      "learning_rate": 0.00019865242020942353,
      "loss": 2.8491,
      "step": 71
    },
    {
      "epoch": 0.06124401913875598,
      "grad_norm": 0.30077147483825684,
      "learning_rate": 0.00019860797874064122,
      "loss": 2.7777,
      "step": 72
    },
    {
      "epoch": 0.06209463051568315,
      "grad_norm": 0.4153176248073578,
      "learning_rate": 0.0001985628214343706,
      "loss": 2.7499,
      "step": 73
    },
    {
      "epoch": 0.06294524189261032,
      "grad_norm": 0.35611122846603394,
      "learning_rate": 0.00019851694861842793,
      "loss": 2.7089,
      "step": 74
    },
    {
      "epoch": 0.06379585326953748,
      "grad_norm": 0.3143812417984009,
      "learning_rate": 0.00019847036062582357,
      "loss": 2.758,
      "step": 75
    },
    {
      "epoch": 0.06464646464646465,
      "grad_norm": 0.32024794816970825,
      "learning_rate": 0.00019842305779475968,
      "loss": 2.4616,
      "step": 76
    },
    {
      "epoch": 0.06549707602339182,
      "grad_norm": 0.3146126866340637,
      "learning_rate": 0.00019837504046862775,
      "loss": 2.6104,
      "step": 77
    },
    {
      "epoch": 0.06634768740031897,
      "grad_norm": 0.32578444480895996,
      "learning_rate": 0.00019832630899600608,
      "loss": 2.6297,
      "step": 78
    },
    {
      "epoch": 0.06719829877724615,
      "grad_norm": 0.36873045563697815,
      "learning_rate": 0.00019827686373065728,
      "loss": 2.6358,
      "step": 79
    },
    {
      "epoch": 0.06804891015417332,
      "grad_norm": 0.3558378517627716,
      "learning_rate": 0.00019822670503152567,
      "loss": 2.6308,
      "step": 80
    },
    {
      "epoch": 0.06889952153110047,
      "grad_norm": 0.37967684864997864,
      "learning_rate": 0.00019817583326273467,
      "loss": 2.7577,
      "step": 81
    },
    {
      "epoch": 0.06975013290802765,
      "grad_norm": 0.3737669885158539,
      "learning_rate": 0.00019812424879358425,
      "loss": 2.9207,
      "step": 82
    },
    {
      "epoch": 0.07060074428495482,
      "grad_norm": 0.39410829544067383,
      "learning_rate": 0.0001980719519985481,
      "loss": 2.9544,
      "step": 83
    },
    {
      "epoch": 0.07145135566188197,
      "grad_norm": 0.3863750696182251,
      "learning_rate": 0.00019801894325727104,
      "loss": 2.7794,
      "step": 84
    },
    {
      "epoch": 0.07230196703880915,
      "grad_norm": 0.4226458966732025,
      "learning_rate": 0.0001979652229545662,
      "loss": 2.7491,
      "step": 85
    },
    {
      "epoch": 0.07315257841573632,
      "grad_norm": 0.42758506536483765,
      "learning_rate": 0.0001979107914804122,
      "loss": 2.8524,
      "step": 86
    },
    {
      "epoch": 0.07400318979266347,
      "grad_norm": 0.4379200041294098,
      "learning_rate": 0.0001978556492299504,
      "loss": 2.6526,
      "step": 87
    },
    {
      "epoch": 0.07485380116959064,
      "grad_norm": 0.44331902265548706,
      "learning_rate": 0.000197799796603482,
      "loss": 2.8028,
      "step": 88
    },
    {
      "epoch": 0.07570441254651782,
      "grad_norm": 0.4358711540699005,
      "learning_rate": 0.0001977432340064651,
      "loss": 2.5426,
      "step": 89
    },
    {
      "epoch": 0.07655502392344497,
      "grad_norm": 0.45511335134506226,
      "learning_rate": 0.00019768596184951173,
      "loss": 2.7067,
      "step": 90
    },
    {
      "epoch": 0.07740563530037214,
      "grad_norm": 0.5394377112388611,
      "learning_rate": 0.00019762798054838502,
      "loss": 2.8189,
      "step": 91
    },
    {
      "epoch": 0.07825624667729932,
      "grad_norm": 0.5124706625938416,
      "learning_rate": 0.00019756929052399603,
      "loss": 2.7702,
      "step": 92
    },
    {
      "epoch": 0.07910685805422647,
      "grad_norm": 0.5025349855422974,
      "learning_rate": 0.00019750989220240073,
      "loss": 2.6872,
      "step": 93
    },
    {
      "epoch": 0.07995746943115364,
      "grad_norm": 0.5144663453102112,
      "learning_rate": 0.00019744978601479694,
      "loss": 2.6366,
      "step": 94
    },
    {
      "epoch": 0.08080808080808081,
      "grad_norm": 0.5908443927764893,
      "learning_rate": 0.00019738897239752118,
      "loss": 2.7918,
      "step": 95
    },
    {
      "epoch": 0.08165869218500797,
      "grad_norm": 0.6398508548736572,
      "learning_rate": 0.00019732745179204552,
      "loss": 2.9972,
      "step": 96
    },
    {
      "epoch": 0.08250930356193514,
      "grad_norm": 0.6032273173332214,
      "learning_rate": 0.00019726522464497435,
      "loss": 2.7638,
      "step": 97
    },
    {
      "epoch": 0.0833599149388623,
      "grad_norm": 0.6310097575187683,
      "learning_rate": 0.0001972022914080411,
      "loss": 2.9328,
      "step": 98
    },
    {
      "epoch": 0.08421052631578947,
      "grad_norm": 0.7050711512565613,
      "learning_rate": 0.00019713865253810506,
      "loss": 2.8143,
      "step": 99
    },
    {
      "epoch": 0.08506113769271664,
      "grad_norm": 0.755136251449585,
      "learning_rate": 0.00019707430849714807,
      "loss": 3.036,
      "step": 100
    },
    {
      "epoch": 0.0859117490696438,
      "grad_norm": 0.35153907537460327,
      "learning_rate": 0.00019700925975227096,
      "loss": 2.4444,
      "step": 101
    },
    {
      "epoch": 0.08676236044657097,
      "grad_norm": 0.40153488516807556,
      "learning_rate": 0.0001969435067756904,
      "loss": 2.6068,
      "step": 102
    },
    {
      "epoch": 0.08761297182349814,
      "grad_norm": 0.3474213480949402,
      "learning_rate": 0.00019687705004473545,
      "loss": 2.4261,
      "step": 103
    },
    {
      "epoch": 0.0884635832004253,
      "grad_norm": 0.3283519744873047,
      "learning_rate": 0.00019680989004184382,
      "loss": 2.6736,
      "step": 104
    },
    {
      "epoch": 0.08931419457735247,
      "grad_norm": 0.29034170508384705,
      "learning_rate": 0.00019674202725455877,
      "loss": 2.5551,
      "step": 105
    },
    {
      "epoch": 0.09016480595427964,
      "grad_norm": 0.2918970584869385,
      "learning_rate": 0.00019667346217552527,
      "loss": 2.6039,
      "step": 106
    },
    {
      "epoch": 0.0910154173312068,
      "grad_norm": 0.2852106988430023,
      "learning_rate": 0.00019660419530248655,
      "loss": 2.5432,
      "step": 107
    },
    {
      "epoch": 0.09186602870813397,
      "grad_norm": 0.30997323989868164,
      "learning_rate": 0.0001965342271382805,
      "loss": 2.7324,
      "step": 108
    },
    {
      "epoch": 0.09271664008506114,
      "grad_norm": 0.34156399965286255,
      "learning_rate": 0.00019646355819083589,
      "loss": 2.6548,
      "step": 109
    },
    {
      "epoch": 0.0935672514619883,
      "grad_norm": 0.2763843238353729,
      "learning_rate": 0.00019639218897316883,
      "loss": 2.5254,
      "step": 110
    },
    {
      "epoch": 0.09441786283891547,
      "grad_norm": 0.2835611402988434,
      "learning_rate": 0.00019632012000337908,
      "loss": 2.5677,
      "step": 111
    },
    {
      "epoch": 0.09526847421584264,
      "grad_norm": 0.2940271198749542,
      "learning_rate": 0.00019624735180464602,
      "loss": 2.5976,
      "step": 112
    },
    {
      "epoch": 0.0961190855927698,
      "grad_norm": 0.2714485824108124,
      "learning_rate": 0.00019617388490522517,
      "loss": 2.6087,
      "step": 113
    },
    {
      "epoch": 0.09696969696969697,
      "grad_norm": 0.30371204018592834,
      "learning_rate": 0.00019609971983844412,
      "loss": 2.6129,
      "step": 114
    },
    {
      "epoch": 0.09782030834662414,
      "grad_norm": 0.2762625813484192,
      "learning_rate": 0.0001960248571426989,
      "loss": 2.5759,
      "step": 115
    },
    {
      "epoch": 0.0986709197235513,
      "grad_norm": 0.2702981233596802,
      "learning_rate": 0.00019594929736144976,
      "loss": 2.5443,
      "step": 116
    },
    {
      "epoch": 0.09952153110047847,
      "grad_norm": 0.29210978746414185,
      "learning_rate": 0.00019587304104321746,
      "loss": 2.6425,
      "step": 117
    },
    {
      "epoch": 0.10037214247740564,
      "grad_norm": 0.31620749831199646,
      "learning_rate": 0.00019579608874157928,
      "loss": 2.703,
      "step": 118
    },
    {
      "epoch": 0.1012227538543328,
      "grad_norm": 0.2803102433681488,
      "learning_rate": 0.00019571844101516484,
      "loss": 2.6886,
      "step": 119
    },
    {
      "epoch": 0.10207336523125997,
      "grad_norm": 0.30169349908828735,
      "learning_rate": 0.00019564009842765225,
      "loss": 2.8221,
      "step": 120
    },
    {
      "epoch": 0.10292397660818714,
      "grad_norm": 0.297553151845932,
      "learning_rate": 0.00019556106154776379,
      "loss": 2.6897,
      "step": 121
    },
    {
      "epoch": 0.1037745879851143,
      "grad_norm": 0.30721086263656616,
      "learning_rate": 0.000195481330949262,
      "loss": 2.6551,
      "step": 122
    },
    {
      "epoch": 0.10462519936204147,
      "grad_norm": 0.29124605655670166,
      "learning_rate": 0.00019540090721094542,
      "loss": 2.6292,
      "step": 123
    },
    {
      "epoch": 0.10547581073896864,
      "grad_norm": 0.31037285923957825,
      "learning_rate": 0.0001953197909166443,
      "loss": 2.5459,
      "step": 124
    },
    {
      "epoch": 0.1063264221158958,
      "grad_norm": 0.3543750047683716,
      "learning_rate": 0.00019523798265521654,
      "loss": 2.5622,
      "step": 125
    },
    {
      "epoch": 0.10717703349282297,
      "grad_norm": 0.3356544077396393,
      "learning_rate": 0.00019515548302054335,
      "loss": 2.7272,
      "step": 126
    },
    {
      "epoch": 0.10802764486975014,
      "grad_norm": 0.34296396374702454,
      "learning_rate": 0.00019507229261152476,
      "loss": 2.6629,
      "step": 127
    },
    {
      "epoch": 0.1088782562466773,
      "grad_norm": 0.34629112482070923,
      "learning_rate": 0.0001949884120320756,
      "loss": 2.6371,
      "step": 128
    },
    {
      "epoch": 0.10972886762360447,
      "grad_norm": 0.34170377254486084,
      "learning_rate": 0.00019490384189112082,
      "loss": 2.7218,
      "step": 129
    },
    {
      "epoch": 0.11057947900053164,
      "grad_norm": 0.38438230752944946,
      "learning_rate": 0.0001948185828025913,
      "loss": 2.7096,
      "step": 130
    },
    {
      "epoch": 0.1114300903774588,
      "grad_norm": 0.40347060561180115,
      "learning_rate": 0.00019473263538541914,
      "loss": 2.8129,
      "step": 131
    },
    {
      "epoch": 0.11228070175438597,
      "grad_norm": 0.3742891848087311,
      "learning_rate": 0.00019464600026353348,
      "loss": 2.7916,
      "step": 132
    },
    {
      "epoch": 0.11313131313131314,
      "grad_norm": 0.4015231430530548,
      "learning_rate": 0.0001945586780658557,
      "loss": 2.6099,
      "step": 133
    },
    {
      "epoch": 0.1139819245082403,
      "grad_norm": 0.40618133544921875,
      "learning_rate": 0.00019447066942629491,
      "loss": 2.6669,
      "step": 134
    },
    {
      "epoch": 0.11483253588516747,
      "grad_norm": 0.4171842932701111,
      "learning_rate": 0.00019438197498374357,
      "loss": 2.6272,
      "step": 135
    },
    {
      "epoch": 0.11568314726209462,
      "grad_norm": 0.443013995885849,
      "learning_rate": 0.0001942925953820725,
      "loss": 2.5722,
      "step": 136
    },
    {
      "epoch": 0.1165337586390218,
      "grad_norm": 0.4636158347129822,
      "learning_rate": 0.00019420253127012645,
      "loss": 2.8075,
      "step": 137
    },
    {
      "epoch": 0.11738437001594897,
      "grad_norm": 0.4271916151046753,
      "learning_rate": 0.00019411178330171937,
      "loss": 2.6875,
      "step": 138
    },
    {
      "epoch": 0.11823498139287612,
      "grad_norm": 0.47826603055000305,
      "learning_rate": 0.00019402035213562954,
      "loss": 2.7042,
      "step": 139
    },
    {
      "epoch": 0.1190855927698033,
      "grad_norm": 0.46729791164398193,
      "learning_rate": 0.0001939282384355949,
      "loss": 2.6663,
      "step": 140
    },
    {
      "epoch": 0.11993620414673047,
      "grad_norm": 0.4689824879169464,
      "learning_rate": 0.0001938354428703082,
      "loss": 2.6138,
      "step": 141
    },
    {
      "epoch": 0.12078681552365762,
      "grad_norm": 0.526096522808075,
      "learning_rate": 0.0001937419661134121,
      "loss": 2.9258,
      "step": 142
    },
    {
      "epoch": 0.1216374269005848,
      "grad_norm": 0.5075511932373047,
      "learning_rate": 0.0001936478088434944,
      "loss": 2.8021,
      "step": 143
    },
    {
      "epoch": 0.12248803827751197,
      "grad_norm": 0.5048439502716064,
      "learning_rate": 0.00019355297174408298,
      "loss": 2.6274,
      "step": 144
    },
    {
      "epoch": 0.12333864965443912,
      "grad_norm": 0.5787357687950134,
      "learning_rate": 0.00019345745550364087,
      "loss": 2.851,
      "step": 145
    },
    {
      "epoch": 0.1241892610313663,
      "grad_norm": 0.5641311407089233,
      "learning_rate": 0.00019336126081556134,
      "loss": 2.7681,
      "step": 146
    },
    {
      "epoch": 0.12503987240829345,
      "grad_norm": 0.5504147410392761,
      "learning_rate": 0.00019326438837816276,
      "loss": 2.6905,
      "step": 147
    },
    {
      "epoch": 0.12589048378522064,
      "grad_norm": 0.6101283431053162,
      "learning_rate": 0.00019316683889468358,
      "loss": 2.589,
      "step": 148
    },
    {
      "epoch": 0.1267410951621478,
      "grad_norm": 0.7153661847114563,
      "learning_rate": 0.00019306861307327725,
      "loss": 2.9563,
      "step": 149
    },
    {
      "epoch": 0.12759170653907495,
      "grad_norm": 0.7049738168716431,
      "learning_rate": 0.00019296971162700694,
      "loss": 2.8023,
      "step": 150
    },
    {
      "epoch": 0.12844231791600214,
      "grad_norm": 0.3282754421234131,
      "learning_rate": 0.00019287013527384062,
      "loss": 2.4278,
      "step": 151
    },
    {
      "epoch": 0.1292929292929293,
      "grad_norm": 0.350577712059021,
      "learning_rate": 0.00019276988473664557,
      "loss": 2.5845,
      "step": 152
    },
    {
      "epoch": 0.13014354066985645,
      "grad_norm": 0.32433176040649414,
      "learning_rate": 0.00019266896074318334,
      "loss": 2.6126,
      "step": 153
    },
    {
      "epoch": 0.13099415204678364,
      "grad_norm": 0.31844663619995117,
      "learning_rate": 0.00019256736402610436,
      "loss": 2.527,
      "step": 154
    },
    {
      "epoch": 0.1318447634237108,
      "grad_norm": 0.2559802830219269,
      "learning_rate": 0.00019246509532294266,
      "loss": 2.2437,
      "step": 155
    },
    {
      "epoch": 0.13269537480063795,
      "grad_norm": 0.28512275218963623,
      "learning_rate": 0.00019236215537611046,
      "loss": 2.5739,
      "step": 156
    },
    {
      "epoch": 0.13354598617756513,
      "grad_norm": 0.26634740829467773,
      "learning_rate": 0.00019225854493289286,
      "loss": 2.4485,
      "step": 157
    },
    {
      "epoch": 0.1343965975544923,
      "grad_norm": 0.2785400450229645,
      "learning_rate": 0.0001921542647454424,
      "loss": 2.7944,
      "step": 158
    },
    {
      "epoch": 0.13524720893141945,
      "grad_norm": 0.27485981583595276,
      "learning_rate": 0.00019204931557077355,
      "loss": 2.6518,
      "step": 159
    },
    {
      "epoch": 0.13609782030834663,
      "grad_norm": 0.2687318027019501,
      "learning_rate": 0.00019194369817075724,
      "loss": 2.6595,
      "step": 160
    },
    {
      "epoch": 0.1369484316852738,
      "grad_norm": 0.26418977975845337,
      "learning_rate": 0.00019183741331211537,
      "loss": 2.7045,
      "step": 161
    },
    {
      "epoch": 0.13779904306220095,
      "grad_norm": 0.28258347511291504,
      "learning_rate": 0.00019173046176641513,
      "loss": 2.5896,
      "step": 162
    },
    {
      "epoch": 0.13864965443912813,
      "grad_norm": 0.27390146255493164,
      "learning_rate": 0.00019162284431006358,
      "loss": 2.5566,
      "step": 163
    },
    {
      "epoch": 0.1395002658160553,
      "grad_norm": 0.2916048765182495,
      "learning_rate": 0.00019151456172430183,
      "loss": 2.609,
      "step": 164
    },
    {
      "epoch": 0.14035087719298245,
      "grad_norm": 0.30684247612953186,
      "learning_rate": 0.00019140561479519955,
      "loss": 2.5222,
      "step": 165
    },
    {
      "epoch": 0.14120148856990963,
      "grad_norm": 0.26836761832237244,
      "learning_rate": 0.00019129600431364897,
      "loss": 2.5891,
      "step": 166
    },
    {
      "epoch": 0.1420520999468368,
      "grad_norm": 0.2658300995826721,
      "learning_rate": 0.00019118573107535953,
      "loss": 2.644,
      "step": 167
    },
    {
      "epoch": 0.14290271132376395,
      "grad_norm": 0.2789425551891327,
      "learning_rate": 0.00019107479588085182,
      "loss": 2.5641,
      "step": 168
    },
    {
      "epoch": 0.14375332270069113,
      "grad_norm": 0.2909972071647644,
      "learning_rate": 0.00019096319953545185,
      "loss": 2.5982,
      "step": 169
    },
    {
      "epoch": 0.1446039340776183,
      "grad_norm": 0.3741363286972046,
      "learning_rate": 0.0001908509428492852,
      "loss": 2.6293,
      "step": 170
    },
    {
      "epoch": 0.14545454545454545,
      "grad_norm": 0.2989426851272583,
      "learning_rate": 0.0001907380266372712,
      "loss": 2.7364,
      "step": 171
    },
    {
      "epoch": 0.14630515683147263,
      "grad_norm": 0.28862622380256653,
      "learning_rate": 0.00019062445171911686,
      "loss": 2.5656,
      "step": 172
    },
    {
      "epoch": 0.1471557682083998,
      "grad_norm": 0.3215920329093933,
      "learning_rate": 0.0001905102189193112,
      "loss": 2.8443,
      "step": 173
    },
    {
      "epoch": 0.14800637958532695,
      "grad_norm": 0.2994636595249176,
      "learning_rate": 0.00019039532906711882,
      "loss": 2.7014,
      "step": 174
    },
    {
      "epoch": 0.14885699096225413,
      "grad_norm": 0.32109183073043823,
      "learning_rate": 0.00019027978299657436,
      "loss": 2.8364,
      "step": 175
    },
    {
      "epoch": 0.1497076023391813,
      "grad_norm": 0.30813783407211304,
      "learning_rate": 0.00019016358154647618,
      "loss": 2.5102,
      "step": 176
    },
    {
      "epoch": 0.15055821371610845,
      "grad_norm": 0.32674533128738403,
      "learning_rate": 0.00019004672556038028,
      "loss": 2.757,
      "step": 177
    },
    {
      "epoch": 0.15140882509303563,
      "grad_norm": 0.34680357575416565,
      "learning_rate": 0.00018992921588659422,
      "loss": 2.5228,
      "step": 178
    },
    {
      "epoch": 0.1522594364699628,
      "grad_norm": 0.35170817375183105,
      "learning_rate": 0.00018981105337817104,
      "loss": 2.6148,
      "step": 179
    },
    {
      "epoch": 0.15311004784688995,
      "grad_norm": 0.3741483986377716,
      "learning_rate": 0.00018969223889290284,
      "loss": 2.8025,
      "step": 180
    },
    {
      "epoch": 0.15396065922381713,
      "grad_norm": 0.4156269431114197,
      "learning_rate": 0.00018957277329331485,
      "loss": 2.72,
      "step": 181
    },
    {
      "epoch": 0.1548112706007443,
      "grad_norm": 0.3726477324962616,
      "learning_rate": 0.00018945265744665886,
      "loss": 2.6197,
      "step": 182
    },
    {
      "epoch": 0.15566188197767145,
      "grad_norm": 0.4135706424713135,
      "learning_rate": 0.00018933189222490726,
      "loss": 2.7176,
      "step": 183
    },
    {
      "epoch": 0.15651249335459863,
      "grad_norm": 0.38799911737442017,
      "learning_rate": 0.00018921047850474642,
      "loss": 2.5641,
      "step": 184
    },
    {
      "epoch": 0.1573631047315258,
      "grad_norm": 0.4622843265533447,
      "learning_rate": 0.00018908841716757042,
      "loss": 2.7626,
      "step": 185
    },
    {
      "epoch": 0.15821371610845295,
      "grad_norm": 0.4251146912574768,
      "learning_rate": 0.00018896570909947475,
      "loss": 2.6842,
      "step": 186
    },
    {
      "epoch": 0.15906432748538013,
      "grad_norm": 0.4628697335720062,
      "learning_rate": 0.00018884235519124972,
      "loss": 2.9476,
      "step": 187
    },
    {
      "epoch": 0.1599149388623073,
      "grad_norm": 0.5052159428596497,
      "learning_rate": 0.0001887183563383741,
      "loss": 2.769,
      "step": 188
    },
    {
      "epoch": 0.16076555023923444,
      "grad_norm": 0.4817435145378113,
      "learning_rate": 0.00018859371344100864,
      "loss": 2.6266,
      "step": 189
    },
    {
      "epoch": 0.16161616161616163,
      "grad_norm": 0.4751468598842621,
      "learning_rate": 0.0001884684274039894,
      "loss": 2.877,
      "step": 190
    },
    {
      "epoch": 0.1624667729930888,
      "grad_norm": 0.5826165676116943,
      "learning_rate": 0.00018834249913682132,
      "loss": 2.7308,
      "step": 191
    },
    {
      "epoch": 0.16331738437001594,
      "grad_norm": 0.5441760420799255,
      "learning_rate": 0.00018821592955367154,
      "loss": 2.6764,
      "step": 192
    },
    {
      "epoch": 0.1641679957469431,
      "grad_norm": 0.5005947947502136,
      "learning_rate": 0.00018808871957336275,
      "loss": 2.664,
      "step": 193
    },
    {
      "epoch": 0.1650186071238703,
      "grad_norm": 0.5205551981925964,
      "learning_rate": 0.00018796087011936665,
      "loss": 2.6192,
      "step": 194
    },
    {
      "epoch": 0.16586921850079744,
      "grad_norm": 0.5489931106567383,
      "learning_rate": 0.0001878323821197971,
      "loss": 2.5061,
      "step": 195
    },
    {
      "epoch": 0.1667198298777246,
      "grad_norm": 0.5525840520858765,
      "learning_rate": 0.00018770325650740345,
      "loss": 2.7474,
      "step": 196
    },
    {
      "epoch": 0.1675704412546518,
      "grad_norm": 0.5978725552558899,
      "learning_rate": 0.0001875734942195637,
      "loss": 2.6055,
      "step": 197
    },
    {
      "epoch": 0.16842105263157894,
      "grad_norm": 0.6148700714111328,
      "learning_rate": 0.0001874430961982778,
      "loss": 2.8352,
      "step": 198
    },
    {
      "epoch": 0.1692716640085061,
      "grad_norm": 0.5956620573997498,
      "learning_rate": 0.0001873120633901608,
      "loss": 2.7367,
      "step": 199
    },
    {
      "epoch": 0.17012227538543329,
      "grad_norm": 0.7082740664482117,
      "learning_rate": 0.0001871803967464358,
      "loss": 2.9437,
      "step": 200
    },
    {
      "epoch": 0.17097288676236044,
      "grad_norm": 0.32244405150413513,
      "learning_rate": 0.00018704809722292737,
      "loss": 2.3835,
      "step": 201
    },
    {
      "epoch": 0.1718234981392876,
      "grad_norm": 0.3367772102355957,
      "learning_rate": 0.00018691516578005427,
      "loss": 2.601,
      "step": 202
    },
    {
      "epoch": 0.17267410951621479,
      "grad_norm": 0.31732872128486633,
      "learning_rate": 0.00018678160338282272,
      "loss": 2.5894,
      "step": 203
    },
    {
      "epoch": 0.17352472089314194,
      "grad_norm": 0.27467650175094604,
      "learning_rate": 0.0001866474110008193,
      "loss": 2.4369,
      "step": 204
    },
    {
      "epoch": 0.1743753322700691,
      "grad_norm": 0.29726937413215637,
      "learning_rate": 0.00018651258960820385,
      "loss": 2.6123,
      "step": 205
    },
    {
      "epoch": 0.17522594364699628,
      "grad_norm": 0.27499106526374817,
      "learning_rate": 0.00018637714018370253,
      "loss": 2.5141,
      "step": 206
    },
    {
      "epoch": 0.17607655502392344,
      "grad_norm": 0.27535390853881836,
      "learning_rate": 0.00018624106371060067,
      "loss": 2.5148,
      "step": 207
    },
    {
      "epoch": 0.1769271664008506,
      "grad_norm": 0.2687024176120758,
      "learning_rate": 0.00018610436117673555,
      "loss": 2.6057,
      "step": 208
    },
    {
      "epoch": 0.17777777777777778,
      "grad_norm": 0.31320950388908386,
      "learning_rate": 0.00018596703357448934,
      "loss": 2.6813,
      "step": 209
    },
    {
      "epoch": 0.17862838915470494,
      "grad_norm": 0.25832033157348633,
      "learning_rate": 0.00018582908190078185,
      "loss": 2.4898,
      "step": 210
    },
    {
      "epoch": 0.1794790005316321,
      "grad_norm": 0.2806166410446167,
      "learning_rate": 0.00018569050715706325,
      "loss": 2.5762,
      "step": 211
    },
    {
      "epoch": 0.18032961190855928,
      "grad_norm": 0.26099708676338196,
      "learning_rate": 0.00018555131034930685,
      "loss": 2.5386,
      "step": 212
    },
    {
      "epoch": 0.18118022328548644,
      "grad_norm": 0.26140880584716797,
      "learning_rate": 0.00018541149248800184,
      "loss": 2.7159,
      "step": 213
    },
    {
      "epoch": 0.1820308346624136,
      "grad_norm": 0.2698177695274353,
      "learning_rate": 0.0001852710545881459,
      "loss": 2.5942,
      "step": 214
    },
    {
      "epoch": 0.18288144603934078,
      "grad_norm": 0.27240726351737976,
      "learning_rate": 0.00018512999766923772,
      "loss": 2.5377,
      "step": 215
    },
    {
      "epoch": 0.18373205741626794,
      "grad_norm": 0.2780822813510895,
      "learning_rate": 0.00018498832275526988,
      "loss": 2.6185,
      "step": 216
    },
    {
      "epoch": 0.1845826687931951,
      "grad_norm": 0.2713901400566101,
      "learning_rate": 0.00018484603087472109,
      "loss": 2.5802,
      "step": 217
    },
    {
      "epoch": 0.18543328017012228,
      "grad_norm": 0.2843954265117645,
      "learning_rate": 0.000184703123060549,
      "loss": 2.6404,
      "step": 218
    },
    {
      "epoch": 0.18628389154704944,
      "grad_norm": 0.2679051160812378,
      "learning_rate": 0.0001845596003501826,
      "loss": 2.6688,
      "step": 219
    },
    {
      "epoch": 0.1871345029239766,
      "grad_norm": 0.292568176984787,
      "learning_rate": 0.00018441546378551458,
      "loss": 2.6505,
      "step": 220
    },
    {
      "epoch": 0.18798511430090378,
      "grad_norm": 0.282326877117157,
      "learning_rate": 0.00018427071441289388,
      "loss": 2.6299,
      "step": 221
    },
    {
      "epoch": 0.18883572567783094,
      "grad_norm": 0.2853985130786896,
      "learning_rate": 0.00018412535328311814,
      "loss": 2.8143,
      "step": 222
    },
    {
      "epoch": 0.1896863370547581,
      "grad_norm": 0.2786814868450165,
      "learning_rate": 0.00018397938145142591,
      "loss": 2.6007,
      "step": 223
    },
    {
      "epoch": 0.19053694843168528,
      "grad_norm": 0.42460358142852783,
      "learning_rate": 0.0001838327999774892,
      "loss": 2.7891,
      "step": 224
    },
    {
      "epoch": 0.19138755980861244,
      "grad_norm": 0.30478838086128235,
      "learning_rate": 0.00018368560992540562,
      "loss": 2.4551,
      "step": 225
    },
    {
      "epoch": 0.1922381711855396,
      "grad_norm": 0.3402044177055359,
      "learning_rate": 0.00018353781236369064,
      "loss": 2.9191,
      "step": 226
    },
    {
      "epoch": 0.19308878256246678,
      "grad_norm": 0.33662521839141846,
      "learning_rate": 0.00018338940836527004,
      "loss": 2.5606,
      "step": 227
    },
    {
      "epoch": 0.19393939393939394,
      "grad_norm": 0.34461426734924316,
      "learning_rate": 0.0001832403990074719,
      "loss": 2.714,
      "step": 228
    },
    {
      "epoch": 0.1947900053163211,
      "grad_norm": 0.342184454202652,
      "learning_rate": 0.0001830907853720188,
      "loss": 2.6936,
      "step": 229
    },
    {
      "epoch": 0.19564061669324828,
      "grad_norm": 0.3557281494140625,
      "learning_rate": 0.0001829405685450202,
      "loss": 2.6663,
      "step": 230
    },
    {
      "epoch": 0.19649122807017544,
      "grad_norm": 0.38674700260162354,
      "learning_rate": 0.0001827897496169642,
      "loss": 2.7257,
      "step": 231
    },
    {
      "epoch": 0.1973418394471026,
      "grad_norm": 0.3849089741706848,
      "learning_rate": 0.00018263832968271,
      "loss": 2.7178,
      "step": 232
    },
    {
      "epoch": 0.19819245082402978,
      "grad_norm": 0.4508901834487915,
      "learning_rate": 0.00018248630984147955,
      "loss": 2.7947,
      "step": 233
    },
    {
      "epoch": 0.19904306220095694,
      "grad_norm": 0.39502936601638794,
      "learning_rate": 0.00018233369119684996,
      "loss": 2.5885,
      "step": 234
    },
    {
      "epoch": 0.1998936735778841,
      "grad_norm": 0.4287837743759155,
      "learning_rate": 0.00018218047485674523,
      "loss": 2.6911,
      "step": 235
    },
    {
      "epoch": 0.20074428495481128,
      "grad_norm": 0.4257849454879761,
      "learning_rate": 0.00018202666193342833,
      "loss": 2.8803,
      "step": 236
    },
    {
      "epoch": 0.20159489633173844,
      "grad_norm": 0.4459477961063385,
      "learning_rate": 0.00018187225354349295,
      "loss": 2.8352,
      "step": 237
    },
    {
      "epoch": 0.2024455077086656,
      "grad_norm": 0.4430312514305115,
      "learning_rate": 0.0001817172508078557,
      "loss": 2.7517,
      "step": 238
    },
    {
      "epoch": 0.20329611908559278,
      "grad_norm": 0.4465429484844208,
      "learning_rate": 0.00018156165485174773,
      "loss": 2.7119,
      "step": 239
    },
    {
      "epoch": 0.20414673046251994,
      "grad_norm": 0.4532601833343506,
      "learning_rate": 0.00018140546680470659,
      "loss": 2.7346,
      "step": 240
    },
    {
      "epoch": 0.2049973418394471,
      "grad_norm": 0.4750036299228668,
      "learning_rate": 0.00018124868780056814,
      "loss": 2.6113,
      "step": 241
    },
    {
      "epoch": 0.20584795321637428,
      "grad_norm": 0.5072234272956848,
      "learning_rate": 0.00018109131897745822,
      "loss": 2.844,
      "step": 242
    },
    {
      "epoch": 0.20669856459330144,
      "grad_norm": 0.5094662308692932,
      "learning_rate": 0.00018093336147778438,
      "loss": 2.7737,
      "step": 243
    },
    {
      "epoch": 0.2075491759702286,
      "grad_norm": 0.606842577457428,
      "learning_rate": 0.00018077481644822768,
      "loss": 2.6153,
      "step": 244
    },
    {
      "epoch": 0.20839978734715578,
      "grad_norm": 0.5311163067817688,
      "learning_rate": 0.00018061568503973435,
      "loss": 2.6038,
      "step": 245
    },
    {
      "epoch": 0.20925039872408294,
      "grad_norm": 0.5758761167526245,
      "learning_rate": 0.00018045596840750723,
      "loss": 2.6446,
      "step": 246
    },
    {
      "epoch": 0.2101010101010101,
      "grad_norm": 0.598297119140625,
      "learning_rate": 0.00018029566771099776,
      "loss": 2.7002,
      "step": 247
    },
    {
      "epoch": 0.21095162147793728,
      "grad_norm": 0.6635774970054626,
      "learning_rate": 0.00018013478411389716,
      "loss": 2.8011,
      "step": 248
    },
    {
      "epoch": 0.21180223285486444,
      "grad_norm": 0.6850919723510742,
      "learning_rate": 0.00017997331878412835,
      "loss": 2.8903,
      "step": 249
    },
    {
      "epoch": 0.2126528442317916,
      "grad_norm": 0.7298348546028137,
      "learning_rate": 0.00017981127289383716,
      "loss": 2.9483,
      "step": 250
    },
    {
      "epoch": 0.21350345560871878,
      "grad_norm": 0.33354559540748596,
      "learning_rate": 0.00017964864761938404,
      "loss": 2.4727,
      "step": 251
    },
    {
      "epoch": 0.21435406698564594,
      "grad_norm": 0.3557465374469757,
      "learning_rate": 0.00017948544414133534,
      "loss": 2.5058,
      "step": 252
    },
    {
      "epoch": 0.2152046783625731,
      "grad_norm": 0.3230442702770233,
      "learning_rate": 0.00017932166364445498,
      "loss": 2.5422,
      "step": 253
    },
    {
      "epoch": 0.21605528973950028,
      "grad_norm": 0.28668278455734253,
      "learning_rate": 0.0001791573073176956,
      "loss": 2.3173,
      "step": 254
    },
    {
      "epoch": 0.21690590111642744,
      "grad_norm": 0.30019721388816833,
      "learning_rate": 0.00017899237635419002,
      "loss": 2.6444,
      "step": 255
    },
    {
      "epoch": 0.2177565124933546,
      "grad_norm": 0.285314679145813,
      "learning_rate": 0.0001788268719512427,
      "loss": 2.5319,
      "step": 256
    },
    {
      "epoch": 0.21860712387028178,
      "grad_norm": 0.27584996819496155,
      "learning_rate": 0.00017866079531032088,
      "loss": 2.6496,
      "step": 257
    },
    {
      "epoch": 0.21945773524720893,
      "grad_norm": 0.2874069809913635,
      "learning_rate": 0.0001784941476370459,
      "loss": 2.5156,
      "step": 258
    },
    {
      "epoch": 0.2203083466241361,
      "grad_norm": 0.26786255836486816,
      "learning_rate": 0.00017832693014118448,
      "loss": 2.6211,
      "step": 259
    },
    {
      "epoch": 0.22115895800106328,
      "grad_norm": 0.2633914351463318,
      "learning_rate": 0.0001781591440366399,
      "loss": 2.5811,
      "step": 260
    },
    {
      "epoch": 0.22200956937799043,
      "grad_norm": 0.2724866569042206,
      "learning_rate": 0.00017799079054144334,
      "loss": 2.5904,
      "step": 261
    },
    {
      "epoch": 0.2228601807549176,
      "grad_norm": 0.29333001375198364,
      "learning_rate": 0.00017782187087774477,
      "loss": 2.7581,
      "step": 262
    },
    {
      "epoch": 0.22371079213184478,
      "grad_norm": 0.2735550105571747,
      "learning_rate": 0.00017765238627180424,
      "loss": 2.7114,
      "step": 263
    },
    {
      "epoch": 0.22456140350877193,
      "grad_norm": 0.2721397280693054,
      "learning_rate": 0.00017748233795398307,
      "loss": 2.5991,
      "step": 264
    },
    {
      "epoch": 0.2254120148856991,
      "grad_norm": 0.25755858421325684,
      "learning_rate": 0.0001773117271587346,
      "loss": 2.5786,
      "step": 265
    },
    {
      "epoch": 0.22626262626262628,
      "grad_norm": 0.25772804021835327,
      "learning_rate": 0.00017714055512459565,
      "loss": 2.488,
      "step": 266
    },
    {
      "epoch": 0.22711323763955343,
      "grad_norm": 0.2766227424144745,
      "learning_rate": 0.0001769688230941772,
      "loss": 2.8924,
      "step": 267
    },
    {
      "epoch": 0.2279638490164806,
      "grad_norm": 0.26846593618392944,
      "learning_rate": 0.00017679653231415552,
      "loss": 2.5783,
      "step": 268
    },
    {
      "epoch": 0.22881446039340775,
      "grad_norm": 0.26374372839927673,
      "learning_rate": 0.00017662368403526302,
      "loss": 2.4675,
      "step": 269
    },
    {
      "epoch": 0.22966507177033493,
      "grad_norm": 0.28237268328666687,
      "learning_rate": 0.0001764502795122793,
      "loss": 2.5994,
      "step": 270
    },
    {
      "epoch": 0.2305156831472621,
      "grad_norm": 0.2786102890968323,
      "learning_rate": 0.00017627632000402193,
      "loss": 2.514,
      "step": 271
    },
    {
      "epoch": 0.23136629452418925,
      "grad_norm": 0.27646180987358093,
      "learning_rate": 0.00017610180677333739,
      "loss": 2.5673,
      "step": 272
    },
    {
      "epoch": 0.23221690590111643,
      "grad_norm": 0.3052549660205841,
      "learning_rate": 0.00017592674108709186,
      "loss": 2.5345,
      "step": 273
    },
    {
      "epoch": 0.2330675172780436,
      "grad_norm": 0.30554690957069397,
      "learning_rate": 0.00017575112421616202,
      "loss": 2.709,
      "step": 274
    },
    {
      "epoch": 0.23391812865497075,
      "grad_norm": 0.3219161331653595,
      "learning_rate": 0.00017557495743542585,
      "loss": 2.6825,
      "step": 275
    },
    {
      "epoch": 0.23476874003189793,
      "grad_norm": 0.31834957003593445,
      "learning_rate": 0.0001753982420237533,
      "loss": 2.7017,
      "step": 276
    },
    {
      "epoch": 0.2356193514088251,
      "grad_norm": 0.30264872312545776,
      "learning_rate": 0.00017522097926399722,
      "loss": 2.3725,
      "step": 277
    },
    {
      "epoch": 0.23646996278575225,
      "grad_norm": 0.3283548951148987,
      "learning_rate": 0.00017504317044298367,
      "loss": 2.6217,
      "step": 278
    },
    {
      "epoch": 0.23732057416267943,
      "grad_norm": 0.33564746379852295,
      "learning_rate": 0.00017486481685150302,
      "loss": 2.5738,
      "step": 279
    },
    {
      "epoch": 0.2381711855396066,
      "grad_norm": 0.37258434295654297,
      "learning_rate": 0.0001746859197843002,
      "loss": 2.783,
      "step": 280
    },
    {
      "epoch": 0.23902179691653375,
      "grad_norm": 0.3897363245487213,
      "learning_rate": 0.0001745064805400656,
      "loss": 2.7908,
      "step": 281
    },
    {
      "epoch": 0.23987240829346093,
      "grad_norm": 0.3756699562072754,
      "learning_rate": 0.00017432650042142536,
      "loss": 2.5944,
      "step": 282
    },
    {
      "epoch": 0.2407230196703881,
      "grad_norm": 0.3787755072116852,
      "learning_rate": 0.00017414598073493216,
      "loss": 2.7574,
      "step": 283
    },
    {
      "epoch": 0.24157363104731525,
      "grad_norm": 0.38891106843948364,
      "learning_rate": 0.0001739649227910556,
      "loss": 2.8635,
      "step": 284
    },
    {
      "epoch": 0.24242424242424243,
      "grad_norm": 0.40293633937835693,
      "learning_rate": 0.00017378332790417273,
      "loss": 2.729,
      "step": 285
    },
    {
      "epoch": 0.2432748538011696,
      "grad_norm": 0.414109468460083,
      "learning_rate": 0.00017360119739255852,
      "loss": 2.6077,
      "step": 286
    },
    {
      "epoch": 0.24412546517809675,
      "grad_norm": 0.42549028992652893,
      "learning_rate": 0.0001734185325783762,
      "loss": 2.7812,
      "step": 287
    },
    {
      "epoch": 0.24497607655502393,
      "grad_norm": 0.42882055044174194,
      "learning_rate": 0.00017323533478766777,
      "loss": 2.7653,
      "step": 288
    },
    {
      "epoch": 0.2458266879319511,
      "grad_norm": 0.42119139432907104,
      "learning_rate": 0.00017305160535034436,
      "loss": 2.5355,
      "step": 289
    },
    {
      "epoch": 0.24667729930887825,
      "grad_norm": 0.4749990999698639,
      "learning_rate": 0.0001728673456001766,
      "loss": 2.7885,
      "step": 290
    },
    {
      "epoch": 0.24752791068580543,
      "grad_norm": 0.4682268500328064,
      "learning_rate": 0.00017268255687478469,
      "loss": 2.6402,
      "step": 291
    },
    {
      "epoch": 0.2483785220627326,
      "grad_norm": 0.4854019284248352,
      "learning_rate": 0.00017249724051562906,
      "loss": 2.7255,
      "step": 292
    },
    {
      "epoch": 0.24922913343965974,
      "grad_norm": 0.5112527012825012,
      "learning_rate": 0.00017231139786800042,
      "loss": 2.8374,
      "step": 293
    },
    {
      "epoch": 0.2500797448165869,
      "grad_norm": 0.5242344737052917,
      "learning_rate": 0.0001721250302810101,
      "loss": 2.9178,
      "step": 294
    },
    {
      "epoch": 0.2500797448165869,
      "eval_loss": 2.688343048095703,
      "eval_runtime": 80.6326,
      "eval_samples_per_second": 12.278,
      "eval_steps_per_second": 6.139,
      "step": 294
    },
    {
      "epoch": 0.2509303561935141,
      "grad_norm": 0.6918848156929016,
      "learning_rate": 0.00017193813910758,
      "loss": 2.6556,
      "step": 295
    },
    {
      "epoch": 0.25178096757044127,
      "grad_norm": 0.5682982802391052,
      "learning_rate": 0.00017175072570443312,
      "loss": 2.6581,
      "step": 296
    },
    {
      "epoch": 0.25263157894736843,
      "grad_norm": 0.6087559461593628,
      "learning_rate": 0.00017156279143208352,
      "loss": 2.5665,
      "step": 297
    },
    {
      "epoch": 0.2534821903242956,
      "grad_norm": 0.6545628309249878,
      "learning_rate": 0.00017137433765482642,
      "loss": 2.8215,
      "step": 298
    },
    {
      "epoch": 0.25433280170122274,
      "grad_norm": 0.6754540801048279,
      "learning_rate": 0.00017118536574072842,
      "loss": 2.7991,
      "step": 299
    },
    {
      "epoch": 0.2551834130781499,
      "grad_norm": 0.6926846504211426,
      "learning_rate": 0.0001709958770616174,
      "loss": 2.7371,
      "step": 300
    },
    {
      "epoch": 0.2560340244550771,
      "grad_norm": 0.32997071743011475,
      "learning_rate": 0.00017080587299307283,
      "loss": 2.7739,
      "step": 301
    },
    {
      "epoch": 0.25688463583200427,
      "grad_norm": 0.3260290324687958,
      "learning_rate": 0.0001706153549144154,
      "loss": 2.5971,
      "step": 302
    },
    {
      "epoch": 0.25773524720893143,
      "grad_norm": 0.3378421366214752,
      "learning_rate": 0.00017042432420869732,
      "loss": 2.7588,
      "step": 303
    },
    {
      "epoch": 0.2585858585858586,
      "grad_norm": 0.27844691276550293,
      "learning_rate": 0.0001702327822626922,
      "loss": 2.6493,
      "step": 304
    },
    {
      "epoch": 0.25943646996278574,
      "grad_norm": 0.29100462794303894,
      "learning_rate": 0.00017004073046688497,
      "loss": 2.6397,
      "step": 305
    },
    {
      "epoch": 0.2602870813397129,
      "grad_norm": 0.2764577865600586,
      "learning_rate": 0.00016984817021546177,
      "loss": 2.4199,
      "step": 306
    },
    {
      "epoch": 0.26113769271664006,
      "grad_norm": 0.26870644092559814,
      "learning_rate": 0.00016965510290629972,
      "loss": 2.2552,
      "step": 307
    },
    {
      "epoch": 0.26198830409356727,
      "grad_norm": 0.2770349979400635,
      "learning_rate": 0.00016946152994095704,
      "loss": 2.6118,
      "step": 308
    },
    {
      "epoch": 0.2628389154704944,
      "grad_norm": 0.27041003108024597,
      "learning_rate": 0.00016926745272466268,
      "loss": 2.5329,
      "step": 309
    },
    {
      "epoch": 0.2636895268474216,
      "grad_norm": 0.25608015060424805,
      "learning_rate": 0.00016907287266630614,
      "loss": 2.3411,
      "step": 310
    },
    {
      "epoch": 0.26454013822434874,
      "grad_norm": 0.2750420570373535,
      "learning_rate": 0.00016887779117842725,
      "loss": 2.6393,
      "step": 311
    },
    {
      "epoch": 0.2653907496012759,
      "grad_norm": 0.26276537775993347,
      "learning_rate": 0.00016868220967720604,
      "loss": 2.3616,
      "step": 312
    },
    {
      "epoch": 0.26624136097820306,
      "grad_norm": 0.2735307514667511,
      "learning_rate": 0.00016848612958245216,
      "loss": 2.5156,
      "step": 313
    },
    {
      "epoch": 0.26709197235513027,
      "grad_norm": 0.32951095700263977,
      "learning_rate": 0.00016828955231759497,
      "loss": 2.5329,
      "step": 314
    },
    {
      "epoch": 0.2679425837320574,
      "grad_norm": 0.2762184143066406,
      "learning_rate": 0.00016809247930967282,
      "loss": 2.6873,
      "step": 315
    },
    {
      "epoch": 0.2687931951089846,
      "grad_norm": 0.2802570164203644,
      "learning_rate": 0.000167894911989323,
      "loss": 2.5532,
      "step": 316
    },
    {
      "epoch": 0.26964380648591174,
      "grad_norm": 0.26968276500701904,
      "learning_rate": 0.0001676968517907712,
      "loss": 2.602,
      "step": 317
    },
    {
      "epoch": 0.2704944178628389,
      "grad_norm": 0.27560874819755554,
      "learning_rate": 0.00016749830015182107,
      "loss": 2.5003,
      "step": 318
    },
    {
      "epoch": 0.27134502923976606,
      "grad_norm": 0.288411021232605,
      "learning_rate": 0.00016729925851384386,
      "loss": 2.6859,
      "step": 319
    },
    {
      "epoch": 0.27219564061669327,
      "grad_norm": 0.2999224364757538,
      "learning_rate": 0.00016709972832176797,
      "loss": 2.8356,
      "step": 320
    },
    {
      "epoch": 0.2730462519936204,
      "grad_norm": 0.2956329882144928,
      "learning_rate": 0.0001668997110240684,
      "loss": 2.6157,
      "step": 321
    },
    {
      "epoch": 0.2738968633705476,
      "grad_norm": 0.30924192070961,
      "learning_rate": 0.00016669920807275623,
      "loss": 2.8421,
      "step": 322
    },
    {
      "epoch": 0.27474747474747474,
      "grad_norm": 0.31185418367385864,
      "learning_rate": 0.00016649822092336812,
      "loss": 2.758,
      "step": 323
    },
    {
      "epoch": 0.2755980861244019,
      "grad_norm": 0.3129124045372009,
      "learning_rate": 0.0001662967510349558,
      "loss": 2.6861,
      "step": 324
    },
    {
      "epoch": 0.27644869750132905,
      "grad_norm": 0.3300238251686096,
      "learning_rate": 0.00016609479987007527,
      "loss": 2.8284,
      "step": 325
    },
    {
      "epoch": 0.27729930887825627,
      "grad_norm": 0.3459511399269104,
      "learning_rate": 0.00016589236889477646,
      "loss": 2.6454,
      "step": 326
    },
    {
      "epoch": 0.2781499202551834,
      "grad_norm": 0.3566714823246002,
      "learning_rate": 0.00016568945957859236,
      "loss": 2.399,
      "step": 327
    },
    {
      "epoch": 0.2790005316321106,
      "grad_norm": 0.3650771379470825,
      "learning_rate": 0.00016548607339452853,
      "loss": 2.7529,
      "step": 328
    },
    {
      "epoch": 0.27985114300903774,
      "grad_norm": 0.35940608382225037,
      "learning_rate": 0.00016528221181905217,
      "loss": 2.5894,
      "step": 329
    },
    {
      "epoch": 0.2807017543859649,
      "grad_norm": 0.4018422067165375,
      "learning_rate": 0.0001650778763320817,
      "loss": 2.7001,
      "step": 330
    },
    {
      "epoch": 0.28155236576289205,
      "grad_norm": 0.3867095708847046,
      "learning_rate": 0.00016487306841697578,
      "loss": 2.9919,
      "step": 331
    },
    {
      "epoch": 0.28240297713981927,
      "grad_norm": 0.5714160799980164,
      "learning_rate": 0.0001646677895605227,
      "loss": 2.9366,
      "step": 332
    },
    {
      "epoch": 0.2832535885167464,
      "grad_norm": 0.3697023391723633,
      "learning_rate": 0.00016446204125292942,
      "loss": 2.653,
      "step": 333
    },
    {
      "epoch": 0.2841041998936736,
      "grad_norm": 0.4222877323627472,
      "learning_rate": 0.00016425582498781087,
      "loss": 2.8557,
      "step": 334
    },
    {
      "epoch": 0.28495481127060074,
      "grad_norm": 0.40899163484573364,
      "learning_rate": 0.0001640491422621792,
      "loss": 2.7875,
      "step": 335
    },
    {
      "epoch": 0.2858054226475279,
      "grad_norm": 0.44694027304649353,
      "learning_rate": 0.00016384199457643262,
      "loss": 2.8616,
      "step": 336
    },
    {
      "epoch": 0.28665603402445505,
      "grad_norm": 0.45921215415000916,
      "learning_rate": 0.00016363438343434483,
      "loss": 2.5821,
      "step": 337
    },
    {
      "epoch": 0.28750664540138227,
      "grad_norm": 0.4156991243362427,
      "learning_rate": 0.00016342631034305384,
      "loss": 2.7228,
      "step": 338
    },
    {
      "epoch": 0.2883572567783094,
      "grad_norm": 0.43912479281425476,
      "learning_rate": 0.00016321777681305125,
      "loss": 2.7119,
      "step": 339
    },
    {
      "epoch": 0.2892078681552366,
      "grad_norm": 0.4638505280017853,
      "learning_rate": 0.00016300878435817113,
      "loss": 2.6832,
      "step": 340
    },
    {
      "epoch": 0.29005847953216374,
      "grad_norm": 0.4912424087524414,
      "learning_rate": 0.00016279933449557906,
      "loss": 2.6583,
      "step": 341
    },
    {
      "epoch": 0.2909090909090909,
      "grad_norm": 0.5067815780639648,
      "learning_rate": 0.00016258942874576118,
      "loss": 2.7723,
      "step": 342
    },
    {
      "epoch": 0.29175970228601805,
      "grad_norm": 0.49635544419288635,
      "learning_rate": 0.0001623790686325131,
      "loss": 2.7351,
      "step": 343
    },
    {
      "epoch": 0.29261031366294527,
      "grad_norm": 0.49465370178222656,
      "learning_rate": 0.00016216825568292885,
      "loss": 2.7251,
      "step": 344
    },
    {
      "epoch": 0.2934609250398724,
      "grad_norm": 0.558665931224823,
      "learning_rate": 0.00016195699142738975,
      "loss": 2.5475,
      "step": 345
    },
    {
      "epoch": 0.2943115364167996,
      "grad_norm": 0.6477727890014648,
      "learning_rate": 0.00016174527739955342,
      "loss": 2.7069,
      "step": 346
    },
    {
      "epoch": 0.29516214779372674,
      "grad_norm": 0.6656542420387268,
      "learning_rate": 0.00016153311513634257,
      "loss": 3.0185,
      "step": 347
    },
    {
      "epoch": 0.2960127591706539,
      "grad_norm": 0.7209298610687256,
      "learning_rate": 0.0001613205061779337,
      "loss": 2.8225,
      "step": 348
    },
    {
      "epoch": 0.29686337054758105,
      "grad_norm": 0.6285322308540344,
      "learning_rate": 0.0001611074520677462,
      "loss": 2.9088,
      "step": 349
    },
    {
      "epoch": 0.29771398192450826,
      "grad_norm": 0.7612189054489136,
      "learning_rate": 0.00016089395435243105,
      "loss": 2.9744,
      "step": 350
    },
    {
      "epoch": 0.2985645933014354,
      "grad_norm": 0.3265992999076843,
      "learning_rate": 0.00016068001458185936,
      "loss": 2.3931,
      "step": 351
    },
    {
      "epoch": 0.2994152046783626,
      "grad_norm": 0.3179806172847748,
      "learning_rate": 0.00016046563430911146,
      "loss": 2.6212,
      "step": 352
    },
    {
      "epoch": 0.30026581605528974,
      "grad_norm": 0.31296634674072266,
      "learning_rate": 0.00016025081509046544,
      "loss": 2.5008,
      "step": 353
    },
    {
      "epoch": 0.3011164274322169,
      "grad_norm": 0.283408522605896,
      "learning_rate": 0.00016003555848538586,
      "loss": 2.3946,
      "step": 354
    },
    {
      "epoch": 0.30196703880914405,
      "grad_norm": 0.2742927074432373,
      "learning_rate": 0.00015981986605651248,
      "loss": 2.5154,
      "step": 355
    },
    {
      "epoch": 0.30281765018607126,
      "grad_norm": 0.29107552766799927,
      "learning_rate": 0.00015960373936964892,
      "loss": 2.505,
      "step": 356
    },
    {
      "epoch": 0.3036682615629984,
      "grad_norm": 0.3244473338127136,
      "learning_rate": 0.0001593871799937512,
      "loss": 2.5244,
      "step": 357
    },
    {
      "epoch": 0.3045188729399256,
      "grad_norm": 0.27494120597839355,
      "learning_rate": 0.0001591701895009164,
      "loss": 2.4161,
      "step": 358
    },
    {
      "epoch": 0.30536948431685274,
      "grad_norm": 0.28784480690956116,
      "learning_rate": 0.00015895276946637136,
      "loss": 2.6434,
      "step": 359
    },
    {
      "epoch": 0.3062200956937799,
      "grad_norm": 0.28096505999565125,
      "learning_rate": 0.00015873492146846108,
      "loss": 2.6784,
      "step": 360
    },
    {
      "epoch": 0.30707070707070705,
      "grad_norm": 0.2602795362472534,
      "learning_rate": 0.00015851664708863735,
      "loss": 2.2712,
      "step": 361
    },
    {
      "epoch": 0.30792131844763426,
      "grad_norm": 0.28796225786209106,
      "learning_rate": 0.0001582979479114472,
      "loss": 2.8673,
      "step": 362
    },
    {
      "epoch": 0.3087719298245614,
      "grad_norm": 0.273456871509552,
      "learning_rate": 0.00015807882552452154,
      "loss": 2.6931,
      "step": 363
    },
    {
      "epoch": 0.3096225412014886,
      "grad_norm": 0.29190319776535034,
      "learning_rate": 0.00015785928151856347,
      "loss": 2.5932,
      "step": 364
    },
    {
      "epoch": 0.31047315257841573,
      "grad_norm": 0.290829598903656,
      "learning_rate": 0.0001576393174873368,
      "loss": 2.6889,
      "step": 365
    },
    {
      "epoch": 0.3113237639553429,
      "grad_norm": 0.27631402015686035,
      "learning_rate": 0.0001574189350276545,
      "loss": 2.6508,
      "step": 366
|
}, |
|
{ |
|
"epoch": 0.31217437533227005, |
|
"grad_norm": 0.30464449524879456, |
|
"learning_rate": 0.00015719813573936712, |
|
"loss": 2.4918, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.31302498670919726, |
|
"grad_norm": 0.2622525095939636, |
|
"learning_rate": 0.00015697692122535107, |
|
"loss": 2.4657, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.3138755980861244, |
|
"grad_norm": 0.2790607810020447, |
|
"learning_rate": 0.0001567552930914972, |
|
"loss": 2.78, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.3147262094630516, |
|
"grad_norm": 0.27601122856140137, |
|
"learning_rate": 0.00015653325294669884, |
|
"loss": 2.6908, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.31557682083997873, |
|
"grad_norm": 0.3054703176021576, |
|
"learning_rate": 0.0001563108024028404, |
|
"loss": 2.6602, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.3164274322169059, |
|
"grad_norm": 0.2979956567287445, |
|
"learning_rate": 0.00015608794307478546, |
|
"loss": 2.8034, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.31727804359383305, |
|
"grad_norm": 0.3022500276565552, |
|
"learning_rate": 0.00015586467658036524, |
|
"loss": 2.6491, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.31812865497076026, |
|
"grad_norm": 0.3278852701187134, |
|
"learning_rate": 0.0001556410045403667, |
|
"loss": 2.5928, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.3189792663476874, |
|
"grad_norm": 0.3023948073387146, |
|
"learning_rate": 0.0001554169285785208, |
|
"loss": 2.6165, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.3198298777246146, |
|
"grad_norm": 0.33025047183036804, |
|
"learning_rate": 0.00015519245032149083, |
|
"loss": 2.7669, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.32068048910154173, |
|
"grad_norm": 0.3081379532814026, |
|
"learning_rate": 0.0001549675713988604, |
|
"loss": 2.5991, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.3215311004784689, |
|
"grad_norm": 0.35036811232566833, |
|
"learning_rate": 0.0001547422934431218, |
|
"loss": 2.6891, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.32238171185539605, |
|
"grad_norm": 0.399915486574173, |
|
"learning_rate": 0.00015451661808966405, |
|
"loss": 2.8271, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.32323232323232326, |
|
"grad_norm": 0.3355950713157654, |
|
"learning_rate": 0.00015429054697676107, |
|
"loss": 2.3574, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.3240829346092504, |
|
"grad_norm": 0.35686787962913513, |
|
"learning_rate": 0.00015406408174555976, |
|
"loss": 2.6926, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.3249335459861776, |
|
"grad_norm": 0.3730961084365845, |
|
"learning_rate": 0.00015383722404006806, |
|
"loss": 2.7418, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.32578415736310473, |
|
"grad_norm": 0.36210712790489197, |
|
"learning_rate": 0.00015360997550714305, |
|
"loss": 2.7188, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.3266347687400319, |
|
"grad_norm": 0.4164154827594757, |
|
"learning_rate": 0.0001533823377964791, |
|
"loss": 2.8182, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.32748538011695905, |
|
"grad_norm": 0.3890111446380615, |
|
"learning_rate": 0.0001531543125605956, |
|
"loss": 2.6873, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.3283359914938862, |
|
"grad_norm": 0.3929746747016907, |
|
"learning_rate": 0.0001529259014548253, |
|
"loss": 2.6169, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.3291866028708134, |
|
"grad_norm": 0.4354581832885742, |
|
"learning_rate": 0.0001526971061373021, |
|
"loss": 2.681, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.3300372142477406, |
|
"grad_norm": 0.4014440178871155, |
|
"learning_rate": 0.00015246792826894906, |
|
"loss": 2.6601, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.33088782562466773, |
|
"grad_norm": 0.41521206498146057, |
|
"learning_rate": 0.00015223836951346634, |
|
"loss": 2.7763, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.3317384370015949, |
|
"grad_norm": 0.43811067938804626, |
|
"learning_rate": 0.00015200843153731906, |
|
"loss": 2.7373, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.33258904837852205, |
|
"grad_norm": 0.4687197208404541, |
|
"learning_rate": 0.0001517781160097254, |
|
"loss": 2.6432, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.3334396597554492, |
|
"grad_norm": 0.4998420178890228, |
|
"learning_rate": 0.00015154742460264425, |
|
"loss": 2.6434, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.3342902711323764, |
|
"grad_norm": 0.4983424246311188, |
|
"learning_rate": 0.0001513163589907632, |
|
"loss": 2.7619, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.3351408825093036, |
|
"grad_norm": 0.5066902041435242, |
|
"learning_rate": 0.00015108492085148632, |
|
"loss": 2.6515, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.33599149388623073, |
|
"grad_norm": 0.5106616020202637, |
|
"learning_rate": 0.00015085311186492206, |
|
"loss": 2.6288, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.3368421052631579, |
|
"grad_norm": 0.6036600470542908, |
|
"learning_rate": 0.00015062093371387097, |
|
"loss": 2.6391, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.33769271664008504, |
|
"grad_norm": 0.6260906457901001, |
|
"learning_rate": 0.00015038838808381354, |
|
"loss": 2.8623, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.3385433280170122, |
|
"grad_norm": 0.6070663332939148, |
|
"learning_rate": 0.00015015547666289797, |
|
"loss": 2.615, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.3393939393939394, |
|
"grad_norm": 0.66013503074646, |
|
"learning_rate": 0.00014992220114192785, |
|
"loss": 2.6865, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.34024455077086657, |
|
"grad_norm": 0.7371327877044678, |
|
"learning_rate": 0.00014968856321434998, |
|
"loss": 2.6959, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.34109516214779373, |
|
"grad_norm": 0.3081108629703522, |
|
"learning_rate": 0.00014945456457624197, |
|
"loss": 2.321, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.3419457735247209, |
|
"grad_norm": 0.37835538387298584, |
|
"learning_rate": 0.0001492202069263, |
|
"loss": 2.5144, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.34279638490164804, |
|
"grad_norm": 0.3184291422367096, |
|
"learning_rate": 0.00014898549196582645, |
|
"loss": 2.5783, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.3436469962785752, |
|
"grad_norm": 0.3378361165523529, |
|
"learning_rate": 0.00014875042139871766, |
|
"loss": 2.667, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.3444976076555024, |
|
"grad_norm": 0.285756379365921, |
|
"learning_rate": 0.00014851499693145135, |
|
"loss": 2.5316, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.34534821903242957, |
|
"grad_norm": 0.28837427496910095, |
|
"learning_rate": 0.00014827922027307451, |
|
"loss": 2.5012, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.34619883040935673, |
|
"grad_norm": 0.30004727840423584, |
|
"learning_rate": 0.0001480430931351906, |
|
"loss": 2.5237, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.3470494417862839, |
|
"grad_norm": 0.28527382016181946, |
|
"learning_rate": 0.00014780661723194757, |
|
"loss": 2.5067, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.34790005316321104, |
|
"grad_norm": 0.26983842253685, |
|
"learning_rate": 0.00014756979428002514, |
|
"loss": 2.5577, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.3487506645401382, |
|
"grad_norm": 0.2686617076396942, |
|
"learning_rate": 0.00014733262599862234, |
|
"loss": 2.5267, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.3496012759170654, |
|
"grad_norm": 0.26147857308387756, |
|
"learning_rate": 0.00014709511410944523, |
|
"loss": 2.4459, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.35045188729399257, |
|
"grad_norm": 0.29142996668815613, |
|
"learning_rate": 0.00014685726033669412, |
|
"loss": 2.7246, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.3513024986709197, |
|
"grad_norm": 0.29214030504226685, |
|
"learning_rate": 0.00014661906640705129, |
|
"loss": 2.6422, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.3521531100478469, |
|
"grad_norm": 0.2727803885936737, |
|
"learning_rate": 0.00014638053404966836, |
|
"loss": 2.6416, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.35300372142477404, |
|
"grad_norm": 0.2637098431587219, |
|
"learning_rate": 0.0001461416649961537, |
|
"loss": 2.5555, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.3538543328017012, |
|
"grad_norm": 0.2736997902393341, |
|
"learning_rate": 0.00014590246098055996, |
|
"loss": 2.7423, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.3547049441786284, |
|
"grad_norm": 0.2691054940223694, |
|
"learning_rate": 0.0001456629237393713, |
|
"loss": 2.7818, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.35555555555555557, |
|
"grad_norm": 0.28304579854011536, |
|
"learning_rate": 0.0001454230550114911, |
|
"loss": 2.554, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.3564061669324827, |
|
"grad_norm": 0.2698013484477997, |
|
"learning_rate": 0.00014518285653822898, |
|
"loss": 2.5155, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.3572567783094099, |
|
"grad_norm": 0.25874024629592896, |
|
"learning_rate": 0.00014494233006328837, |
|
"loss": 2.4841, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.35810738968633704, |
|
"grad_norm": 0.2918216586112976, |
|
"learning_rate": 0.00014470147733275387, |
|
"loss": 2.7489, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.3589580010632642, |
|
"grad_norm": 0.28201210498809814, |
|
"learning_rate": 0.0001444603000950784, |
|
"loss": 2.5709, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.3598086124401914, |
|
"grad_norm": 0.3077748119831085, |
|
"learning_rate": 0.0001442188001010707, |
|
"loss": 2.6089, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.36065922381711857, |
|
"grad_norm": 0.2970117926597595, |
|
"learning_rate": 0.00014397697910388248, |
|
"loss": 2.6171, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.3615098351940457, |
|
"grad_norm": 0.302947074174881, |
|
"learning_rate": 0.00014373483885899582, |
|
"loss": 2.4033, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.3623604465709729, |
|
"grad_norm": 0.31938815116882324, |
|
"learning_rate": 0.00014349238112421024, |
|
"loss": 2.7042, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.36321105794790004, |
|
"grad_norm": 0.3481922447681427, |
|
"learning_rate": 0.00014324960765963018, |
|
"loss": 2.6479, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.3640616693248272, |
|
"grad_norm": 0.3142179846763611, |
|
"learning_rate": 0.00014300652022765207, |
|
"loss": 2.5285, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.3649122807017544, |
|
"grad_norm": 0.3226439952850342, |
|
"learning_rate": 0.00014276312059295147, |
|
"loss": 2.5389, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.36576289207868157, |
|
"grad_norm": 0.401301771402359, |
|
"learning_rate": 0.00014251941052247045, |
|
"loss": 2.8749, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.3666135034556087, |
|
"grad_norm": 0.3587849736213684, |
|
"learning_rate": 0.00014227539178540463, |
|
"loss": 2.7549, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.3674641148325359, |
|
"grad_norm": 0.38098907470703125, |
|
"learning_rate": 0.00014203106615319038, |
|
"loss": 2.6159, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.36831472620946304, |
|
"grad_norm": 0.402971088886261, |
|
"learning_rate": 0.00014178643539949196, |
|
"loss": 2.6626, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.3691653375863902, |
|
"grad_norm": 0.3960564136505127, |
|
"learning_rate": 0.00014154150130018866, |
|
"loss": 2.8099, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.3700159489633174, |
|
"grad_norm": 0.41555696725845337, |
|
"learning_rate": 0.00014129626563336178, |
|
"loss": 2.7282, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.37086656034024457, |
|
"grad_norm": 0.3991285264492035, |
|
"learning_rate": 0.000141050730179282, |
|
"loss": 2.5909, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.3717171717171717, |
|
"grad_norm": 0.442220002412796, |
|
"learning_rate": 0.00014080489672039606, |
|
"loss": 2.8671, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.3725677830940989, |
|
"grad_norm": 0.4263823628425598, |
|
"learning_rate": 0.0001405587670413143, |
|
"loss": 2.5901, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.37341839447102604, |
|
"grad_norm": 0.45683711767196655, |
|
"learning_rate": 0.00014031234292879725, |
|
"loss": 2.7801, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.3742690058479532, |
|
"grad_norm": 0.4672732353210449, |
|
"learning_rate": 0.00014006562617174294, |
|
"loss": 2.696, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.3751196172248804, |
|
"grad_norm": 0.45676231384277344, |
|
"learning_rate": 0.0001398186185611738, |
|
"loss": 2.6197, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.37597022860180757, |
|
"grad_norm": 0.4717809855937958, |
|
"learning_rate": 0.00013957132189022374, |
|
"loss": 2.7676, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.3768208399787347, |
|
"grad_norm": 0.4931057393550873, |
|
"learning_rate": 0.00013932373795412503, |
|
"loss": 2.7968, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.3776714513556619, |
|
"grad_norm": 0.5112429857254028, |
|
"learning_rate": 0.0001390758685501954, |
|
"loss": 2.5947, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.37852206273258904, |
|
"grad_norm": 0.5512686371803284, |
|
"learning_rate": 0.00013882771547782475, |
|
"loss": 2.6041, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.3793726741095162, |
|
"grad_norm": 0.5506839752197266, |
|
"learning_rate": 0.0001385792805384625, |
|
"loss": 2.7389, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.3802232854864434, |
|
"grad_norm": 0.5831094980239868, |
|
"learning_rate": 0.00013833056553560398, |
|
"loss": 2.8867, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.38107389686337056, |
|
"grad_norm": 0.6127706170082092, |
|
"learning_rate": 0.00013808157227477788, |
|
"loss": 2.689, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.3819245082402977, |
|
"grad_norm": 0.6911614537239075, |
|
"learning_rate": 0.00013783230256353266, |
|
"loss": 2.9826, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.3827751196172249, |
|
"grad_norm": 0.7095205187797546, |
|
"learning_rate": 0.00013758275821142382, |
|
"loss": 2.7725, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.38362573099415204, |
|
"grad_norm": 0.31148967146873474, |
|
"learning_rate": 0.00013733294103000055, |
|
"loss": 2.5203, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.3844763423710792, |
|
"grad_norm": 0.3002786338329315, |
|
"learning_rate": 0.00013708285283279252, |
|
"loss": 2.5911, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.3853269537480064, |
|
"grad_norm": 0.29820379614830017, |
|
"learning_rate": 0.00013683249543529696, |
|
"loss": 2.3441, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.38617756512493356, |
|
"grad_norm": 0.29162371158599854, |
|
"learning_rate": 0.00013658187065496532, |
|
"loss": 2.4791, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.3870281765018607, |
|
"grad_norm": 0.2905353307723999, |
|
"learning_rate": 0.00013633098031119002, |
|
"loss": 2.4208, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.3878787878787879, |
|
"grad_norm": 0.28394779562950134, |
|
"learning_rate": 0.00013607982622529133, |
|
"loss": 2.6468, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.38872939925571504, |
|
"grad_norm": 0.2683579921722412, |
|
"learning_rate": 0.00013582841022050424, |
|
"loss": 2.5199, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.3895800106326422, |
|
"grad_norm": 0.2719745934009552, |
|
"learning_rate": 0.00013557673412196503, |
|
"loss": 2.4813, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.39043062200956935, |
|
"grad_norm": 0.26781052350997925, |
|
"learning_rate": 0.00013532479975669808, |
|
"loss": 2.5296, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.39128123338649656, |
|
"grad_norm": 0.26786506175994873, |
|
"learning_rate": 0.00013507260895360274, |
|
"loss": 2.5254, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.3921318447634237, |
|
"grad_norm": 0.27148234844207764, |
|
"learning_rate": 0.0001348201635434399, |
|
"loss": 2.6415, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.3929824561403509, |
|
"grad_norm": 0.29129862785339355, |
|
"learning_rate": 0.00013456746535881871, |
|
"loss": 2.6098, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.39383306751727803, |
|
"grad_norm": 0.2636375427246094, |
|
"learning_rate": 0.00013431451623418343, |
|
"loss": 2.4875, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.3946836788942052, |
|
"grad_norm": 0.2701190114021301, |
|
"learning_rate": 0.00013406131800579985, |
|
"loss": 2.7151, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.39553429027113235, |
|
"grad_norm": 0.2739466428756714, |
|
"learning_rate": 0.00013380787251174225, |
|
"loss": 2.6465, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.39638490164805956, |
|
"grad_norm": 0.2672434151172638, |
|
"learning_rate": 0.00013355418159187985, |
|
"loss": 2.6823, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.3972355130249867, |
|
"grad_norm": 0.26863133907318115, |
|
"learning_rate": 0.00013330024708786353, |
|
"loss": 2.5357, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.3980861244019139, |
|
"grad_norm": 0.2704436779022217, |
|
"learning_rate": 0.00013304607084311244, |
|
"loss": 2.6887, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.39893673577884103, |
|
"grad_norm": 0.2772809863090515, |
|
"learning_rate": 0.00013279165470280065, |
|
"loss": 2.6992, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.3997873471557682, |
|
"grad_norm": 0.3024834394454956, |
|
"learning_rate": 0.0001325370005138437, |
|
"loss": 2.753, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.40063795853269535, |
|
"grad_norm": 0.2810865342617035, |
|
"learning_rate": 0.00013228211012488532, |
|
"loss": 2.7921, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.40148856990962256, |
|
"grad_norm": 0.28023043274879456, |
|
"learning_rate": 0.00013202698538628376, |
|
"loss": 2.2935, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.4023391812865497, |
|
"grad_norm": 0.30379030108451843, |
|
"learning_rate": 0.0001317716281500987, |
|
"loss": 2.5575, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.4031897926634769, |
|
"grad_norm": 0.29648759961128235, |
|
"learning_rate": 0.00013151604027007745, |
|
"loss": 2.4384, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.40404040404040403, |
|
"grad_norm": 0.32276931405067444, |
|
"learning_rate": 0.00013126022360164172, |
|
"loss": 2.7529, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.4048910154173312, |
|
"grad_norm": 0.3146274983882904, |
|
"learning_rate": 0.00013100418000187419, |
|
"loss": 2.473, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.40574162679425835, |
|
"grad_norm": 0.33162757754325867, |
|
"learning_rate": 0.00013074791132950485, |
|
"loss": 2.5832, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.40659223817118556, |
|
"grad_norm": 0.33521875739097595, |
|
"learning_rate": 0.00013049141944489748, |
|
"loss": 2.4807, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.4074428495481127, |
|
"grad_norm": 0.3724415898323059, |
|
"learning_rate": 0.00013023470621003643, |
|
"loss": 2.8245, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.4082934609250399, |
|
"grad_norm": 0.3634830117225647, |
|
"learning_rate": 0.00012997777348851288, |
|
"loss": 2.5921, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.40914407230196703, |
|
"grad_norm": 0.38656124472618103, |
|
"learning_rate": 0.0001297206231455113, |
|
"loss": 2.619, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.4099946836788942, |
|
"grad_norm": 0.3939076066017151, |
|
"learning_rate": 0.00012946325704779602, |
|
"loss": 2.7466, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.41084529505582135, |
|
"grad_norm": 0.37405261397361755, |
|
"learning_rate": 0.00012920567706369758, |
|
"loss": 2.764, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.41169590643274856, |
|
"grad_norm": 0.3894766569137573, |
|
"learning_rate": 0.0001289478850630993, |
|
"loss": 2.7373, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.4125465178096757, |
|
"grad_norm": 0.44012099504470825, |
|
"learning_rate": 0.00012868988291742347, |
|
"loss": 2.6475, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.4133971291866029, |
|
"grad_norm": 0.41175583004951477, |
|
"learning_rate": 0.0001284316724996181, |
|
"loss": 2.7573, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.41424774056353003, |
|
"grad_norm": 0.4406805634498596, |
|
"learning_rate": 0.00012817325568414297, |
|
"loss": 2.5429, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.4150983519404572, |
|
"grad_norm": 0.4783489406108856, |
|
"learning_rate": 0.0001279146343469563, |
|
"loss": 2.8275, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.41594896331738435, |
|
"grad_norm": 0.44115763902664185, |
|
"learning_rate": 0.00012765581036550095, |
|
"loss": 2.6858, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.41679957469431156, |
|
"grad_norm": 0.5237467885017395, |
|
"learning_rate": 0.0001273967856186909, |
|
"loss": 2.8638, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.4176501860712387, |
|
"grad_norm": 0.5071147680282593, |
|
"learning_rate": 0.00012713756198689757, |
|
"loss": 2.8603, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.4185007974481659, |
|
"grad_norm": 0.5125464797019958, |
|
"learning_rate": 0.00012687814135193612, |
|
"loss": 3.0048, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.41935140882509303, |
|
"grad_norm": 0.5373572707176208, |
|
"learning_rate": 0.0001266185255970519, |
|
"loss": 2.5727, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.4202020202020202, |
|
"grad_norm": 0.4913314878940582, |
|
"learning_rate": 0.00012635871660690676, |
|
"loss": 2.6501, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.42105263157894735, |
|
"grad_norm": 0.5953575968742371, |
|
"learning_rate": 0.00012609871626756522, |
|
"loss": 2.8674, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.42190324295587456, |
|
"grad_norm": 0.5852685570716858, |
|
"learning_rate": 0.00012583852646648095, |
|
"loss": 2.6855, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.4227538543328017, |
|
"grad_norm": 0.6449065804481506, |
|
"learning_rate": 0.00012557814909248296, |
|
"loss": 2.7688, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.4236044657097289, |
|
"grad_norm": 0.6615833640098572, |
|
"learning_rate": 0.000125317586035762, |
|
"loss": 2.7446, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.42445507708665603, |
|
"grad_norm": 0.6561222076416016, |
|
"learning_rate": 0.0001250568391878567, |
|
"loss": 2.801, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.4253056884635832, |
|
"grad_norm": 0.8370924592018127, |
|
"learning_rate": 0.00012479591044163997, |
|
"loss": 2.9991, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.42615629984051034, |
|
"grad_norm": 0.3273813724517822, |
|
"learning_rate": 0.0001245348016913051, |
|
"loss": 2.5914, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.42700691121743756, |
|
"grad_norm": 0.3103164732456207, |
|
"learning_rate": 0.00012427351483235223, |
|
"loss": 2.6331, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.4278575225943647, |
|
"grad_norm": 0.3061564862728119, |
|
"learning_rate": 0.00012401205176157447, |
|
"loss": 2.5927, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.42870813397129187, |
|
"grad_norm": 0.2943616509437561, |
|
"learning_rate": 0.00012375041437704393, |
|
"loss": 2.4734, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.42955874534821903, |
|
"grad_norm": 0.29883497953414917, |
|
"learning_rate": 0.00012348860457809838, |
|
"loss": 2.4734, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.4304093567251462, |
|
"grad_norm": 0.295578271150589, |
|
"learning_rate": 0.00012322662426532708, |
|
"loss": 2.3561, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.43125996810207334, |
|
"grad_norm": 0.3073442578315735, |
|
"learning_rate": 0.00012296447534055716, |
|
"loss": 2.5489, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.43211057947900056, |
|
"grad_norm": 0.3207886517047882, |
|
"learning_rate": 0.00012270215970683977, |
|
"loss": 2.6984, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.4329611908559277, |
|
"grad_norm": 0.26681843400001526, |
|
"learning_rate": 0.00012243967926843627, |
|
"loss": 2.4313, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.43381180223285487, |
|
"grad_norm": 0.29009976983070374, |
|
"learning_rate": 0.00012217703593080445, |
|
"loss": 2.6128, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.43466241360978203, |
|
"grad_norm": 0.27712225914001465, |
|
"learning_rate": 0.00012191423160058462, |
|
"loss": 2.4976, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.4355130249867092, |
|
"grad_norm": 0.30184635519981384, |
|
"learning_rate": 0.00012165126818558572, |
|
"loss": 2.6912, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.43636363636363634, |
|
"grad_norm": 0.28122004866600037, |
|
"learning_rate": 0.00012138814759477176, |
|
"loss": 2.6935, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.43721424774056356, |
|
"grad_norm": 0.2680952250957489, |
|
"learning_rate": 0.00012112487173824753, |
|
"loss": 2.5607, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.4380648591174907, |
|
"grad_norm": 0.26283374428749084, |
|
"learning_rate": 0.00012086144252724513, |
|
"loss": 2.5001, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.43891547049441787, |
|
"grad_norm": 0.26960310339927673, |
|
"learning_rate": 0.00012059786187410984, |
|
"loss": 2.609, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.439766081871345, |
|
"grad_norm": 0.2755014896392822, |
|
"learning_rate": 0.00012033413169228635, |
|
"loss": 2.6356, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.4406166932482722, |
|
"grad_norm": 0.2706344723701477, |
|
"learning_rate": 0.00012007025389630484, |
|
"loss": 2.6909, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.44146730462519934, |
|
"grad_norm": 0.2639751434326172, |
|
"learning_rate": 0.00011980623040176704, |
|
"loss": 2.542, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.44231791600212655, |
|
"grad_norm": 0.2747778594493866, |
|
"learning_rate": 0.00011954206312533245, |
|
"loss": 2.4773, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.4431685273790537, |
|
"grad_norm": 0.2990424931049347, |
|
"learning_rate": 0.0001192777539847043, |
|
"loss": 2.6881, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.44401913875598087, |
|
"grad_norm": 0.27671095728874207, |
|
"learning_rate": 0.00011901330489861564, |
|
"loss": 2.5495, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.444869750132908, |
|
"grad_norm": 0.3051941394805908, |
|
"learning_rate": 0.00011874871778681555, |
|
"loss": 2.7591, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.4457203615098352, |
|
"grad_norm": 0.2990604043006897, |
|
"learning_rate": 0.00011848399457005495, |
|
"loss": 2.5765, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.44657097288676234, |
|
"grad_norm": 0.3488616943359375, |
|
"learning_rate": 0.00011821913717007298, |
|
"loss": 2.634, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.44742158426368955, |
|
"grad_norm": 0.3207804262638092, |
|
"learning_rate": 0.00011795414750958265, |
|
"loss": 2.5632, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.4482721956406167, |
|
"grad_norm": 0.3244103491306305, |
|
"learning_rate": 0.0001176890275122573, |
|
"loss": 2.5784, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.44912280701754387, |
|
"grad_norm": 0.33527323603630066, |
|
"learning_rate": 0.00011742377910271639, |
|
"loss": 2.7016, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.449973418394471, |
|
"grad_norm": 0.3452233672142029, |
|
"learning_rate": 0.00011715840420651152, |
|
"loss": 2.5495, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.4508240297713982, |
|
"grad_norm": 0.36391714215278625, |
|
"learning_rate": 0.00011689290475011259, |
|
"loss": 2.662, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.45167464114832534, |
|
"grad_norm": 0.41914063692092896, |
|
"learning_rate": 0.00011662728266089364, |
|
"loss": 2.8876, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.45252525252525255, |
|
"grad_norm": 0.4124680459499359, |
|
"learning_rate": 0.00011636153986711906, |
|
"loss": 2.7271, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.4533758639021797, |
|
"grad_norm": 0.3957962393760681, |
|
"learning_rate": 0.00011609567829792944, |
|
"loss": 2.5991, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.45422647527910687, |
|
"grad_norm": 0.401443213224411, |
|
"learning_rate": 0.00011582969988332757, |
|
"loss": 2.7777, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.455077086656034, |
|
"grad_norm": 0.4349088966846466, |
|
"learning_rate": 0.00011556360655416457, |
|
"loss": 2.524, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.4559276980329612, |
|
"grad_norm": 0.44450250267982483, |
|
"learning_rate": 0.00011529740024212565, |
|
"loss": 2.8296, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.45677830940988834, |
|
"grad_norm": 0.44510000944137573, |
|
"learning_rate": 0.00011503108287971626, |
|
"loss": 2.8425, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.4576289207868155, |
|
"grad_norm": 0.4766497313976288, |
|
"learning_rate": 0.00011476465640024814, |
|
"loss": 2.7471, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.4584795321637427, |
|
"grad_norm": 0.458278089761734, |
|
"learning_rate": 0.00011449812273782492, |
|
"loss": 2.7274, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.45933014354066987, |
|
"grad_norm": 0.46523377299308777, |
|
"learning_rate": 0.00011423148382732853, |
|
"loss": 2.4967, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.460180754917597, |
|
"grad_norm": 0.4741576611995697, |
|
"learning_rate": 0.00011396474160440478, |
|
"loss": 2.6591, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 0.4610313662945242, |
|
"grad_norm": 0.5007473230361938, |
|
"learning_rate": 0.00011369789800544959, |
|
"loss": 2.8139, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.46188197767145134, |
|
"grad_norm": 0.45440518856048584, |
|
"learning_rate": 0.00011343095496759476, |
|
"loss": 2.6233, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.4627325890483785, |
|
"grad_norm": 0.5588196516036987, |
|
"learning_rate": 0.00011316391442869394, |
|
"loss": 2.6369, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.4635832004253057, |
|
"grad_norm": 0.5833231806755066, |
|
"learning_rate": 0.00011289677832730862, |
|
"loss": 2.8079, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.46443381180223287, |
|
"grad_norm": 0.6029292345046997, |
|
"learning_rate": 0.00011262954860269399, |
|
"loss": 2.8007, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.46528442317916, |
|
"grad_norm": 0.5825842022895813, |
|
"learning_rate": 0.00011236222719478491, |
|
"loss": 2.7767, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 0.4661350345560872, |
|
"grad_norm": 0.6685728430747986, |
|
"learning_rate": 0.00011209481604418181, |
|
"loss": 2.6621, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.46698564593301434, |
|
"grad_norm": 0.6266542077064514, |
|
"learning_rate": 0.00011182731709213659, |
|
"loss": 2.5234, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 0.4678362573099415, |
|
"grad_norm": 0.7756956815719604, |
|
"learning_rate": 0.00011155973228053853, |
|
"loss": 2.9975, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.4686868686868687, |
|
"grad_norm": 0.27434810996055603, |
|
"learning_rate": 0.00011129206355190025, |
|
"loss": 2.3867, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 0.46953748006379586, |
|
"grad_norm": 0.28944891691207886, |
|
"learning_rate": 0.00011102431284934345, |
|
"loss": 2.6283, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.470388091440723, |
|
"grad_norm": 0.2992161810398102, |
|
"learning_rate": 0.00011075648211658505, |
|
"loss": 2.5818, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 0.4712387028176502, |
|
"grad_norm": 0.2830989956855774, |
|
"learning_rate": 0.00011048857329792284, |
|
"loss": 2.5738, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.47208931419457734, |
|
"grad_norm": 0.2766299545764923, |
|
"learning_rate": 0.00011022058833822158, |
|
"loss": 2.5138, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.4729399255715045, |
|
"grad_norm": 0.27325817942619324, |
|
"learning_rate": 0.0001099525291828986, |
|
"loss": 2.5166, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.4737905369484317, |
|
"grad_norm": 0.27071428298950195, |
|
"learning_rate": 0.00010968439777790999, |
|
"loss": 2.4935, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 0.47464114832535886, |
|
"grad_norm": 0.28781965374946594, |
|
"learning_rate": 0.00010941619606973632, |
|
"loss": 2.5851, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.475491759702286, |
|
"grad_norm": 0.2613832354545593, |
|
"learning_rate": 0.00010914792600536843, |
|
"loss": 2.5289, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 0.4763423710792132, |
|
"grad_norm": 0.2806464433670044, |
|
"learning_rate": 0.00010887958953229349, |
|
"loss": 2.659, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.47719298245614034, |
|
"grad_norm": 0.2767505347728729, |
|
"learning_rate": 0.00010861118859848067, |
|
"loss": 2.6562, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 0.4780435938330675, |
|
"grad_norm": 0.26740705966949463, |
|
"learning_rate": 0.0001083427251523672, |
|
"loss": 2.5689, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 0.4788942052099947, |
|
"grad_norm": 0.2635597884654999, |
|
"learning_rate": 0.000108074201142844, |
|
"loss": 2.3973, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 0.47974481658692186, |
|
"grad_norm": 0.2559509575366974, |
|
"learning_rate": 0.00010780561851924167, |
|
"loss": 2.4662, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.480595427963849, |
|
"grad_norm": 0.2819572687149048, |
|
"learning_rate": 0.0001075369792313164, |
|
"loss": 2.7678, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.4814460393407762, |
|
"grad_norm": 0.2618950605392456, |
|
"learning_rate": 0.00010726828522923562, |
|
"loss": 2.6463, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 0.48229665071770333, |
|
"grad_norm": 0.2766074240207672, |
|
"learning_rate": 0.000106999538463564, |
|
"loss": 2.7133, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 0.4831472620946305, |
|
"grad_norm": 0.280367910861969, |
|
"learning_rate": 0.00010673074088524926, |
|
"loss": 2.61, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.4839978734715577, |
|
"grad_norm": 0.3028632402420044, |
|
"learning_rate": 0.00010646189444560799, |
|
"loss": 2.5465, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 0.48484848484848486, |
|
"grad_norm": 0.2950790822505951, |
|
"learning_rate": 0.00010619300109631145, |
|
"loss": 2.6517, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.485699096225412, |
|
"grad_norm": 0.2803073227405548, |
|
"learning_rate": 0.00010592406278937144, |
|
"loss": 2.6062, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 0.4865497076023392, |
|
"grad_norm": 0.29916471242904663, |
|
"learning_rate": 0.00010565508147712617, |
|
"loss": 2.5532, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.48740031897926633, |
|
"grad_norm": 0.32185062766075134, |
|
"learning_rate": 0.00010538605911222603, |
|
"loss": 2.722, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 0.4882509303561935, |
|
"grad_norm": 0.3155268728733063, |
|
"learning_rate": 0.00010511699764761936, |
|
"loss": 2.6655, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.4891015417331207, |
|
"grad_norm": 0.3241024315357208, |
|
"learning_rate": 0.00010484789903653846, |
|
"loss": 2.7093, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.48995215311004786, |
|
"grad_norm": 0.3311263620853424, |
|
"learning_rate": 0.00010457876523248518, |
|
"loss": 2.6485, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.490802764486975, |
|
"grad_norm": 0.34630128741264343, |
|
"learning_rate": 0.00010430959818921694, |
|
"loss": 2.8315, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 0.4916533758639022, |
|
"grad_norm": 0.34841713309288025, |
|
"learning_rate": 0.00010404039986073244, |
|
"loss": 2.7484, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 0.49250398724082933, |
|
"grad_norm": 0.35180601477622986, |
|
"learning_rate": 0.00010377117220125741, |
|
"loss": 2.6745, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 0.4933545986177565, |
|
"grad_norm": 0.38369500637054443, |
|
"learning_rate": 0.00010350191716523059, |
|
"loss": 2.6623, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.4942052099946837, |
|
"grad_norm": 0.3735206425189972, |
|
"learning_rate": 0.00010323263670728946, |
|
"loss": 2.5805, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 0.49505582137161086, |
|
"grad_norm": 0.3994956314563751, |
|
"learning_rate": 0.00010296333278225599, |
|
"loss": 2.5622, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.495906432748538, |
|
"grad_norm": 0.39151209592819214, |
|
"learning_rate": 0.00010269400734512256, |
|
"loss": 2.805, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 0.4967570441254652, |
|
"grad_norm": 0.40469613671302795, |
|
"learning_rate": 0.0001024246623510377, |
|
"loss": 2.7588, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.49760765550239233, |
|
"grad_norm": 0.4307393431663513, |
|
"learning_rate": 0.0001021552997552919, |
|
"loss": 2.8039, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.4984582668793195, |
|
"grad_norm": 0.41100749373435974, |
|
"learning_rate": 0.00010188592151330343, |
|
"loss": 2.6842, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 0.4993088782562467, |
|
"grad_norm": 0.45486176013946533, |
|
"learning_rate": 0.00010161652958060417, |
|
"loss": 2.8073, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 0.5001594896331738, |
|
"grad_norm": 0.43648669123649597, |
|
"learning_rate": 0.00010134712591282538, |
|
"loss": 2.638, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.5001594896331738, |
|
"eval_loss": 2.66949725151062, |
|
"eval_runtime": 80.5199, |
|
"eval_samples_per_second": 12.295, |
|
"eval_steps_per_second": 6.148, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.501010101010101, |
|
"grad_norm": 0.44425809383392334, |
|
"learning_rate": 0.00010107771246568345, |
|
"loss": 2.5832, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 0.5018607123870282, |
|
"grad_norm": 0.4519883990287781, |
|
"learning_rate": 0.00010080829119496586, |
|
"loss": 2.8142, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.5027113237639553, |
|
"grad_norm": 0.4547143280506134, |
|
"learning_rate": 0.0001005388640565168, |
|
"loss": 2.4661, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 0.5035619351408825, |
|
"grad_norm": 0.5073342323303223, |
|
"learning_rate": 0.00010026943300622313, |
|
"loss": 2.7849, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.5044125465178096, |
|
"grad_norm": 0.5203079581260681, |
|
"learning_rate": 0.0001, |
|
"loss": 2.6077, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 0.5052631578947369, |
|
"grad_norm": 0.4956808090209961, |
|
"learning_rate": 9.97305669937769e-05, |
|
"loss": 2.7116, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.506113769271664, |
|
"grad_norm": 0.5807839035987854, |
|
"learning_rate": 9.946113594348321e-05, |
|
"loss": 2.8024, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.5069643806485912, |
|
"grad_norm": 0.5834396481513977, |
|
"learning_rate": 9.919170880503415e-05, |
|
"loss": 2.6428, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.5078149920255184, |
|
"grad_norm": 0.5668088793754578, |
|
"learning_rate": 9.892228753431657e-05, |
|
"loss": 2.6711, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 0.5086656034024455, |
|
"grad_norm": 0.6034618616104126, |
|
"learning_rate": 9.865287408717465e-05, |
|
"loss": 2.9129, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.5095162147793727, |
|
"grad_norm": 0.5953810811042786, |
|
"learning_rate": 9.838347041939584e-05, |
|
"loss": 2.7463, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 0.5103668261562998, |
|
"grad_norm": 0.6790388822555542, |
|
"learning_rate": 9.811407848669657e-05, |
|
"loss": 2.8535, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.511217437533227, |
|
"grad_norm": 0.2997681200504303, |
|
"learning_rate": 9.784470024470812e-05, |
|
"loss": 2.4338, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 0.5120680489101542, |
|
"grad_norm": 0.28502747416496277, |
|
"learning_rate": 9.757533764896235e-05, |
|
"loss": 2.2975, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.5129186602870813, |
|
"grad_norm": 0.29402679204940796, |
|
"learning_rate": 9.730599265487745e-05, |
|
"loss": 2.6287, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 0.5137692716640085, |
|
"grad_norm": 0.28246256709098816, |
|
"learning_rate": 9.703666721774402e-05, |
|
"loss": 2.4197, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.5146198830409356, |
|
"grad_norm": 0.29511624574661255, |
|
"learning_rate": 9.676736329271059e-05, |
|
"loss": 2.6028, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.5154704944178629, |
|
"grad_norm": 0.2746260166168213, |
|
"learning_rate": 9.649808283476941e-05, |
|
"loss": 2.5791, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.51632110579479, |
|
"grad_norm": 0.2634756863117218, |
|
"learning_rate": 9.622882779874263e-05, |
|
"loss": 2.4199, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 0.5171717171717172, |
|
"grad_norm": 0.2797803282737732, |
|
"learning_rate": 9.595960013926761e-05, |
|
"loss": 2.5637, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.5180223285486444, |
|
"grad_norm": 0.2671017646789551, |
|
"learning_rate": 9.569040181078306e-05, |
|
"loss": 2.6811, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 0.5188729399255715, |
|
"grad_norm": 0.26546764373779297, |
|
"learning_rate": 9.542123476751483e-05, |
|
"loss": 2.5613, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.5197235513024987, |
|
"grad_norm": 0.2660638988018036, |
|
"learning_rate": 9.515210096346155e-05, |
|
"loss": 2.4644, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 0.5205741626794258, |
|
"grad_norm": 0.2697810232639313, |
|
"learning_rate": 9.488300235238067e-05, |
|
"loss": 2.5643, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.521424774056353, |
|
"grad_norm": 0.26712700724601746, |
|
"learning_rate": 9.461394088777402e-05, |
|
"loss": 2.6993, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 0.5222753854332801, |
|
"grad_norm": 0.2802102565765381, |
|
"learning_rate": 9.434491852287385e-05, |
|
"loss": 2.5723, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 0.5231259968102073, |
|
"grad_norm": 0.2690255641937256, |
|
"learning_rate": 9.407593721062859e-05, |
|
"loss": 2.5136, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.5239766081871345, |
|
"grad_norm": 0.2789754271507263, |
|
"learning_rate": 9.38069989036886e-05, |
|
"loss": 2.5901, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.5248272195640616, |
|
"grad_norm": 0.2947288751602173, |
|
"learning_rate": 9.353810555439203e-05, |
|
"loss": 2.5661, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 0.5256778309409889, |
|
"grad_norm": 0.268081396818161, |
|
"learning_rate": 9.326925911475075e-05, |
|
"loss": 2.603, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.526528442317916, |
|
"grad_norm": 0.2749037444591522, |
|
"learning_rate": 9.300046153643602e-05, |
|
"loss": 2.7176, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 0.5273790536948432, |
|
"grad_norm": 0.267333984375, |
|
"learning_rate": 9.27317147707644e-05, |
|
"loss": 2.5775, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.5282296650717704, |
|
"grad_norm": 0.2688322961330414, |
|
"learning_rate": 9.246302076868363e-05, |
|
"loss": 2.4796, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 0.5290802764486975, |
|
"grad_norm": 0.2832041382789612, |
|
"learning_rate": 9.219438148075832e-05, |
|
"loss": 2.5764, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 0.5299308878256247, |
|
"grad_norm": 0.3032709062099457, |
|
"learning_rate": 9.192579885715602e-05, |
|
"loss": 2.7559, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 0.5307814992025518, |
|
"grad_norm": 0.3065055012702942, |
|
"learning_rate": 9.165727484763282e-05, |
|
"loss": 2.7058, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.531632110579479, |
|
"grad_norm": 0.314698189496994, |
|
"learning_rate": 9.138881140151931e-05, |
|
"loss": 2.6227, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.5324827219564061, |
|
"grad_norm": 0.31485408544540405, |
|
"learning_rate": 9.112041046770653e-05, |
|
"loss": 2.5687, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 0.3281068801879883, |
|
"learning_rate": 9.085207399463162e-05, |
|
"loss": 2.5957, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 0.5341839447102605, |
|
"grad_norm": 0.3461722731590271, |
|
"learning_rate": 9.058380393026369e-05, |
|
"loss": 2.8056, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 0.5350345560871876, |
|
"grad_norm": 0.3617733418941498, |
|
"learning_rate": 9.031560222209002e-05, |
|
"loss": 2.8171, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 0.5358851674641149, |
|
"grad_norm": 0.34867244958877563, |
|
"learning_rate": 9.00474708171014e-05, |
|
"loss": 2.6481, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.536735778841042, |
|
"grad_norm": 0.3704431653022766, |
|
"learning_rate": 8.977941166177845e-05, |
|
"loss": 2.7771, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 0.5375863902179692, |
|
"grad_norm": 0.3661860525608063, |
|
"learning_rate": 8.951142670207717e-05, |
|
"loss": 2.5948, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 0.5384370015948964, |
|
"grad_norm": 0.3825220763683319, |
|
"learning_rate": 8.924351788341496e-05, |
|
"loss": 2.5843, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 0.5392876129718235, |
|
"grad_norm": 0.39650052785873413, |
|
"learning_rate": 8.897568715065657e-05, |
|
"loss": 2.7541, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 0.5401382243487507, |
|
"grad_norm": 0.4185117483139038, |
|
"learning_rate": 8.87079364480998e-05, |
|
"loss": 2.791, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.5409888357256778, |
|
"grad_norm": 0.41095203161239624, |
|
"learning_rate": 8.844026771946147e-05, |
|
"loss": 2.5707, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.541839447102605, |
|
"grad_norm": 0.4548766016960144, |
|
"learning_rate": 8.817268290786343e-05, |
|
"loss": 2.8622, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 0.5426900584795321, |
|
"grad_norm": 0.4231926202774048, |
|
"learning_rate": 8.790518395581822e-05, |
|
"loss": 2.5893, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 0.5435406698564593, |
|
"grad_norm": 0.441785603761673, |
|
"learning_rate": 8.763777280521511e-05, |
|
"loss": 2.7675, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 0.5443912812333865, |
|
"grad_norm": 0.4583280682563782, |
|
"learning_rate": 8.737045139730605e-05, |
|
"loss": 2.7461, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.5452418926103136, |
|
"grad_norm": 0.5172987580299377, |
|
"learning_rate": 8.71032216726914e-05, |
|
"loss": 2.9216, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 0.5460925039872409, |
|
"grad_norm": 0.49341312050819397, |
|
"learning_rate": 8.683608557130608e-05, |
|
"loss": 2.803, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 0.546943115364168, |
|
"grad_norm": 0.5035589933395386, |
|
"learning_rate": 8.656904503240527e-05, |
|
"loss": 2.869, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 0.5477937267410952, |
|
"grad_norm": 0.5143235921859741, |
|
"learning_rate": 8.630210199455041e-05, |
|
"loss": 2.7889, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.5486443381180224, |
|
"grad_norm": 0.5110295414924622, |
|
"learning_rate": 8.603525839559523e-05, |
|
"loss": 2.8229, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.5494949494949495, |
|
"grad_norm": 0.555565595626831, |
|
"learning_rate": 8.57685161726715e-05, |
|
"loss": 2.6461, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 0.5503455608718767, |
|
"grad_norm": 0.56583172082901, |
|
"learning_rate": 8.550187726217507e-05, |
|
"loss": 2.7647, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 0.5511961722488038, |
|
"grad_norm": 0.6618173718452454, |
|
"learning_rate": 8.523534359975189e-05, |
|
"loss": 2.6992, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.552046783625731, |
|
"grad_norm": 0.6295120120048523, |
|
"learning_rate": 8.496891712028375e-05, |
|
"loss": 2.6303, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 0.5528973950026581, |
|
"grad_norm": 0.6757770776748657, |
|
"learning_rate": 8.470259975787438e-05, |
|
"loss": 2.8003, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.5537480063795853, |
|
"grad_norm": 0.26357659697532654, |
|
"learning_rate": 8.443639344583547e-05, |
|
"loss": 2.4452, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 0.5545986177565125, |
|
"grad_norm": 0.31750619411468506, |
|
"learning_rate": 8.417030011667241e-05, |
|
"loss": 2.374, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 0.5554492291334396, |
|
"grad_norm": 0.27070245146751404, |
|
"learning_rate": 8.390432170207057e-05, |
|
"loss": 2.3384, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 0.5562998405103668, |
|
"grad_norm": 0.28004199266433716, |
|
"learning_rate": 8.363846013288095e-05, |
|
"loss": 2.4357, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 0.557150451887294, |
|
"grad_norm": 0.323215126991272, |
|
"learning_rate": 8.337271733910637e-05, |
|
"loss": 2.3801, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.5580010632642212, |
|
"grad_norm": 0.292837530374527, |
|
"learning_rate": 8.310709524988743e-05, |
|
"loss": 2.3924, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 0.5588516746411484, |
|
"grad_norm": 0.2810145914554596, |
|
"learning_rate": 8.284159579348851e-05, |
|
"loss": 2.5257, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 0.5597022860180755, |
|
"grad_norm": 0.2848433554172516, |
|
"learning_rate": 8.257622089728362e-05, |
|
"loss": 2.5743, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 0.5605528973950027, |
|
"grad_norm": 0.28531527519226074, |
|
"learning_rate": 8.231097248774274e-05, |
|
"loss": 2.5492, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 0.5614035087719298, |
|
"grad_norm": 0.2882923185825348, |
|
"learning_rate": 8.20458524904174e-05, |
|
"loss": 2.4982, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.562254120148857, |
|
"grad_norm": 0.2807391583919525, |
|
"learning_rate": 8.178086282992705e-05, |
|
"loss": 2.5222, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 0.5631047315257841, |
|
"grad_norm": 0.29856112599372864, |
|
"learning_rate": 8.151600542994506e-05, |
|
"loss": 2.723, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 0.5639553429027113, |
|
"grad_norm": 0.2890356481075287, |
|
"learning_rate": 8.125128221318446e-05, |
|
"loss": 2.6931, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 0.5648059542796385, |
|
"grad_norm": 0.28091031312942505, |
|
"learning_rate": 8.098669510138437e-05, |
|
"loss": 2.6273, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 0.5656565656565656, |
|
"grad_norm": 0.2678775489330292, |
|
"learning_rate": 8.072224601529574e-05, |
|
"loss": 2.5571, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.5665071770334928, |
|
"grad_norm": 0.2804068624973297, |
|
"learning_rate": 8.045793687466757e-05, |
|
"loss": 2.4443, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 0.56735778841042, |
|
"grad_norm": 0.279041051864624, |
|
"learning_rate": 8.0193769598233e-05, |
|
"loss": 2.6947, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 0.5682083997873472, |
|
"grad_norm": 0.27611520886421204, |
|
"learning_rate": 7.992974610369521e-05, |
|
"loss": 2.782, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 0.5690590111642744, |
|
"grad_norm": 0.28445136547088623, |
|
"learning_rate": 7.966586830771367e-05, |
|
"loss": 2.5741, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 0.5699096225412015, |
|
"grad_norm": 0.2607346773147583, |
|
"learning_rate": 7.940213812589018e-05, |
|
"loss": 2.4522, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.5707602339181287, |
|
"grad_norm": 0.2881016433238983, |
|
"learning_rate": 7.913855747275489e-05, |
|
"loss": 2.6714, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 0.5716108452950558, |
|
"grad_norm": 0.3019566237926483, |
|
"learning_rate": 7.887512826175248e-05, |
|
"loss": 2.7117, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 0.572461456671983, |
|
"grad_norm": 0.3051791489124298, |
|
"learning_rate": 7.861185240522827e-05, |
|
"loss": 2.6867, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 0.5733120680489101, |
|
"grad_norm": 0.32132992148399353, |
|
"learning_rate": 7.834873181441427e-05, |
|
"loss": 2.5278, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 0.5741626794258373, |
|
"grad_norm": 0.32348886132240295, |
|
"learning_rate": 7.808576839941542e-05, |
|
"loss": 2.7507, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.5750132908027645, |
|
"grad_norm": 0.32297685742378235, |
|
"learning_rate": 7.782296406919557e-05, |
|
"loss": 2.6857, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 0.5758639021796916, |
|
"grad_norm": 0.34016191959381104, |
|
"learning_rate": 7.756032073156373e-05, |
|
"loss": 2.57, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 0.5767145135566188, |
|
"grad_norm": 0.349483847618103, |
|
"learning_rate": 7.729784029316025e-05, |
|
"loss": 2.6255, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 0.577565124933546, |
|
"grad_norm": 0.3523537218570709, |
|
"learning_rate": 7.703552465944287e-05, |
|
"loss": 2.8317, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 0.5784157363104732, |
|
"grad_norm": 0.38119909167289734, |
|
"learning_rate": 7.677337573467294e-05, |
|
"loss": 2.7303, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.5792663476874004, |
|
"grad_norm": 0.4053308367729187, |
|
"learning_rate": 7.651139542190164e-05, |
|
"loss": 2.6776, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 0.5801169590643275, |
|
"grad_norm": 0.38038370013237, |
|
"learning_rate": 7.624958562295606e-05, |
|
"loss": 2.6411, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 0.5809675704412547, |
|
"grad_norm": 0.414034903049469, |
|
"learning_rate": 7.598794823842557e-05, |
|
"loss": 2.8368, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 0.5818181818181818, |
|
"grad_norm": 0.41362208127975464, |
|
"learning_rate": 7.572648516764777e-05, |
|
"loss": 2.7608, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 0.582668793195109, |
|
"grad_norm": 0.41102275252342224, |
|
"learning_rate": 7.54651983086949e-05, |
|
"loss": 2.7317, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.5835194045720361, |
|
"grad_norm": 0.45451274514198303, |
|
"learning_rate": 7.520408955836007e-05, |
|
"loss": 2.7164, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 0.5843700159489633, |
|
"grad_norm": 0.42562657594680786, |
|
"learning_rate": 7.494316081214334e-05, |
|
"loss": 2.6641, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 0.5852206273258905, |
|
"grad_norm": 0.46389469504356384, |
|
"learning_rate": 7.468241396423801e-05, |
|
"loss": 2.6883, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 0.5860712387028176, |
|
"grad_norm": 0.42721521854400635, |
|
"learning_rate": 7.442185090751705e-05, |
|
"loss": 2.5853, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 0.5869218500797448, |
|
"grad_norm": 0.5020928978919983, |
|
"learning_rate": 7.416147353351909e-05, |
|
"loss": 2.6468, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.587772461456672, |
|
"grad_norm": 0.4806350767612457, |
|
"learning_rate": 7.390128373243479e-05, |
|
"loss": 2.5275, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 0.5886230728335992, |
|
"grad_norm": 0.5264183282852173, |
|
"learning_rate": 7.364128339309326e-05, |
|
"loss": 2.6082, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 0.5894736842105263, |
|
"grad_norm": 0.5251814126968384, |
|
"learning_rate": 7.338147440294809e-05, |
|
"loss": 2.6319, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 0.5903242955874535, |
|
"grad_norm": 0.5503838658332825, |
|
"learning_rate": 7.312185864806391e-05, |
|
"loss": 2.6875, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 0.5911749069643807, |
|
"grad_norm": 0.5717040300369263, |
|
"learning_rate": 7.286243801310248e-05, |
|
"loss": 2.6859, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.5920255183413078, |
|
"grad_norm": 0.5679376125335693, |
|
"learning_rate": 7.260321438130913e-05, |
|
"loss": 2.6425, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 0.592876129718235, |
|
"grad_norm": 0.6636802554130554, |
|
"learning_rate": 7.234418963449907e-05, |
|
"loss": 2.7617, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 0.5937267410951621, |
|
"grad_norm": 0.6101743578910828, |
|
"learning_rate": 7.208536565304373e-05, |
|
"loss": 2.6909, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 0.5945773524720893, |
|
"grad_norm": 0.6630806922912598, |
|
"learning_rate": 7.182674431585704e-05, |
|
"loss": 2.7461, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 0.5954279638490165, |
|
"grad_norm": 0.738452136516571, |
|
"learning_rate": 7.156832750038192e-05, |
|
"loss": 2.8512, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.5962785752259436, |
|
"grad_norm": 0.2764449715614319, |
|
"learning_rate": 7.131011708257654e-05, |
|
"loss": 2.4965, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 0.5971291866028708, |
|
"grad_norm": 0.27585679292678833, |
|
"learning_rate": 7.105211493690073e-05, |
|
"loss": 2.5628, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 0.597979797979798, |
|
"grad_norm": 0.2890099287033081, |
|
"learning_rate": 7.079432293630244e-05, |
|
"loss": 2.6482, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 0.5988304093567252, |
|
"grad_norm": 0.27924832701683044, |
|
"learning_rate": 7.0536742952204e-05, |
|
"loss": 2.4101, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 0.5996810207336523, |
|
"grad_norm": 0.30849024653434753, |
|
"learning_rate": 7.02793768544887e-05, |
|
"loss": 2.4998, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.6005316321105795, |
|
"grad_norm": 0.286748468875885, |
|
"learning_rate": 7.002222651148714e-05, |
|
"loss": 2.3868, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 0.6013822434875067, |
|
"grad_norm": 0.2908494472503662, |
|
"learning_rate": 6.976529378996357e-05, |
|
"loss": 2.4456, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 0.6022328548644338, |
|
"grad_norm": 0.26911187171936035, |
|
"learning_rate": 6.950858055510254e-05, |
|
"loss": 2.633, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 0.603083466241361, |
|
"grad_norm": 0.33310920000076294, |
|
"learning_rate": 6.925208867049522e-05, |
|
"loss": 2.8533, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 0.6039340776182881, |
|
"grad_norm": 0.2783631682395935, |
|
"learning_rate": 6.89958199981258e-05, |
|
"loss": 2.4214, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.6047846889952153, |
|
"grad_norm": 0.2714787721633911, |
|
"learning_rate": 6.873977639835829e-05, |
|
"loss": 2.6986, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 0.6056353003721425, |
|
"grad_norm": 0.2905879616737366, |
|
"learning_rate": 6.848395972992261e-05, |
|
"loss": 2.5218, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 0.6064859117490696, |
|
"grad_norm": 0.288647323846817, |
|
"learning_rate": 6.822837184990132e-05, |
|
"loss": 2.6838, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 0.6073365231259968, |
|
"grad_norm": 0.2918678820133209, |
|
"learning_rate": 6.797301461371625e-05, |
|
"loss": 2.6513, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 0.6081871345029239, |
|
"grad_norm": 0.26965370774269104, |
|
"learning_rate": 6.771788987511469e-05, |
|
"loss": 2.5379, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.6090377458798512, |
|
"grad_norm": 0.27741289138793945, |
|
"learning_rate": 6.746299948615631e-05, |
|
"loss": 2.6699, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 0.6098883572567783, |
|
"grad_norm": 0.26440632343292236, |
|
"learning_rate": 6.720834529719939e-05, |
|
"loss": 2.6012, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 0.6107389686337055, |
|
"grad_norm": 0.27086710929870605, |
|
"learning_rate": 6.695392915688759e-05, |
|
"loss": 2.5854, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 0.6115895800106327, |
|
"grad_norm": 0.2577430009841919, |
|
"learning_rate": 6.66997529121365e-05, |
|
"loss": 2.4548, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 0.6124401913875598, |
|
"grad_norm": 0.2519771158695221, |
|
"learning_rate": 6.644581840812018e-05, |
|
"loss": 2.4582, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.613290802764487, |
|
"grad_norm": 0.31657370924949646, |
|
"learning_rate": 6.619212748825776e-05, |
|
"loss": 2.5846, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 0.6141414141414141, |
|
"grad_norm": 0.28847917914390564, |
|
"learning_rate": 6.593868199420017e-05, |
|
"loss": 2.7865, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 0.6149920255183413, |
|
"grad_norm": 0.29375341534614563, |
|
"learning_rate": 6.568548376581662e-05, |
|
"loss": 2.3419, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 0.6158426368952685, |
|
"grad_norm": 0.320289671421051, |
|
"learning_rate": 6.543253464118131e-05, |
|
"loss": 2.7476, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 0.6166932482721956, |
|
"grad_norm": 0.3045991063117981, |
|
"learning_rate": 6.517983645656014e-05, |
|
"loss": 2.5995, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.6175438596491228, |
|
"grad_norm": 0.3231359124183655, |
|
"learning_rate": 6.492739104639727e-05, |
|
"loss": 2.6869, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 0.6183944710260499, |
|
"grad_norm": 0.36181551218032837, |
|
"learning_rate": 6.467520024330193e-05, |
|
"loss": 2.8256, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 0.6192450824029772, |
|
"grad_norm": 0.34118232131004333, |
|
"learning_rate": 6.4423265878035e-05, |
|
"loss": 2.7321, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 0.6200956937799043, |
|
"grad_norm": 0.3405606746673584, |
|
"learning_rate": 6.417158977949575e-05, |
|
"loss": 2.6993, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 0.6209463051568315, |
|
"grad_norm": 0.355055570602417, |
|
"learning_rate": 6.392017377470866e-05, |
|
"loss": 2.7063, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.6217969165337587, |
|
"grad_norm": 0.368937611579895, |
|
"learning_rate": 6.366901968881002e-05, |
|
"loss": 2.682, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 0.6226475279106858, |
|
"grad_norm": 0.3755683898925781, |
|
"learning_rate": 6.341812934503469e-05, |
|
"loss": 2.7295, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 0.623498139287613, |
|
"grad_norm": 0.37153956294059753, |
|
"learning_rate": 6.316750456470303e-05, |
|
"loss": 2.7157, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 0.6243487506645401, |
|
"grad_norm": 0.4051001965999603, |
|
"learning_rate": 6.291714716720749e-05, |
|
"loss": 2.8429, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 0.6251993620414673, |
|
"grad_norm": 0.3929100036621094, |
|
"learning_rate": 6.26670589699995e-05, |
|
"loss": 2.7148, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.6260499734183945, |
|
"grad_norm": 0.4600156843662262, |
|
"learning_rate": 6.24172417885762e-05, |
|
"loss": 2.765, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 0.6269005847953216, |
|
"grad_norm": 0.42697155475616455, |
|
"learning_rate": 6.216769743646733e-05, |
|
"loss": 2.6477, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 0.6277511961722488, |
|
"grad_norm": 0.4428333640098572, |
|
"learning_rate": 6.191842772522214e-05, |
|
"loss": 2.9439, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 0.6286018075491759, |
|
"grad_norm": 0.45895466208457947, |
|
"learning_rate": 6.166943446439604e-05, |
|
"loss": 2.6916, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 0.6294524189261032, |
|
"grad_norm": 0.4783354103565216, |
|
"learning_rate": 6.142071946153751e-05, |
|
"loss": 2.6747, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.6303030303030303, |
|
"grad_norm": 0.4584222733974457, |
|
"learning_rate": 6.117228452217525e-05, |
|
"loss": 2.8631, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 0.6311536416799575, |
|
"grad_norm": 0.5074661374092102, |
|
"learning_rate": 6.092413144980464e-05, |
|
"loss": 2.7687, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 0.6320042530568847, |
|
"grad_norm": 0.500322699546814, |
|
"learning_rate": 6.0676262045874976e-05, |
|
"loss": 2.6794, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 0.6328548644338118, |
|
"grad_norm": 0.4991973042488098, |
|
"learning_rate": 6.04286781097763e-05, |
|
"loss": 2.739, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 0.633705475810739, |
|
"grad_norm": 0.5203631520271301, |
|
"learning_rate": 6.018138143882621e-05, |
|
"loss": 2.5866, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.6345560871876661, |
|
"grad_norm": 0.5206765532493591, |
|
"learning_rate": 5.9934373828257106e-05, |
|
"loss": 2.5869, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 0.6354066985645933, |
|
"grad_norm": 0.5892865061759949, |
|
"learning_rate": 5.96876570712028e-05, |
|
"loss": 2.6141, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 0.6362573099415205, |
|
"grad_norm": 0.6528813242912292, |
|
"learning_rate": 5.944123295868573e-05, |
|
"loss": 2.7467, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 0.6371079213184476, |
|
"grad_norm": 0.6801220178604126, |
|
"learning_rate": 5.9195103279603956e-05, |
|
"loss": 2.8362, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 0.6379585326953748, |
|
"grad_norm": 0.7539176940917969, |
|
"learning_rate": 5.894926982071804e-05, |
|
"loss": 2.8246, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.6388091440723019, |
|
"grad_norm": 0.2966770827770233, |
|
"learning_rate": 5.870373436663823e-05, |
|
"loss": 2.6906, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 0.6396597554492292, |
|
"grad_norm": 0.27538472414016724, |
|
"learning_rate": 5.845849869981137e-05, |
|
"loss": 2.4334, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 0.6405103668261563, |
|
"grad_norm": 0.2812007665634155, |
|
"learning_rate": 5.821356460050805e-05, |
|
"loss": 2.4665, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 0.6413609782030835, |
|
"grad_norm": 0.28760018944740295, |
|
"learning_rate": 5.796893384680964e-05, |
|
"loss": 2.6608, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 0.6422115895800107, |
|
"grad_norm": 0.2786334455013275, |
|
"learning_rate": 5.772460821459542e-05, |
|
"loss": 2.4717, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.6430622009569378, |
|
"grad_norm": 0.27650853991508484, |
|
"learning_rate": 5.7480589477529545e-05, |
|
"loss": 2.5303, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.643912812333865, |
|
"grad_norm": 0.29573655128479004, |
|
"learning_rate": 5.723687940704856e-05, |
|
"loss": 2.6558, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 0.6447634237107921, |
|
"grad_norm": 0.2814992070198059, |
|
"learning_rate": 5.699347977234799e-05, |
|
"loss": 2.4268, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 0.6456140350877193, |
|
"grad_norm": 0.2827305495738983, |
|
"learning_rate": 5.675039234036983e-05, |
|
"loss": 2.5284, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 0.6464646464646465, |
|
"grad_norm": 0.2877061665058136, |
|
"learning_rate": 5.650761887578977e-05, |
|
"loss": 2.5597, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.6473152578415736, |
|
"grad_norm": 0.2902846038341522, |
|
"learning_rate": 5.6265161141004244e-05, |
|
"loss": 2.719, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 0.6481658692185008, |
|
"grad_norm": 0.3014174997806549, |
|
"learning_rate": 5.602302089611755e-05, |
|
"loss": 2.6498, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 0.6490164805954279, |
|
"grad_norm": 0.2754184603691101, |
|
"learning_rate": 5.578119989892931e-05, |
|
"loss": 2.7288, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 0.6498670919723551, |
|
"grad_norm": 0.27788618206977844, |
|
"learning_rate": 5.5539699904921635e-05, |
|
"loss": 2.5285, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 0.6507177033492823, |
|
"grad_norm": 0.27178868651390076, |
|
"learning_rate": 5.529852266724616e-05, |
|
"loss": 2.6693, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.6515683147262095, |
|
"grad_norm": 0.2783076763153076, |
|
"learning_rate": 5.505766993671162e-05, |
|
"loss": 2.6003, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 0.6524189261031367, |
|
"grad_norm": 0.27819153666496277, |
|
"learning_rate": 5.481714346177103e-05, |
|
"loss": 2.5775, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 0.6532695374800638, |
|
"grad_norm": 0.27801868319511414, |
|
"learning_rate": 5.457694498850891e-05, |
|
"loss": 2.6342, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.654120148856991, |
|
"grad_norm": 0.27529436349868774, |
|
"learning_rate": 5.43370762606287e-05, |
|
"loss": 2.6376, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 0.6549707602339181, |
|
"grad_norm": 0.2912578582763672, |
|
"learning_rate": 5.409753901944006e-05, |
|
"loss": 2.8169, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.6558213716108453, |
|
"grad_norm": 0.28431615233421326, |
|
"learning_rate": 5.385833500384632e-05, |
|
"loss": 2.5608, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 0.6566719829877724, |
|
"grad_norm": 0.3062645196914673, |
|
"learning_rate": 5.3619465950331646e-05, |
|
"loss": 2.6686, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 0.6575225943646996, |
|
"grad_norm": 0.29184243083000183, |
|
"learning_rate": 5.3380933592948704e-05, |
|
"loss": 2.4963, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 0.6583732057416268, |
|
"grad_norm": 0.3256790041923523, |
|
"learning_rate": 5.3142739663305906e-05, |
|
"loss": 2.5206, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 0.6592238171185539, |
|
"grad_norm": 0.3323804438114166, |
|
"learning_rate": 5.2904885890554836e-05, |
|
"loss": 2.7495, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.6600744284954811, |
|
"grad_norm": 0.3358718454837799, |
|
"learning_rate": 5.266737400137765e-05, |
|
"loss": 2.646, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 0.6609250398724082, |
|
"grad_norm": 0.34872984886169434, |
|
"learning_rate": 5.24302057199749e-05, |
|
"loss": 2.8041, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 0.6617756512493355, |
|
"grad_norm": 0.3432627320289612, |
|
"learning_rate": 5.2193382768052436e-05, |
|
"loss": 2.6066, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 0.6626262626262627, |
|
"grad_norm": 0.3716273009777069, |
|
"learning_rate": 5.19569068648094e-05, |
|
"loss": 2.6634, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 0.6634768740031898, |
|
"grad_norm": 0.3645707964897156, |
|
"learning_rate": 5.172077972692553e-05, |
|
"loss": 2.5782, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.664327485380117, |
|
"grad_norm": 0.43927833437919617, |
|
"learning_rate": 5.148500306854862e-05, |
|
"loss": 2.6775, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 0.6651780967570441, |
|
"grad_norm": 0.4325307607650757, |
|
"learning_rate": 5.124957860128237e-05, |
|
"loss": 2.6963, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 0.6660287081339713, |
|
"grad_norm": 0.39009419083595276, |
|
"learning_rate": 5.101450803417357e-05, |
|
"loss": 2.7308, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 0.6668793195108984, |
|
"grad_norm": 0.42211011052131653, |
|
"learning_rate": 5.0779793073700044e-05, |
|
"loss": 2.8391, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.6677299308878256, |
|
"grad_norm": 0.4249938130378723, |
|
"learning_rate": 5.054543542375809e-05, |
|
"loss": 2.6562, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.6685805422647528, |
|
"grad_norm": 0.4374600350856781, |
|
"learning_rate": 5.031143678565005e-05, |
|
"loss": 2.6552, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 0.6694311536416799, |
|
"grad_norm": 0.471137672662735, |
|
"learning_rate": 5.0077798858072156e-05, |
|
"loss": 2.6368, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 0.6702817650186071, |
|
"grad_norm": 0.4666772186756134, |
|
"learning_rate": 4.984452333710207e-05, |
|
"loss": 2.6859, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 0.6711323763955342, |
|
"grad_norm": 0.5150439739227295, |
|
"learning_rate": 4.961161191618649e-05, |
|
"loss": 2.9448, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 0.6719829877724615, |
|
"grad_norm": 0.4853833019733429, |
|
"learning_rate": 4.937906628612905e-05, |
|
"loss": 2.7409, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.6728335991493887, |
|
"grad_norm": 0.4760463535785675, |
|
"learning_rate": 4.914688813507797e-05, |
|
"loss": 2.5815, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 0.6736842105263158, |
|
"grad_norm": 0.5204352140426636, |
|
"learning_rate": 4.89150791485137e-05, |
|
"loss": 2.7374, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 0.674534821903243, |
|
"grad_norm": 0.5126561522483826, |
|
"learning_rate": 4.86836410092368e-05, |
|
"loss": 2.755, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 0.6753854332801701, |
|
"grad_norm": 0.55113685131073, |
|
"learning_rate": 4.845257539735577e-05, |
|
"loss": 2.7027, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 0.6762360446570973, |
|
"grad_norm": 0.5475419759750366, |
|
"learning_rate": 4.822188399027461e-05, |
|
"loss": 2.667, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.6770866560340244, |
|
"grad_norm": 0.6212125420570374, |
|
"learning_rate": 4.799156846268095e-05, |
|
"loss": 2.6865, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 0.6779372674109516, |
|
"grad_norm": 0.6271610260009766, |
|
"learning_rate": 4.7761630486533694e-05, |
|
"loss": 2.9713, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 0.6787878787878788, |
|
"grad_norm": 0.6181414723396301, |
|
"learning_rate": 4.7532071731050975e-05, |
|
"loss": 2.8862, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 0.6796384901648059, |
|
"grad_norm": 0.6996387243270874, |
|
"learning_rate": 4.730289386269792e-05, |
|
"loss": 2.9082, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 0.6804891015417331, |
|
"grad_norm": 0.7603393793106079, |
|
"learning_rate": 4.70740985451747e-05, |
|
"loss": 3.034, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.6813397129186602, |
|
"grad_norm": 0.27659621834754944, |
|
"learning_rate": 4.684568743940444e-05, |
|
"loss": 2.5099, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 0.6821903242955875, |
|
"grad_norm": 0.29582586884498596, |
|
"learning_rate": 4.661766220352097e-05, |
|
"loss": 2.5086, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 0.6830409356725147, |
|
"grad_norm": 0.31561413407325745, |
|
"learning_rate": 4.639002449285693e-05, |
|
"loss": 2.5214, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 0.6838915470494418, |
|
"grad_norm": 0.27388331294059753, |
|
"learning_rate": 4.616277595993196e-05, |
|
"loss": 2.4724, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 0.684742158426369, |
|
"grad_norm": 0.2823973000049591, |
|
"learning_rate": 4.593591825444028e-05, |
|
"loss": 2.4987, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.6855927698032961, |
|
"grad_norm": 0.3274148106575012, |
|
"learning_rate": 4.57094530232389e-05, |
|
"loss": 2.6108, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 0.6864433811802233, |
|
"grad_norm": 0.27294856309890747, |
|
"learning_rate": 4.5483381910335955e-05, |
|
"loss": 2.4774, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 0.6872939925571504, |
|
"grad_norm": 0.2771831750869751, |
|
"learning_rate": 4.525770655687821e-05, |
|
"loss": 2.4953, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 0.6881446039340776, |
|
"grad_norm": 0.28946414589881897, |
|
"learning_rate": 4.5032428601139644e-05, |
|
"loss": 2.7516, |
|
"step": 809 |
|
}, |
|
{ |
|
"epoch": 0.6889952153110048, |
|
"grad_norm": 0.2817944884300232, |
|
"learning_rate": 4.48075496785092e-05, |
|
"loss": 2.6658, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.6898458266879319, |
|
"grad_norm": 0.2726970613002777, |
|
"learning_rate": 4.4583071421479194e-05, |
|
"loss": 2.6478, |
|
"step": 811 |
|
}, |
|
{ |
|
"epoch": 0.6906964380648591, |
|
"grad_norm": 0.2892582416534424, |
|
"learning_rate": 4.435899545963332e-05, |
|
"loss": 2.6592, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 0.6915470494417862, |
|
"grad_norm": 0.2825833559036255, |
|
"learning_rate": 4.4135323419634766e-05, |
|
"loss": 2.4431, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 0.6923976608187135, |
|
"grad_norm": 0.27093660831451416, |
|
"learning_rate": 4.391205692521453e-05, |
|
"loss": 2.4303, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 0.6932482721956407, |
|
"grad_norm": 0.2554977834224701, |
|
"learning_rate": 4.368919759715964e-05, |
|
"loss": 2.3259, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.6940988835725678, |
|
"grad_norm": 0.27814123034477234, |
|
"learning_rate": 4.346674705330117e-05, |
|
"loss": 2.5495, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 0.694949494949495, |
|
"grad_norm": 0.303751140832901, |
|
"learning_rate": 4.32447069085028e-05, |
|
"loss": 2.5834, |
|
"step": 817 |
|
}, |
|
{ |
|
"epoch": 0.6958001063264221, |
|
"grad_norm": 0.27305588126182556, |
|
"learning_rate": 4.302307877464893e-05, |
|
"loss": 2.479, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 0.6966507177033493, |
|
"grad_norm": 0.265235036611557, |
|
"learning_rate": 4.280186426063291e-05, |
|
"loss": 2.4847, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 0.6975013290802764, |
|
"grad_norm": 0.2711983025074005, |
|
"learning_rate": 4.258106497234551e-05, |
|
"loss": 2.3499, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.6983519404572036, |
|
"grad_norm": 0.30092155933380127, |
|
"learning_rate": 4.236068251266324e-05, |
|
"loss": 2.7319, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 0.6992025518341308, |
|
"grad_norm": 0.3007088601589203, |
|
"learning_rate": 4.214071848143655e-05, |
|
"loss": 2.5465, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 0.7000531632110579, |
|
"grad_norm": 0.3158135712146759, |
|
"learning_rate": 4.192117447547845e-05, |
|
"loss": 2.6333, |
|
"step": 823 |
|
}, |
|
{ |
|
"epoch": 0.7009037745879851, |
|
"grad_norm": 0.3173391819000244, |
|
"learning_rate": 4.170205208855281e-05, |
|
"loss": 2.8266, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 0.7017543859649122, |
|
"grad_norm": 0.3236771523952484, |
|
"learning_rate": 4.148335291136267e-05, |
|
"loss": 2.7447, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.7026049973418395, |
|
"grad_norm": 0.320290744304657, |
|
"learning_rate": 4.1265078531538916e-05, |
|
"loss": 2.8345, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 0.7034556087187667, |
|
"grad_norm": 0.33358582854270935, |
|
"learning_rate": 4.104723053362867e-05, |
|
"loss": 2.848, |
|
"step": 827 |
|
}, |
|
{ |
|
"epoch": 0.7043062200956938, |
|
"grad_norm": 0.35341066122055054, |
|
"learning_rate": 4.082981049908362e-05, |
|
"loss": 2.7216, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 0.705156831472621, |
|
"grad_norm": 0.35734865069389343, |
|
"learning_rate": 4.061282000624885e-05, |
|
"loss": 2.702, |
|
"step": 829 |
|
}, |
|
{ |
|
"epoch": 0.7060074428495481, |
|
"grad_norm": 0.3350493013858795, |
|
"learning_rate": 4.0396260630351066e-05, |
|
"loss": 2.4811, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.7068580542264753, |
|
"grad_norm": 0.35712873935699463, |
|
"learning_rate": 4.018013394348752e-05, |
|
"loss": 2.7484, |
|
"step": 831 |
|
}, |
|
{ |
|
"epoch": 0.7077086656034024, |
|
"grad_norm": 0.37364643812179565, |
|
"learning_rate": 3.996444151461417e-05, |
|
"loss": 2.6113, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 0.7085592769803296, |
|
"grad_norm": 0.41270262002944946, |
|
"learning_rate": 3.9749184909534565e-05, |
|
"loss": 2.6654, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 0.7094098883572568, |
|
"grad_norm": 0.3864672780036926, |
|
"learning_rate": 3.9534365690888566e-05, |
|
"loss": 2.6718, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 0.7102604997341839, |
|
"grad_norm": 0.4115494191646576, |
|
"learning_rate": 3.931998541814069e-05, |
|
"loss": 2.6621, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.7111111111111111, |
|
"grad_norm": 0.4606288969516754, |
|
"learning_rate": 3.9106045647569e-05, |
|
"loss": 2.512, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 0.7119617224880382, |
|
"grad_norm": 0.42326587438583374, |
|
"learning_rate": 3.8892547932253795e-05, |
|
"loss": 2.6212, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 0.7128123338649655, |
|
"grad_norm": 0.43627670407295227, |
|
"learning_rate": 3.8679493822066314e-05, |
|
"loss": 2.596, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 0.7136629452418927, |
|
"grad_norm": 0.4475801885128021, |
|
"learning_rate": 3.846688486365748e-05, |
|
"loss": 2.64, |
|
"step": 839 |
|
}, |
|
{ |
|
"epoch": 0.7145135566188198, |
|
"grad_norm": 0.4370073080062866, |
|
"learning_rate": 3.825472260044658e-05, |
|
"loss": 2.5291, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.715364167995747, |
|
"grad_norm": 0.4832041561603546, |
|
"learning_rate": 3.804300857261025e-05, |
|
"loss": 2.659, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 0.7162147793726741, |
|
"grad_norm": 0.4905672073364258, |
|
"learning_rate": 3.783174431707119e-05, |
|
"loss": 2.7363, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 0.7170653907496013, |
|
"grad_norm": 0.5135957598686218, |
|
"learning_rate": 3.762093136748692e-05, |
|
"loss": 2.8243, |
|
"step": 843 |
|
}, |
|
{ |
|
"epoch": 0.7179160021265284, |
|
"grad_norm": 0.5257413387298584, |
|
"learning_rate": 3.7410571254238834e-05, |
|
"loss": 2.8406, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 0.7187666135034556, |
|
"grad_norm": 0.5150455236434937, |
|
"learning_rate": 3.7200665504420983e-05, |
|
"loss": 2.6053, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.7196172248803828, |
|
"grad_norm": 0.5135449767112732, |
|
"learning_rate": 3.69912156418289e-05, |
|
"loss": 2.4932, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 0.7204678362573099, |
|
"grad_norm": 0.5775859951972961, |
|
"learning_rate": 3.678222318694875e-05, |
|
"loss": 2.6011, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 0.7213184476342371, |
|
"grad_norm": 0.6324962973594666, |
|
"learning_rate": 3.657368965694617e-05, |
|
"loss": 2.8145, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 0.7221690590111642, |
|
"grad_norm": 0.6755786538124084, |
|
"learning_rate": 3.636561656565519e-05, |
|
"loss": 2.7566, |
|
"step": 849 |
|
}, |
|
{ |
|
"epoch": 0.7230196703880915, |
|
"grad_norm": 0.7153156995773315, |
|
"learning_rate": 3.615800542356738e-05, |
|
"loss": 2.9433, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.7238702817650186, |
|
"grad_norm": 0.29322749376296997, |
|
"learning_rate": 3.595085773782083e-05, |
|
"loss": 2.3433, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 0.7247208931419458, |
|
"grad_norm": 0.28010284900665283, |
|
"learning_rate": 3.574417501218913e-05, |
|
"loss": 2.5661, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 0.725571504518873, |
|
"grad_norm": 0.27005431056022644, |
|
"learning_rate": 3.55379587470706e-05, |
|
"loss": 2.4129, |
|
"step": 853 |
|
}, |
|
{ |
|
"epoch": 0.7264221158958001, |
|
"grad_norm": 0.2710054814815521, |
|
"learning_rate": 3.533221043947733e-05, |
|
"loss": 2.4066, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 0.7272727272727273, |
|
"grad_norm": 0.27026283740997314, |
|
"learning_rate": 3.512693158302421e-05, |
|
"loss": 2.4139, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.7281233386496544, |
|
"grad_norm": 0.2927492558956146, |
|
"learning_rate": 3.492212366791831e-05, |
|
"loss": 2.5517, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 0.7289739500265816, |
|
"grad_norm": 0.2770216166973114, |
|
"learning_rate": 3.471778818094785e-05, |
|
"loss": 2.6102, |
|
"step": 857 |
|
}, |
|
{ |
|
"epoch": 0.7298245614035088, |
|
"grad_norm": 0.26914748549461365, |
|
"learning_rate": 3.45139266054715e-05, |
|
"loss": 2.6047, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 0.7306751727804359, |
|
"grad_norm": 0.27307602763175964, |
|
"learning_rate": 3.4310540421407665e-05, |
|
"loss": 2.5707, |
|
"step": 859 |
|
}, |
|
{ |
|
"epoch": 0.7315257841573631, |
|
"grad_norm": 0.2949882745742798, |
|
"learning_rate": 3.4107631105223525e-05, |
|
"loss": 2.502, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.7323763955342902, |
|
"grad_norm": 0.26683247089385986, |
|
"learning_rate": 3.390520012992474e-05, |
|
"loss": 2.5785, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 0.7332270069112174, |
|
"grad_norm": 0.2797079384326935, |
|
"learning_rate": 3.370324896504425e-05, |
|
"loss": 2.6119, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 0.7340776182881446, |
|
"grad_norm": 0.27322161197662354, |
|
"learning_rate": 3.3501779076631864e-05, |
|
"loss": 2.5449, |
|
"step": 863 |
|
}, |
|
{ |
|
"epoch": 0.7349282296650718, |
|
"grad_norm": 0.2743871212005615, |
|
"learning_rate": 3.330079192724379e-05, |
|
"loss": 2.6722, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 0.735778841041999, |
|
"grad_norm": 0.27956199645996094, |
|
"learning_rate": 3.3100288975931635e-05, |
|
"loss": 2.5033, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.7366294524189261, |
|
"grad_norm": 0.2711998522281647, |
|
"learning_rate": 3.290027167823204e-05, |
|
"loss": 2.5824, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 0.7374800637958533, |
|
"grad_norm": 0.277340292930603, |
|
"learning_rate": 3.270074148615615e-05, |
|
"loss": 2.5168, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 0.7383306751727804, |
|
"grad_norm": 0.26151034235954285, |
|
"learning_rate": 3.250169984817897e-05, |
|
"loss": 2.5749, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 0.7391812865497076, |
|
"grad_norm": 0.2847073972225189, |
|
"learning_rate": 3.230314820922883e-05, |
|
"loss": 2.4749, |
|
"step": 869 |
|
}, |
|
{ |
|
"epoch": 0.7400318979266348, |
|
"grad_norm": 0.26492998003959656, |
|
"learning_rate": 3.2105088010677e-05, |
|
"loss": 2.5874, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.7408825093035619, |
|
"grad_norm": 0.2851366400718689, |
|
"learning_rate": 3.1907520690327184e-05, |
|
"loss": 2.7108, |
|
"step": 871 |
|
}, |
|
{ |
|
"epoch": 0.7417331206804891, |
|
"grad_norm": 0.29604560136795044, |
|
"learning_rate": 3.1710447682405076e-05, |
|
"loss": 2.8558, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 0.7425837320574162, |
|
"grad_norm": 0.2940882444381714, |
|
"learning_rate": 3.151387041754784e-05, |
|
"loss": 2.5961, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 0.7434343434343434, |
|
"grad_norm": 0.3165973126888275, |
|
"learning_rate": 3.131779032279397e-05, |
|
"loss": 2.5976, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 0.7442849548112705, |
|
"grad_norm": 0.3157351315021515, |
|
"learning_rate": 3.112220882157275e-05, |
|
"loss": 2.5901, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.7451355661881978, |
|
"grad_norm": 0.3334716558456421, |
|
"learning_rate": 3.092712733369387e-05, |
|
"loss": 2.6966, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 0.745986177565125, |
|
"grad_norm": 0.3281215727329254, |
|
"learning_rate": 3.073254727533732e-05, |
|
"loss": 2.616, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 0.7468367889420521, |
|
"grad_norm": 0.3478872776031494, |
|
"learning_rate": 3.053847005904298e-05, |
|
"loss": 2.7568, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 0.7476874003189793, |
|
"grad_norm": 0.35839834809303284, |
|
"learning_rate": 3.034489709370033e-05, |
|
"loss": 2.7552, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 0.7485380116959064, |
|
"grad_norm": 0.3570314049720764, |
|
"learning_rate": 3.0151829784538254e-05, |
|
"loss": 2.6189, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.7493886230728336, |
|
"grad_norm": 0.3771030008792877, |
|
"learning_rate": 2.995926953311504e-05, |
|
"loss": 3.0162, |
|
"step": 881 |
|
}, |
|
{ |
|
"epoch": 0.7502392344497608, |
|
"grad_norm": 0.38607585430145264, |
|
"learning_rate": 2.9767217737307806e-05, |
|
"loss": 2.8731, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 0.7502392344497608, |
|
"eval_loss": 2.657837152481079, |
|
"eval_runtime": 80.5036, |
|
"eval_samples_per_second": 12.298, |
|
"eval_steps_per_second": 6.149, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 0.7510898458266879, |
|
"grad_norm": 0.3817814588546753, |
|
"learning_rate": 2.9575675791302703e-05, |
|
"loss": 2.7678, |
|
"step": 883 |
|
}, |
|
{ |
|
"epoch": 0.7519404572036151, |
|
"grad_norm": 0.4072461724281311, |
|
"learning_rate": 2.9384645085584663e-05, |
|
"loss": 2.7004, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 0.7527910685805422, |
|
"grad_norm": 0.3934648036956787, |
|
"learning_rate": 2.9194127006927208e-05, |
|
"loss": 2.4883, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.7536416799574694, |
|
"grad_norm": 0.43659910559654236, |
|
"learning_rate": 2.9004122938382617e-05, |
|
"loss": 2.7447, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 0.7544922913343965, |
|
"grad_norm": 0.4242823123931885, |
|
"learning_rate": 2.881463425927161e-05, |
|
"loss": 2.7324, |
|
"step": 887 |
|
}, |
|
{ |
|
"epoch": 0.7553429027113238, |
|
"grad_norm": 0.4546271860599518, |
|
"learning_rate": 2.86256623451736e-05, |
|
"loss": 2.5985, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 0.756193514088251, |
|
"grad_norm": 0.462896466255188, |
|
"learning_rate": 2.8437208567916517e-05, |
|
"loss": 2.5522, |
|
"step": 889 |
|
}, |
|
{ |
|
"epoch": 0.7570441254651781, |
|
"grad_norm": 0.47179552912712097, |
|
"learning_rate": 2.8249274295566864e-05, |
|
"loss": 2.7653, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.7578947368421053, |
|
"grad_norm": 0.5069417357444763, |
|
"learning_rate": 2.8061860892420012e-05, |
|
"loss": 2.8404, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 0.7587453482190324, |
|
"grad_norm": 0.5162703990936279, |
|
"learning_rate": 2.787496971898994e-05, |
|
"loss": 2.5679, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 0.7595959595959596, |
|
"grad_norm": 0.5141138434410095, |
|
"learning_rate": 2.7688602131999565e-05, |
|
"loss": 2.7156, |
|
"step": 893 |
|
}, |
|
{ |
|
"epoch": 0.7604465709728868, |
|
"grad_norm": 0.5267017483711243, |
|
"learning_rate": 2.7502759484370944e-05, |
|
"loss": 2.8324, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 0.7612971823498139, |
|
"grad_norm": 0.5354244112968445, |
|
"learning_rate": 2.7317443125215357e-05, |
|
"loss": 2.7099, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.7621477937267411, |
|
"grad_norm": 0.5597330331802368, |
|
"learning_rate": 2.7132654399823444e-05, |
|
"loss": 2.7128, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 0.7629984051036682, |
|
"grad_norm": 0.6345269680023193, |
|
"learning_rate": 2.6948394649655627e-05, |
|
"loss": 2.8904, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 0.7638490164805954, |
|
"grad_norm": 0.5837516784667969, |
|
"learning_rate": 2.6764665212332253e-05, |
|
"loss": 2.5579, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 0.7646996278575225, |
|
"grad_norm": 0.7032825350761414, |
|
"learning_rate": 2.658146742162384e-05, |
|
"loss": 2.7115, |
|
"step": 899 |
|
}, |
|
{ |
|
"epoch": 0.7655502392344498, |
|
"grad_norm": 0.6780116558074951, |
|
"learning_rate": 2.6398802607441507e-05, |
|
"loss": 2.6573, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.766400850611377, |
|
"grad_norm": 0.2630327641963959, |
|
"learning_rate": 2.6216672095827266e-05, |
|
"loss": 2.5077, |
|
"step": 901 |
|
}, |
|
{ |
|
"epoch": 0.7672514619883041, |
|
"grad_norm": 0.2665589153766632, |
|
"learning_rate": 2.6035077208944415e-05, |
|
"loss": 2.4456, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 0.7681020733652313, |
|
"grad_norm": 0.2564515471458435, |
|
"learning_rate": 2.5854019265067853e-05, |
|
"loss": 2.2768, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 0.7689526847421584, |
|
"grad_norm": 0.290310263633728, |
|
"learning_rate": 2.5673499578574645e-05, |
|
"loss": 2.6339, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 0.7698032961190856, |
|
"grad_norm": 0.27893102169036865, |
|
"learning_rate": 2.5493519459934423e-05, |
|
"loss": 2.6156, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.7706539074960128, |
|
"grad_norm": 0.27719345688819885, |
|
"learning_rate": 2.531408021569982e-05, |
|
"loss": 2.4212, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 0.7715045188729399, |
|
"grad_norm": 0.2813560664653778, |
|
"learning_rate": 2.5135183148496978e-05, |
|
"loss": 2.5064, |
|
"step": 907 |
|
}, |
|
{ |
|
"epoch": 0.7723551302498671, |
|
"grad_norm": 0.28871771693229675, |
|
"learning_rate": 2.4956829557016338e-05, |
|
"loss": 2.505, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 0.7732057416267942, |
|
"grad_norm": 0.30079928040504456, |
|
"learning_rate": 2.4779020736002834e-05, |
|
"loss": 2.4488, |
|
"step": 909 |
|
}, |
|
{ |
|
"epoch": 0.7740563530037214, |
|
"grad_norm": 0.27679336071014404, |
|
"learning_rate": 2.4601757976246686e-05, |
|
"loss": 2.6442, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.7749069643806485, |
|
"grad_norm": 0.27239468693733215, |
|
"learning_rate": 2.4425042564574184e-05, |
|
"loss": 2.4256, |
|
"step": 911 |
|
}, |
|
{ |
|
"epoch": 0.7757575757575758, |
|
"grad_norm": 0.2782379686832428, |
|
"learning_rate": 2.4248875783837987e-05, |
|
"loss": 2.5383, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 0.776608187134503, |
|
"grad_norm": 0.28101786971092224, |
|
"learning_rate": 2.407325891290817e-05, |
|
"loss": 2.5509, |
|
"step": 913 |
|
}, |
|
{ |
|
"epoch": 0.7774587985114301, |
|
"grad_norm": 0.26222068071365356, |
|
"learning_rate": 2.3898193226662634e-05, |
|
"loss": 2.5718, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 0.7783094098883573, |
|
"grad_norm": 0.28495025634765625, |
|
"learning_rate": 2.3723679995978088e-05, |
|
"loss": 2.7216, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.7791600212652844, |
|
"grad_norm": 0.27799373865127563, |
|
"learning_rate": 2.3549720487720738e-05, |
|
"loss": 2.4825, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 0.7800106326422116, |
|
"grad_norm": 0.2805662751197815, |
|
"learning_rate": 2.3376315964737004e-05, |
|
"loss": 2.5418, |
|
"step": 917 |
|
}, |
|
{ |
|
"epoch": 0.7808612440191387, |
|
"grad_norm": 0.2894647717475891, |
|
"learning_rate": 2.3203467685844494e-05, |
|
"loss": 2.4872, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 0.7817118553960659, |
|
"grad_norm": 0.2725624740123749, |
|
"learning_rate": 2.3031176905822805e-05, |
|
"loss": 2.6821, |
|
"step": 919 |
|
}, |
|
{ |
|
"epoch": 0.7825624667729931, |
|
"grad_norm": 0.2858252227306366, |
|
"learning_rate": 2.2859444875404347e-05, |
|
"loss": 2.5256, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.7834130781499202, |
|
"grad_norm": 0.27502021193504333, |
|
"learning_rate": 2.268827284126539e-05, |
|
"loss": 2.6097, |
|
"step": 921 |
|
}, |
|
{ |
|
"epoch": 0.7842636895268474, |
|
"grad_norm": 0.30405184626579285, |
|
"learning_rate": 2.2517662046016975e-05, |
|
"loss": 2.5872, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 0.7851143009037745, |
|
"grad_norm": 0.3032589256763458, |
|
"learning_rate": 2.234761372819577e-05, |
|
"loss": 2.6253, |
|
"step": 923 |
|
}, |
|
{ |
|
"epoch": 0.7859649122807018, |
|
"grad_norm": 0.3149045705795288, |
|
"learning_rate": 2.2178129122255253e-05, |
|
"loss": 2.5631, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 0.786815523657629, |
|
"grad_norm": 0.3490796387195587, |
|
"learning_rate": 2.200920945855669e-05, |
|
"loss": 2.7784, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.7876661350345561, |
|
"grad_norm": 0.332727313041687, |
|
"learning_rate": 2.184085596336011e-05, |
|
"loss": 2.7369, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 0.7885167464114833, |
|
"grad_norm": 0.345241516828537, |
|
"learning_rate": 2.1673069858815554e-05, |
|
"loss": 2.734, |
|
"step": 927 |
|
}, |
|
{ |
|
"epoch": 0.7893673577884104, |
|
"grad_norm": 0.356146901845932, |
|
"learning_rate": 2.150585236295415e-05, |
|
"loss": 2.5664, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 0.7902179691653376, |
|
"grad_norm": 0.34294942021369934, |
|
"learning_rate": 2.133920468967915e-05, |
|
"loss": 2.7841, |
|
"step": 929 |
|
}, |
|
{ |
|
"epoch": 0.7910685805422647, |
|
"grad_norm": 0.3757612407207489, |
|
"learning_rate": 2.1173128048757306e-05, |
|
"loss": 2.8389, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.7919191919191919, |
|
"grad_norm": 0.37201401591300964, |
|
"learning_rate": 2.1007623645810003e-05, |
|
"loss": 2.7424, |
|
"step": 931 |
|
}, |
|
{ |
|
"epoch": 0.7927698032961191, |
|
"grad_norm": 0.37049031257629395, |
|
"learning_rate": 2.0842692682304444e-05, |
|
"loss": 2.7447, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 0.7936204146730462, |
|
"grad_norm": 0.40525051951408386, |
|
"learning_rate": 2.0678336355545048e-05, |
|
"loss": 2.7354, |
|
"step": 933 |
|
}, |
|
{ |
|
"epoch": 0.7944710260499734, |
|
"grad_norm": 0.37740442156791687, |
|
"learning_rate": 2.0514555858664663e-05, |
|
"loss": 2.5756, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 0.7953216374269005, |
|
"grad_norm": 0.399179607629776, |
|
"learning_rate": 2.0351352380616008e-05, |
|
"loss": 2.606, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.7961722488038278, |
|
"grad_norm": 0.4020417630672455, |
|
"learning_rate": 2.0188727106162874e-05, |
|
"loss": 2.5945, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 0.797022860180755, |
|
"grad_norm": 0.4364849627017975, |
|
"learning_rate": 2.0026681215871656e-05, |
|
"loss": 2.7867, |
|
"step": 937 |
|
}, |
|
{ |
|
"epoch": 0.7978734715576821, |
|
"grad_norm": 0.4335819482803345, |
|
"learning_rate": 1.986521588610285e-05, |
|
"loss": 2.6452, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 0.7987240829346093, |
|
"grad_norm": 0.44533243775367737, |
|
"learning_rate": 1.9704332289002293e-05, |
|
"loss": 2.7037, |
|
"step": 939 |
|
}, |
|
{ |
|
"epoch": 0.7995746943115364, |
|
"grad_norm": 0.46524372696876526, |
|
"learning_rate": 1.9544031592492763e-05, |
|
"loss": 2.7772, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.8004253056884636, |
|
"grad_norm": 0.4814035892486572, |
|
"learning_rate": 1.9384314960265692e-05, |
|
"loss": 2.7603, |
|
"step": 941 |
|
}, |
|
{ |
|
"epoch": 0.8012759170653907, |
|
"grad_norm": 0.5147087574005127, |
|
"learning_rate": 1.922518355177232e-05, |
|
"loss": 2.8118, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 0.8021265284423179, |
|
"grad_norm": 0.5196573734283447, |
|
"learning_rate": 1.906663852221565e-05, |
|
"loss": 2.7226, |
|
"step": 943 |
|
}, |
|
{ |
|
"epoch": 0.8029771398192451, |
|
"grad_norm": 0.538750171661377, |
|
"learning_rate": 1.890868102254182e-05, |
|
"loss": 2.8252, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 0.8038277511961722, |
|
"grad_norm": 0.5554125308990479, |
|
"learning_rate": 1.875131219943187e-05, |
|
"loss": 2.7985, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.8046783625730994, |
|
"grad_norm": 0.5610653162002563, |
|
"learning_rate": 1.8594533195293427e-05, |
|
"loss": 2.7006, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 0.8055289739500265, |
|
"grad_norm": 0.5772664546966553, |
|
"learning_rate": 1.843834514825229e-05, |
|
"loss": 2.7453, |
|
"step": 947 |
|
}, |
|
{ |
|
"epoch": 0.8063795853269538, |
|
"grad_norm": 0.7284188866615295, |
|
"learning_rate": 1.82827491921443e-05, |
|
"loss": 2.7826, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 0.807230196703881, |
|
"grad_norm": 0.6359315514564514, |
|
"learning_rate": 1.8127746456507077e-05, |
|
"loss": 2.5094, |
|
"step": 949 |
|
}, |
|
{ |
|
"epoch": 0.8080808080808081, |
|
"grad_norm": 0.7618624567985535, |
|
"learning_rate": 1.797333806657171e-05, |
|
"loss": 2.9762, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.8089314194577353, |
|
"grad_norm": 0.2761584520339966, |
|
"learning_rate": 1.7819525143254755e-05, |
|
"loss": 2.3495, |
|
"step": 951 |
|
}, |
|
{ |
|
"epoch": 0.8097820308346624, |
|
"grad_norm": 0.2787644565105438, |
|
"learning_rate": 1.7666308803150043e-05, |
|
"loss": 2.6593, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 0.8106326422115896, |
|
"grad_norm": 0.2583043873310089, |
|
"learning_rate": 1.751369015852046e-05, |
|
"loss": 2.3744, |
|
"step": 953 |
|
}, |
|
{ |
|
"epoch": 0.8114832535885167, |
|
"grad_norm": 0.27526992559432983, |
|
"learning_rate": 1.7361670317290012e-05, |
|
"loss": 2.424, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 0.8123338649654439, |
|
"grad_norm": 0.2713683843612671, |
|
"learning_rate": 1.7210250383035807e-05, |
|
"loss": 2.5069, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.8131844763423711, |
|
"grad_norm": 0.28076109290122986, |
|
"learning_rate": 1.7059431454979824e-05, |
|
"loss": 2.4204, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 0.8140350877192982, |
|
"grad_norm": 0.2972472012042999, |
|
"learning_rate": 1.6909214627981197e-05, |
|
"loss": 2.6109, |
|
"step": 957 |
|
}, |
|
{ |
|
"epoch": 0.8148856990962254, |
|
"grad_norm": 0.31532159447669983, |
|
"learning_rate": 1.6759600992528147e-05, |
|
"loss": 2.6416, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 0.8157363104731525, |
|
"grad_norm": 0.2699948847293854, |
|
"learning_rate": 1.6610591634729965e-05, |
|
"loss": 2.4951, |
|
"step": 959 |
|
}, |
|
{ |
|
"epoch": 0.8165869218500797, |
|
"grad_norm": 0.27630969882011414, |
|
"learning_rate": 1.6462187636309345e-05, |
|
"loss": 2.6336, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.817437533227007, |
|
"grad_norm": 0.27848508954048157, |
|
"learning_rate": 1.631439007459441e-05, |
|
"loss": 2.6848, |
|
"step": 961 |
|
}, |
|
{ |
|
"epoch": 0.8182881446039341, |
|
"grad_norm": 0.2589823007583618, |
|
"learning_rate": 1.61672000225108e-05, |
|
"loss": 2.6077, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 0.8191387559808613, |
|
"grad_norm": 0.27338507771492004, |
|
"learning_rate": 1.6020618548574108e-05, |
|
"loss": 2.4618, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 0.8199893673577884, |
|
"grad_norm": 0.26967307925224304, |
|
"learning_rate": 1.587464671688187e-05, |
|
"loss": 2.4733, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 0.8208399787347156, |
|
"grad_norm": 0.27979278564453125, |
|
"learning_rate": 1.5729285587106136e-05, |
|
"loss": 2.6789, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.8216905901116427, |
|
"grad_norm": 0.263261616230011, |
|
"learning_rate": 1.5584536214485457e-05, |
|
"loss": 2.5471, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 0.8225412014885699, |
|
"grad_norm": 0.27126452326774597, |
|
"learning_rate": 1.5440399649817385e-05, |
|
"loss": 2.6185, |
|
"step": 967 |
|
}, |
|
{ |
|
"epoch": 0.8233918128654971, |
|
"grad_norm": 0.2681058943271637, |
|
"learning_rate": 1.5296876939450978e-05, |
|
"loss": 2.567, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 0.8242424242424242, |
|
"grad_norm": 0.25772204995155334, |
|
"learning_rate": 1.5153969125278934e-05, |
|
"loss": 2.4961, |
|
"step": 969 |
|
}, |
|
{ |
|
"epoch": 0.8250930356193514, |
|
"grad_norm": 0.27491864562034607, |
|
"learning_rate": 1.5011677244730161e-05, |
|
"loss": 2.6572, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.8259436469962785, |
|
"grad_norm": 0.3069652318954468, |
|
"learning_rate": 1.4870002330762289e-05, |
|
"loss": 2.6633, |
|
"step": 971 |
|
}, |
|
{ |
|
"epoch": 0.8267942583732057, |
|
"grad_norm": 0.29441124200820923, |
|
"learning_rate": 1.4728945411854133e-05, |
|
"loss": 2.5631, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 0.827644869750133, |
|
"grad_norm": 0.29318681359291077, |
|
"learning_rate": 1.4588507511998162e-05, |
|
"loss": 2.7143, |
|
"step": 973 |
|
}, |
|
{ |
|
"epoch": 0.8284954811270601, |
|
"grad_norm": 0.3085382282733917, |
|
"learning_rate": 1.4448689650693147e-05, |
|
"loss": 2.6399, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 0.8293460925039873, |
|
"grad_norm": 0.3353934586048126, |
|
"learning_rate": 1.4309492842936768e-05, |
|
"loss": 2.7392, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.8301967038809144, |
|
"grad_norm": 0.35874679684638977, |
|
"learning_rate": 1.4170918099218166e-05, |
|
"loss": 2.6478, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 0.8310473152578416, |
|
"grad_norm": 0.3231167793273926, |
|
"learning_rate": 1.4032966425510663e-05, |
|
"loss": 2.5808, |
|
"step": 977 |
|
}, |
|
{ |
|
"epoch": 0.8318979266347687, |
|
"grad_norm": 0.3558177351951599, |
|
"learning_rate": 1.3895638823264446e-05, |
|
"loss": 2.8192, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 0.8327485380116959, |
|
"grad_norm": 0.3447312116622925, |
|
"learning_rate": 1.3758936289399348e-05, |
|
"loss": 2.439, |
|
"step": 979 |
|
}, |
|
{ |
|
"epoch": 0.8335991493886231, |
|
"grad_norm": 0.35401877760887146, |
|
"learning_rate": 1.3622859816297473e-05, |
|
"loss": 2.6874, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.8344497607655502, |
|
"grad_norm": 0.3697657585144043, |
|
"learning_rate": 1.3487410391796162e-05, |
|
"loss": 2.6217, |
|
"step": 981 |
|
}, |
|
{ |
|
"epoch": 0.8353003721424774, |
|
"grad_norm": 0.39387422800064087, |
|
"learning_rate": 1.3352588999180726e-05, |
|
"loss": 2.9086, |
|
"step": 982 |
|
}, |
|
{ |
|
"epoch": 0.8361509835194045, |
|
"grad_norm": 0.4256942570209503, |
|
"learning_rate": 1.3218396617177287e-05, |
|
"loss": 2.6578, |
|
"step": 983 |
|
}, |
|
{ |
|
"epoch": 0.8370015948963317, |
|
"grad_norm": 0.40965893864631653, |
|
"learning_rate": 1.308483421994573e-05, |
|
"loss": 2.6781, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 0.837852206273259, |
|
"grad_norm": 0.4041212797164917, |
|
"learning_rate": 1.2951902777072655e-05, |
|
"loss": 2.8038, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.8387028176501861, |
|
"grad_norm": 0.416715532541275, |
|
"learning_rate": 1.2819603253564205e-05, |
|
"loss": 2.5885, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 0.8395534290271133, |
|
"grad_norm": 0.4417300522327423, |
|
"learning_rate": 1.2687936609839235e-05, |
|
"loss": 2.8096, |
|
"step": 987 |
|
}, |
|
{ |
|
"epoch": 0.8404040404040404, |
|
"grad_norm": 0.47883424162864685, |
|
"learning_rate": 1.2556903801722219e-05, |
|
"loss": 2.8092, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 0.8412546517809676, |
|
"grad_norm": 0.4706938862800598, |
|
"learning_rate": 1.2426505780436326e-05, |
|
"loss": 2.6285, |
|
"step": 989 |
|
}, |
|
{ |
|
"epoch": 0.8421052631578947, |
|
"grad_norm": 0.4550638496875763, |
|
"learning_rate": 1.2296743492596586e-05, |
|
"loss": 2.5213, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.8429558745348219, |
|
"grad_norm": 0.48749181628227234, |
|
"learning_rate": 1.2167617880202908e-05, |
|
"loss": 2.791, |
|
"step": 991 |
|
}, |
|
{ |
|
"epoch": 0.8438064859117491, |
|
"grad_norm": 0.46483585238456726, |
|
"learning_rate": 1.2039129880633349e-05, |
|
"loss": 2.7672, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 0.8446570972886762, |
|
"grad_norm": 0.510725200176239, |
|
"learning_rate": 1.1911280426637273e-05, |
|
"loss": 2.9072, |
|
"step": 993 |
|
}, |
|
{ |
|
"epoch": 0.8455077086656034, |
|
"grad_norm": 0.5274655222892761, |
|
"learning_rate": 1.1784070446328476e-05, |
|
"loss": 2.8219, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 0.8463583200425305, |
|
"grad_norm": 0.5416675806045532, |
|
"learning_rate": 1.1657500863178694e-05, |
|
"loss": 2.7237, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.8472089314194577, |
|
"grad_norm": 0.5472414493560791, |
|
"learning_rate": 1.153157259601062e-05, |
|
"loss": 2.68, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 0.8480595427963848, |
|
"grad_norm": 0.5868435502052307, |
|
"learning_rate": 1.1406286558991375e-05, |
|
"loss": 2.7562, |
|
"step": 997 |
|
}, |
|
{ |
|
"epoch": 0.8489101541733121, |
|
"grad_norm": 0.6191973090171814, |
|
"learning_rate": 1.1281643661625895e-05, |
|
"loss": 2.7655, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 0.8497607655502393, |
|
"grad_norm": 0.6745545268058777, |
|
"learning_rate": 1.1157644808750312e-05, |
|
"loss": 2.8605, |
|
"step": 999 |
|
}, |
|
{ |
|
"epoch": 0.8506113769271664, |
|
"grad_norm": 0.697556734085083, |
|
"learning_rate": 1.103429090052528e-05, |
|
"loss": 2.6958, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.8514619883040936, |
|
"grad_norm": 0.26258981227874756, |
|
"learning_rate": 1.0911582832429589e-05, |
|
"loss": 2.4457, |
|
"step": 1001 |
|
}, |
|
{ |
|
"epoch": 0.8523125996810207, |
|
"grad_norm": 0.26820749044418335, |
|
"learning_rate": 1.0789521495253618e-05, |
|
"loss": 2.6249, |
|
"step": 1002 |
|
}, |
|
{ |
|
"epoch": 0.8531632110579479, |
|
"grad_norm": 0.26697129011154175, |
|
"learning_rate": 1.0668107775092751e-05, |
|
"loss": 2.4464, |
|
"step": 1003 |
|
}, |
|
{ |
|
"epoch": 0.8540138224348751, |
|
"grad_norm": 0.2953515350818634, |
|
"learning_rate": 1.0547342553341144e-05, |
|
"loss": 2.5413, |
|
"step": 1004 |
|
}, |
|
{ |
|
"epoch": 0.8548644338118022, |
|
"grad_norm": 0.264387309551239, |
|
"learning_rate": 1.0427226706685178e-05, |
|
"loss": 2.6889, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.8557150451887294, |
|
"grad_norm": 0.2656628489494324, |
|
"learning_rate": 1.030776110709718e-05, |
|
"loss": 2.5115, |
|
"step": 1006 |
|
}, |
|
{ |
|
"epoch": 0.8565656565656565, |
|
"grad_norm": 0.25700512528419495, |
|
"learning_rate": 1.0188946621828976e-05, |
|
"loss": 2.5474, |
|
"step": 1007 |
|
}, |
|
{ |
|
"epoch": 0.8574162679425837, |
|
"grad_norm": 0.28423216938972473, |
|
"learning_rate": 1.0070784113405763e-05, |
|
"loss": 2.5926, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 0.8582668793195108, |
|
"grad_norm": 0.2653867304325104, |
|
"learning_rate": 9.953274439619741e-06, |
|
"loss": 2.4022, |
|
"step": 1009 |
|
}, |
|
{ |
|
"epoch": 0.8591174906964381, |
|
"grad_norm": 0.27558040618896484, |
|
"learning_rate": 9.836418453523833e-06, |
|
"loss": 2.6529, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.8599681020733653, |
|
"grad_norm": 0.25471609830856323, |
|
"learning_rate": 9.720217003425647e-06, |
|
"loss": 2.5865, |
|
"step": 1011 |
|
}, |
|
{ |
|
"epoch": 0.8608187134502924, |
|
"grad_norm": 0.2609470784664154, |
|
"learning_rate": 9.60467093288121e-06, |
|
"loss": 2.5492, |
|
"step": 1012 |
|
}, |
|
{ |
|
"epoch": 0.8616693248272196, |
|
"grad_norm": 0.29608237743377686, |
|
"learning_rate": 9.489781080688865e-06, |
|
"loss": 2.5104, |
|
"step": 1013 |
|
}, |
|
{ |
|
"epoch": 0.8625199362041467, |
|
"grad_norm": 0.29410475492477417, |
|
"learning_rate": 9.375548280883128e-06, |
|
"loss": 2.6142, |
|
"step": 1014 |
|
}, |
|
{ |
|
"epoch": 0.8633705475810739, |
|
"grad_norm": 0.269397497177124, |
|
"learning_rate": 9.261973362728827e-06, |
|
"loss": 2.5652, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.8642211589580011, |
|
"grad_norm": 0.28159505128860474, |
|
"learning_rate": 9.149057150714801e-06, |
|
"loss": 2.7499, |
|
"step": 1016 |
|
}, |
|
{ |
|
"epoch": 0.8650717703349282, |
|
"grad_norm": 0.27257442474365234, |
|
"learning_rate": 9.036800464548157e-06, |
|
"loss": 2.6305, |
|
"step": 1017 |
|
}, |
|
{ |
|
"epoch": 0.8659223817118554, |
|
"grad_norm": 0.26392441987991333, |
|
"learning_rate": 8.92520411914819e-06, |
|
"loss": 2.4767, |
|
"step": 1018 |
|
}, |
|
{ |
|
"epoch": 0.8667729930887825, |
|
"grad_norm": 0.2779422998428345, |
|
"learning_rate": 8.814268924640468e-06, |
|
"loss": 2.6056, |
|
"step": 1019 |
|
}, |
|
{ |
|
"epoch": 0.8676236044657097, |
|
"grad_norm": 0.27914363145828247, |
|
"learning_rate": 8.70399568635104e-06, |
|
"loss": 2.5612, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.8684742158426368, |
|
"grad_norm": 0.28274431824684143, |
|
"learning_rate": 8.594385204800482e-06, |
|
"loss": 2.5673, |
|
"step": 1021 |
|
}, |
|
{ |
|
"epoch": 0.8693248272195641, |
|
"grad_norm": 0.30289795994758606, |
|
"learning_rate": 8.485438275698154e-06, |
|
"loss": 2.6624, |
|
"step": 1022 |
|
}, |
|
{ |
|
"epoch": 0.8701754385964913, |
|
"grad_norm": 0.3084239363670349, |
|
"learning_rate": 8.377155689936434e-06, |
|
"loss": 2.5199, |
|
"step": 1023 |
|
}, |
|
{ |
|
"epoch": 0.8710260499734184, |
|
"grad_norm": 0.32038623094558716, |
|
"learning_rate": 8.269538233584883e-06, |
|
"loss": 2.5174, |
|
"step": 1024 |
|
}, |
|
{ |
|
"epoch": 0.8718766613503456, |
|
"grad_norm": 0.32201388478279114, |
|
"learning_rate": 8.162586687884654e-06, |
|
"loss": 2.5764, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.8727272727272727, |
|
"grad_norm": 0.31848224997520447, |
|
"learning_rate": 8.056301829242784e-06, |
|
"loss": 2.6327, |
|
"step": 1026 |
|
}, |
|
{ |
|
"epoch": 0.8735778841041999, |
|
"grad_norm": 0.34866663813591003, |
|
"learning_rate": 7.950684429226463e-06, |
|
"loss": 2.8354, |
|
"step": 1027 |
|
}, |
|
{ |
|
"epoch": 0.8744284954811271, |
|
"grad_norm": 0.3639064431190491, |
|
"learning_rate": 7.845735254557606e-06, |
|
"loss": 2.7546, |
|
"step": 1028 |
|
}, |
|
{ |
|
"epoch": 0.8752791068580542, |
|
"grad_norm": 0.35733523964881897, |
|
"learning_rate": 7.741455067107162e-06, |
|
"loss": 2.69, |
|
"step": 1029 |
|
}, |
|
{ |
|
"epoch": 0.8761297182349814, |
|
"grad_norm": 0.35862669348716736, |
|
"learning_rate": 7.637844623889556e-06, |
|
"loss": 2.5966, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.8769803296119085, |
|
"grad_norm": 0.36205005645751953, |
|
"learning_rate": 7.534904677057353e-06, |
|
"loss": 2.4966, |
|
"step": 1031 |
|
}, |
|
{ |
|
"epoch": 0.8778309409888357, |
|
"grad_norm": 0.3843689560890198, |
|
"learning_rate": 7.4326359738956515e-06, |
|
"loss": 2.7942, |
|
"step": 1032 |
|
}, |
|
{ |
|
"epoch": 0.8786815523657628, |
|
"grad_norm": 0.40694114565849304, |
|
"learning_rate": 7.331039256816663e-06, |
|
"loss": 2.739, |
|
"step": 1033 |
|
}, |
|
{ |
|
"epoch": 0.87953216374269, |
|
"grad_norm": 0.40859633684158325, |
|
"learning_rate": 7.230115263354431e-06, |
|
"loss": 2.6879, |
|
"step": 1034 |
|
}, |
|
{ |
|
"epoch": 0.8803827751196173, |
|
"grad_norm": 0.41907498240470886, |
|
"learning_rate": 7.129864726159408e-06, |
|
"loss": 2.5804, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.8812333864965444, |
|
"grad_norm": 0.4525774121284485, |
|
"learning_rate": 7.030288372993066e-06, |
|
"loss": 2.6814, |
|
"step": 1036 |
|
}, |
|
{ |
|
"epoch": 0.8820839978734716, |
|
"grad_norm": 0.45679178833961487, |
|
"learning_rate": 6.931386926722772e-06, |
|
"loss": 2.7909, |
|
"step": 1037 |
|
}, |
|
{ |
|
"epoch": 0.8829346092503987, |
|
"grad_norm": 0.47028636932373047, |
|
"learning_rate": 6.833161105316421e-06, |
|
"loss": 2.8758, |
|
"step": 1038 |
|
}, |
|
{ |
|
"epoch": 0.8837852206273259, |
|
"grad_norm": 0.4830760359764099, |
|
"learning_rate": 6.7356116218372566e-06, |
|
"loss": 2.6979, |
|
"step": 1039 |
|
}, |
|
{ |
|
"epoch": 0.8846358320042531, |
|
"grad_norm": 0.49996817111968994, |
|
"learning_rate": 6.63873918443868e-06, |
|
"loss": 2.8513, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.8854864433811802, |
|
"grad_norm": 0.4843011200428009, |
|
"learning_rate": 6.542544496359138e-06, |
|
"loss": 2.553, |
|
"step": 1041 |
|
}, |
|
{ |
|
"epoch": 0.8863370547581074, |
|
"grad_norm": 0.5989816188812256, |
|
"learning_rate": 6.447028255917054e-06, |
|
"loss": 2.5985, |
|
"step": 1042 |
|
}, |
|
{ |
|
"epoch": 0.8871876661350345, |
|
"grad_norm": 0.5418705940246582, |
|
"learning_rate": 6.352191156505627e-06, |
|
"loss": 2.7277, |
|
"step": 1043 |
|
}, |
|
{ |
|
"epoch": 0.8880382775119617, |
|
"grad_norm": 0.5338729619979858, |
|
"learning_rate": 6.258033886587911e-06, |
|
"loss": 2.9324, |
|
"step": 1044 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 0.5353463292121887, |
|
"learning_rate": 6.164557129691828e-06, |
|
"loss": 2.6506, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.889739500265816, |
|
"grad_norm": 0.6170979142189026, |
|
"learning_rate": 6.0717615644051206e-06, |
|
"loss": 2.7699, |
|
"step": 1046 |
|
}, |
|
{ |
|
"epoch": 0.8905901116427433, |
|
"grad_norm": 0.6048348546028137, |
|
"learning_rate": 5.979647864370486e-06, |
|
"loss": 2.7457, |
|
"step": 1047 |
|
}, |
|
{ |
|
"epoch": 0.8914407230196704, |
|
"grad_norm": 0.6213808059692383, |
|
"learning_rate": 5.888216698280647e-06, |
|
"loss": 2.671, |
|
"step": 1048 |
|
}, |
|
{ |
|
"epoch": 0.8922913343965976, |
|
"grad_norm": 0.6980977654457092, |
|
"learning_rate": 5.7974687298735454e-06, |
|
"loss": 2.9322, |
|
"step": 1049 |
|
}, |
|
{ |
|
"epoch": 0.8931419457735247, |
|
"grad_norm": 0.762441873550415, |
|
"learning_rate": 5.7074046179275255e-06, |
|
"loss": 2.9242, |
|
"step": 1050 |
|
}, |
|
{
"epoch": 0.8939925571504519,
"grad_norm": 0.2550997734069824,
"learning_rate": 5.6180250162564455e-06,
"loss": 2.3309,
"step": 1051
},
{
"epoch": 0.8948431685273791,
"grad_norm": 0.2799069285392761,
"learning_rate": 5.5293305737050825e-06,
"loss": 2.6428,
"step": 1052
},
{
"epoch": 0.8956937799043062,
"grad_norm": 0.25860702991485596,
"learning_rate": 5.441321934144339e-06,
"loss": 2.2591,
"step": 1053
},
{
"epoch": 0.8965443912812334,
"grad_norm": 0.262991726398468,
"learning_rate": 5.35399973646653e-06,
"loss": 2.3974,
"step": 1054
},
{
"epoch": 0.8973950026581605,
"grad_norm": 0.25796017050743103,
"learning_rate": 5.267364614580861e-06,
"loss": 2.4487,
"step": 1055
},
{
"epoch": 0.8982456140350877,
"grad_norm": 0.26405441761016846,
"learning_rate": 5.181417197408734e-06,
"loss": 2.3894,
"step": 1056
},
{
"epoch": 0.8990962254120148,
"grad_norm": 0.27741700410842896,
"learning_rate": 5.09615810887919e-06,
"loss": 2.5089,
"step": 1057
},
{
"epoch": 0.899946836788942,
"grad_norm": 0.27698126435279846,
"learning_rate": 5.011587967924414e-06,
"loss": 2.4868,
"step": 1058
},
{
"epoch": 0.9007974481658693,
"grad_norm": 0.2699805796146393,
"learning_rate": 4.927707388475255e-06,
"loss": 2.4822,
"step": 1059
},
{
"epoch": 0.9016480595427964,
"grad_norm": 0.2688996493816376,
"learning_rate": 4.84451697945667e-06,
"loss": 2.574,
"step": 1060
},
{
"epoch": 0.9024986709197236,
"grad_norm": 0.2651573419570923,
"learning_rate": 4.7620173447834425e-06,
"loss": 2.5219,
"step": 1061
},
{
"epoch": 0.9033492822966507,
"grad_norm": 0.27622607350349426,
"learning_rate": 4.680209083355713e-06,
"loss": 2.6289,
"step": 1062
},
{
"epoch": 0.9041998936735779,
"grad_norm": 0.27151504158973694,
"learning_rate": 4.5990927890545935e-06,
"loss": 2.625,
"step": 1063
},
{
"epoch": 0.9050505050505051,
"grad_norm": 0.2963936924934387,
"learning_rate": 4.518669050737989e-06,
"loss": 2.4124,
"step": 1064
},
{
"epoch": 0.9059011164274322,
"grad_norm": 0.2703079879283905,
"learning_rate": 4.438938452236219e-06,
"loss": 2.482,
"step": 1065
},
{
"epoch": 0.9067517278043594,
"grad_norm": 0.25973039865493774,
"learning_rate": 4.359901572347758e-06,
"loss": 2.6443,
"step": 1066
},
{
"epoch": 0.9076023391812865,
"grad_norm": 0.27102747559547424,
"learning_rate": 4.281558984835143e-06,
"loss": 2.6452,
"step": 1067
},
{
"epoch": 0.9084529505582137,
"grad_norm": 0.2813124656677246,
"learning_rate": 4.203911258420712e-06,
"loss": 2.6012,
"step": 1068
},
{
"epoch": 0.9093035619351408,
"grad_norm": 0.274431437253952,
"learning_rate": 4.126958956782545e-06,
"loss": 2.6887,
"step": 1069
},
{
"epoch": 0.910154173312068,
"grad_norm": 0.2668261229991913,
"learning_rate": 4.050702638550275e-06,
"loss": 2.5839,
"step": 1070
},
{
"epoch": 0.9110047846889953,
"grad_norm": 0.2745119631290436,
"learning_rate": 3.975142857301117e-06,
"loss": 2.4371,
"step": 1071
},
{
"epoch": 0.9118553960659224,
"grad_norm": 0.31567510962486267,
"learning_rate": 3.900280161555881e-06,
"loss": 2.4209,
"step": 1072
},
{
"epoch": 0.9127060074428496,
"grad_norm": 0.28338539600372314,
"learning_rate": 3.826115094774863e-06,
"loss": 2.47,
"step": 1073
},
{
"epoch": 0.9135566188197767,
"grad_norm": 0.3005933463573456,
"learning_rate": 3.7526481953539915e-06,
"loss": 2.5939,
"step": 1074
},
{
"epoch": 0.9144072301967039,
"grad_norm": 0.31587204337120056,
"learning_rate": 3.6798799966209497e-06,
"loss": 2.6447,
"step": 1075
},
{
"epoch": 0.915257841573631,
"grad_norm": 0.3279857635498047,
"learning_rate": 3.607811026831176e-06,
"loss": 2.5845,
"step": 1076
},
{
"epoch": 0.9161084529505582,
"grad_norm": 0.3389846980571747,
"learning_rate": 3.5364418091641373e-06,
"loss": 2.7449,
"step": 1077
},
{
"epoch": 0.9169590643274854,
"grad_norm": 0.3381056487560272,
"learning_rate": 3.4657728617195295e-06,
"loss": 2.407,
"step": 1078
},
{
"epoch": 0.9178096757044125,
"grad_norm": 0.3725632429122925,
"learning_rate": 3.3958046975134495e-06,
"loss": 2.8884,
"step": 1079
},
{
"epoch": 0.9186602870813397,
"grad_norm": 0.3629099726676941,
"learning_rate": 3.32653782447474e-06,
"loss": 2.6872,
"step": 1080
},
{
"epoch": 0.9195108984582668,
"grad_norm": 0.37425747513771057,
"learning_rate": 3.25797274544124e-06,
"loss": 2.659,
"step": 1081
},
{
"epoch": 0.920361509835194,
"grad_norm": 0.3709467649459839,
"learning_rate": 3.1901099581561845e-06,
"loss": 2.5593,
"step": 1082
},
{
"epoch": 0.9212121212121213,
"grad_norm": 0.37324512004852295,
"learning_rate": 3.122949955264587e-06,
"loss": 2.5461,
"step": 1083
},
{
"epoch": 0.9220627325890484,
"grad_norm": 0.39722326397895813,
"learning_rate": 3.0564932243095866e-06,
"loss": 2.8043,
"step": 1084
},
{
"epoch": 0.9229133439659756,
"grad_norm": 0.40863049030303955,
"learning_rate": 2.9907402477290514e-06,
"loss": 2.5307,
"step": 1085
},
{
"epoch": 0.9237639553429027,
"grad_norm": 0.4645911157131195,
"learning_rate": 2.9256915028519573e-06,
"loss": 2.3838,
"step": 1086
},
{
"epoch": 0.9246145667198299,
"grad_norm": 0.42394137382507324,
"learning_rate": 2.8613474618949366e-06,
"loss": 2.7368,
"step": 1087
},
{
"epoch": 0.925465178096757,
"grad_norm": 0.44541221857070923,
"learning_rate": 2.7977085919589254e-06,
"loss": 2.6973,
"step": 1088
},
{
"epoch": 0.9263157894736842,
"grad_norm": 0.45734384655952454,
"learning_rate": 2.7347753550256872e-06,
"loss": 2.8454,
"step": 1089
},
{
"epoch": 0.9271664008506114,
"grad_norm": 0.4581248164176941,
"learning_rate": 2.672548207954495e-06,
"loss": 2.7356,
"step": 1090
},
{
"epoch": 0.9280170122275385,
"grad_norm": 0.45908254384994507,
"learning_rate": 2.6110276024788214e-06,
"loss": 2.5562,
"step": 1091
},
{
"epoch": 0.9288676236044657,
"grad_norm": 0.4975565969944,
"learning_rate": 2.550213985203076e-06,
"loss": 2.8292,
"step": 1092
},
{
"epoch": 0.9297182349813928,
"grad_norm": 0.5445611476898193,
"learning_rate": 2.4901077975992838e-06,
"loss": 2.9136,
"step": 1093
},
{
"epoch": 0.93056884635832,
"grad_norm": 0.5516471862792969,
"learning_rate": 2.4307094760039785e-06,
"loss": 3.0927,
"step": 1094
},
{
"epoch": 0.9314194577352473,
"grad_norm": 0.5213038921356201,
"learning_rate": 2.3720194516149818e-06,
"loss": 2.705,
"step": 1095
},
{
"epoch": 0.9322700691121744,
"grad_norm": 0.5873907208442688,
"learning_rate": 2.3140381504882737e-06,
"loss": 2.8415,
"step": 1096
},
{
"epoch": 0.9331206804891016,
"grad_norm": 0.6056773662567139,
"learning_rate": 2.2567659935349372e-06,
"loss": 2.6933,
"step": 1097
},
{
"epoch": 0.9339712918660287,
"grad_norm": 0.6297914981842041,
"learning_rate": 2.200203396517997e-06,
"loss": 2.8197,
"step": 1098
},
{
"epoch": 0.9348219032429559,
"grad_norm": 0.6759224534034729,
"learning_rate": 2.144350770049597e-06,
"loss": 2.7496,
"step": 1099
},
{
"epoch": 0.935672514619883,
"grad_norm": 0.7379116415977478,
"learning_rate": 2.0892085195878154e-06,
"loss": 2.926,
"step": 1100
},
{
"epoch": 0.9365231259968102,
"grad_norm": 0.25760582089424133,
"learning_rate": 2.034777045433811e-06,
"loss": 2.3794,
"step": 1101
},
{
"epoch": 0.9373737373737374,
"grad_norm": 0.2775615155696869,
"learning_rate": 1.9810567427289595e-06,
"loss": 2.5169,
"step": 1102
},
{
"epoch": 0.9382243487506645,
"grad_norm": 0.26485174894332886,
"learning_rate": 1.92804800145191e-06,
"loss": 2.4342,
"step": 1103
},
{
"epoch": 0.9390749601275917,
"grad_norm": 0.24917316436767578,
"learning_rate": 1.8757512064157656e-06,
"loss": 2.4177,
"step": 1104
},
{
"epoch": 0.9399255715045188,
"grad_norm": 0.25125351548194885,
"learning_rate": 1.8241667372653316e-06,
"loss": 2.2352,
"step": 1105
},
{
"epoch": 0.940776182881446,
"grad_norm": 0.2972431182861328,
"learning_rate": 1.7732949684743594e-06,
"loss": 2.4803,
"step": 1106
},
{
"epoch": 0.9416267942583733,
"grad_norm": 0.2540951073169708,
"learning_rate": 1.7231362693427288e-06,
"loss": 2.5117,
"step": 1107
},
{
"epoch": 0.9424774056353004,
"grad_norm": 0.26120516657829285,
"learning_rate": 1.6736910039939157e-06,
"loss": 2.3744,
"step": 1108
},
{
"epoch": 0.9433280170122276,
"grad_norm": 0.2617899477481842,
"learning_rate": 1.62495953137225e-06,
"loss": 2.4657,
"step": 1109
},
{
"epoch": 0.9441786283891547,
"grad_norm": 0.29227423667907715,
"learning_rate": 1.576942205240317e-06,
"loss": 2.4297,
"step": 1110
},
{
"epoch": 0.9450292397660819,
"grad_norm": 0.2886480689048767,
"learning_rate": 1.5296393741764391e-06,
"loss": 2.5686,
"step": 1111
},
{
"epoch": 0.945879851143009,
"grad_norm": 0.27778005599975586,
"learning_rate": 1.4830513815720759e-06,
"loss": 2.5925,
"step": 1112
},
{
"epoch": 0.9467304625199362,
"grad_norm": 0.2743171155452728,
"learning_rate": 1.4371785656294046e-06,
"loss": 2.6989,
"step": 1113
},
{
"epoch": 0.9475810738968634,
"grad_norm": 0.27193981409072876,
"learning_rate": 1.392021259358811e-06,
"loss": 2.457,
"step": 1114
},
{
"epoch": 0.9484316852737905,
"grad_norm": 0.2713122069835663,
"learning_rate": 1.3475797905764809e-06,
"loss": 2.6029,
"step": 1115
},
{
"epoch": 0.9492822966507177,
"grad_norm": 0.25593551993370056,
"learning_rate": 1.303854481902067e-06,
"loss": 2.6974,
"step": 1116
},
{
"epoch": 0.9501329080276448,
"grad_norm": 0.26953616738319397,
"learning_rate": 1.2608456507562705e-06,
"loss": 2.6234,
"step": 1117
},
{
"epoch": 0.950983519404572,
"grad_norm": 0.28958868980407715,
"learning_rate": 1.2185536093585747e-06,
"loss": 2.6315,
"step": 1118
},
{
"epoch": 0.9518341307814993,
"grad_norm": 0.263473242521286,
"learning_rate": 1.1769786647250147e-06,
"loss": 2.5595,
"step": 1119
},
{
"epoch": 0.9526847421584264,
"grad_norm": 0.26081758737564087,
"learning_rate": 1.1361211186658894e-06,
"loss": 2.4885,
"step": 1120
},
{
"epoch": 0.9535353535353536,
"grad_norm": 0.275305837392807,
"learning_rate": 1.0959812677835968e-06,
"loss": 2.794,
"step": 1121
},
{
"epoch": 0.9543859649122807,
"grad_norm": 0.28377169370651245,
"learning_rate": 1.0565594034704918e-06,
"loss": 2.7399,
"step": 1122
},
{
"epoch": 0.9552365762892079,
"grad_norm": 0.282060831785202,
"learning_rate": 1.0178558119067315e-06,
"loss": 2.4815,
"step": 1123
},
{
"epoch": 0.956087187666135,
"grad_norm": 0.2975423038005829,
"learning_rate": 9.798707740582447e-07,
"loss": 2.5839,
"step": 1124
},
{
"epoch": 0.9569377990430622,
"grad_norm": 0.3122364580631256,
"learning_rate": 9.42604565674654e-07,
"loss": 2.6381,
"step": 1125
},
{
"epoch": 0.9577884104199894,
"grad_norm": 0.3378431797027588,
"learning_rate": 9.060574572873237e-07,
"loss": 2.5897,
"step": 1126
},
{
"epoch": 0.9586390217969165,
"grad_norm": 0.33718162775039673,
"learning_rate": 8.702297142073379e-07,
"loss": 2.689,
"step": 1127
},
{
"epoch": 0.9594896331738437,
"grad_norm": 0.34144076704978943,
"learning_rate": 8.351215965235915e-07,
"loss": 2.4692,
"step": 1128
},
{
"epoch": 0.9603402445507708,
"grad_norm": 0.3324032425880432,
"learning_rate": 8.007333591009358e-07,
"loss": 2.653,
"step": 1129
},
{
"epoch": 0.961190855927698,
"grad_norm": 0.3601853847503662,
"learning_rate": 7.670652515782917e-07,
"loss": 2.5995,
"step": 1130
},
{
"epoch": 0.9620414673046253,
"grad_norm": 0.3701609969139099,
"learning_rate": 7.341175183668503e-07,
"loss": 2.6668,
"step": 1131
},
{
"epoch": 0.9628920786815524,
"grad_norm": 0.37852516770362854,
"learning_rate": 7.018903986483083e-07,
"loss": 2.7223,
"step": 1132
},
{
"epoch": 0.9637426900584796,
"grad_norm": 0.391939640045166,
"learning_rate": 6.703841263730914e-07,
"loss": 2.7923,
"step": 1133
},
{
"epoch": 0.9645933014354067,
"grad_norm": 0.38707369565963745,
"learning_rate": 6.395989302587113e-07,
"loss": 2.6445,
"step": 1134
},
{
"epoch": 0.9654439128123339,
"grad_norm": 0.4049709141254425,
"learning_rate": 6.095350337880667e-07,
"loss": 2.7038,
"step": 1135
},
{
"epoch": 0.966294524189261,
"grad_norm": 0.41350454092025757,
"learning_rate": 5.801926552078563e-07,
"loss": 2.6472,
"step": 1136
},
{
"epoch": 0.9671451355661882,
"grad_norm": 0.42945706844329834,
"learning_rate": 5.515720075269348e-07,
"loss": 2.7802,
"step": 1137
},
{
"epoch": 0.9679957469431154,
"grad_norm": 0.4451653063297272,
"learning_rate": 5.236732985148374e-07,
"loss": 2.7633,
"step": 1138
},
{
"epoch": 0.9688463583200425,
"grad_norm": 0.46683037281036377,
"learning_rate": 4.964967307002244e-07,
"loss": 2.8216,
"step": 1139
},
{
"epoch": 0.9696969696969697,
"grad_norm": 0.4562697112560272,
"learning_rate": 4.7004250136940543e-07,
"loss": 2.7924,
"step": 1140
},
{
"epoch": 0.9705475810738968,
"grad_norm": 0.48352983593940735,
"learning_rate": 4.443108025649623e-07,
"loss": 2.5207,
"step": 1141
},
{
"epoch": 0.971398192450824,
"grad_norm": 0.4667017459869385,
"learning_rate": 4.193018210843058e-07,
"loss": 2.6552,
"step": 1142
},
{
"epoch": 0.9722488038277513,
"grad_norm": 0.49360570311546326,
"learning_rate": 3.950157384783104e-07,
"loss": 2.7523,
"step": 1143
},
{
"epoch": 0.9730994152046784,
"grad_norm": 0.526669442653656,
"learning_rate": 3.714527310500371e-07,
"loss": 2.7332,
"step": 1144
},
{
"epoch": 0.9739500265816056,
"grad_norm": 0.5592300891876221,
"learning_rate": 3.486129698534457e-07,
"loss": 2.7264,
"step": 1145
},
{
"epoch": 0.9748006379585327,
"grad_norm": 0.5670251250267029,
"learning_rate": 3.264966206921294e-07,
"loss": 2.8538,
"step": 1146
},
{
"epoch": 0.9756512493354599,
"grad_norm": 0.5744861960411072,
"learning_rate": 3.0510384411812644e-07,
"loss": 2.6884,
"step": 1147
},
{
"epoch": 0.976501860712387,
"grad_norm": 0.6117346882820129,
"learning_rate": 2.844347954307325e-07,
"loss": 2.8192,
"step": 1148
},
{
"epoch": 0.9773524720893142,
"grad_norm": 0.7534440159797668,
"learning_rate": 2.644896246754236e-07,
"loss": 2.7148,
"step": 1149
},
{
"epoch": 0.9782030834662414,
"grad_norm": 0.7108234167098999,
"learning_rate": 2.452684766427349e-07,
"loss": 2.8751,
"step": 1150
},
{
"epoch": 0.9790536948431685,
"grad_norm": 0.264021635055542,
"learning_rate": 2.2677149086718364e-07,
"loss": 2.6035,
"step": 1151
},
{
"epoch": 0.9799043062200957,
"grad_norm": 0.2746794819831848,
"learning_rate": 2.0899880162630336e-07,
"loss": 2.5935,
"step": 1152
},
{
"epoch": 0.9807549175970228,
"grad_norm": 0.2742723226547241,
"learning_rate": 1.9195053793964468e-07,
"loss": 2.4199,
"step": 1153
},
{
"epoch": 0.98160552897395,
"grad_norm": 0.2900630235671997,
"learning_rate": 1.7562682356786487e-07,
"loss": 2.5906,
"step": 1154
},
{
"epoch": 0.9824561403508771,
"grad_norm": 0.26273003220558167,
"learning_rate": 1.6002777701175086e-07,
"loss": 2.4287,
"step": 1155
},
{
"epoch": 0.9833067517278044,
"grad_norm": 0.28278976678848267,
"learning_rate": 1.451535115114866e-07,
"loss": 2.6875,
"step": 1156
},
{
"epoch": 0.9841573631047316,
"grad_norm": 0.2644931674003601,
"learning_rate": 1.310041350457092e-07,
"loss": 2.5806,
"step": 1157
},
{
"epoch": 0.9850079744816587,
"grad_norm": 0.26039785146713257,
"learning_rate": 1.1757975033078739e-07,
"loss": 2.2813,
"step": 1158
},
{
"epoch": 0.9858585858585859,
"grad_norm": 0.27614063024520874,
"learning_rate": 1.0488045482008879e-07,
"loss": 2.6836,
"step": 1159
},
{
"epoch": 0.986709197235513,
"grad_norm": 0.27048394083976746,
"learning_rate": 9.29063407032249e-08,
"loss": 2.5862,
"step": 1160
},
{
"epoch": 0.9875598086124402,
"grad_norm": 0.28808289766311646,
"learning_rate": 8.16574949054072e-08,
"loss": 2.765,
"step": 1161
},
{
"epoch": 0.9884104199893674,
"grad_norm": 0.323574423789978,
"learning_rate": 7.113399908681429e-08,
"loss": 2.6432,
"step": 1162
},
{
"epoch": 0.9892610313662945,
"grad_norm": 0.3291761875152588,
"learning_rate": 6.133592964201463e-08,
"loss": 2.6037,
"step": 1163
},
{
"epoch": 0.9901116427432217,
"grad_norm": 0.345300555229187,
"learning_rate": 5.226335769936697e-08,
"loss": 2.6076,
"step": 1164
},
{
"epoch": 0.9909622541201488,
"grad_norm": 0.37476396560668945,
"learning_rate": 4.391634912056519e-08,
"loss": 2.8919,
"step": 1165
},
{
"epoch": 0.991812865497076,
"grad_norm": 0.36915716528892517,
"learning_rate": 3.629496450011649e-08,
"loss": 2.5514,
"step": 1166
},
{
"epoch": 0.9926634768740031,
"grad_norm": 0.3973197937011719,
"learning_rate": 2.9399259164897274e-08,
"loss": 2.8279,
"step": 1167
},
{
"epoch": 0.9935140882509303,
"grad_norm": 0.42460885643959045,
"learning_rate": 2.322928317378681e-08,
"loss": 2.5969,
"step": 1168
},
{
"epoch": 0.9943646996278576,
"grad_norm": 0.4290263056755066,
"learning_rate": 1.778508131728973e-08,
"loss": 2.7255,
"step": 1169
},
{
"epoch": 0.9952153110047847,
"grad_norm": 0.4642498195171356,
"learning_rate": 1.3066693117191886e-08,
"loss": 2.7869,
"step": 1170
},
{
"epoch": 0.9960659223817119,
"grad_norm": 0.5059999823570251,
"learning_rate": 9.074152826271665e-09,
"loss": 2.582,
"step": 1171
},
{
"epoch": 0.996916533758639,
"grad_norm": 0.49888238310813904,
"learning_rate": 5.807489428111268e-09,
"loss": 2.8126,
"step": 1172
},
{
"epoch": 0.9977671451355662,
"grad_norm": 0.5143745541572571,
"learning_rate": 3.2667266368080484e-09,
"loss": 2.7024,
"step": 1173
},
{
"epoch": 0.9986177565124934,
"grad_norm": 0.5905907154083252,
"learning_rate": 1.4518828968523857e-09,
"loss": 2.7198,
"step": 1174
},
{
"epoch": 0.9994683678894205,
"grad_norm": 0.6912564635276794,
"learning_rate": 3.629713829500503e-10,
"loss": 2.6901,
"step": 1175
},
{
"epoch": 1.0006379585326954,
"grad_norm": 1.383741021156311,
"learning_rate": 0.0,
"loss": 4.1919,
"step": 1176
},
{
"epoch": 1.0006379585326954,
"eval_loss": 2.6547293663024902,
"eval_runtime": 80.6568,
"eval_samples_per_second": 12.274,
"eval_steps_per_second": 6.137,
"step": 1176
}
],
"logging_steps": 1,
"max_steps": 1176,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 294,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.199442414922629e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}