{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0021770682148041,
  "eval_steps": 500,
  "global_step": 345,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.002902757619738752,
      "grad_norm": 0.5301488637924194,
      "learning_rate": 0.00019999585400705652,
      "loss": 5.4533,
      "step": 1
    },
    {
      "epoch": 0.005805515239477504,
      "grad_norm": 0.5975003838539124,
      "learning_rate": 0.00019998341637201124,
      "loss": 4.7975,
      "step": 2
    },
    {
      "epoch": 0.008708272859216255,
      "grad_norm": 0.8240943551063538,
      "learning_rate": 0.00019996268812619107,
      "loss": 4.9359,
      "step": 3
    },
    {
      "epoch": 0.011611030478955007,
      "grad_norm": 0.8740971684455872,
      "learning_rate": 0.00019993367098837926,
      "loss": 4.4682,
      "step": 4
    },
    {
      "epoch": 0.01451378809869376,
      "grad_norm": 1.309985637664795,
      "learning_rate": 0.00019989636736467278,
      "loss": 5.2548,
      "step": 5
    },
    {
      "epoch": 0.01741654571843251,
      "grad_norm": 1.2016607522964478,
      "learning_rate": 0.0001998507803482828,
      "loss": 4.8472,
      "step": 6
    },
    {
      "epoch": 0.020319303338171262,
      "grad_norm": 1.1827248334884644,
      "learning_rate": 0.00019979691371927832,
      "loss": 5.2928,
      "step": 7
    },
    {
      "epoch": 0.023222060957910014,
      "grad_norm": 1.7223974466323853,
      "learning_rate": 0.00019973477194427266,
      "loss": 4.7192,
      "step": 8
    },
    {
      "epoch": 0.026124818577648767,
      "grad_norm": 1.4475376605987549,
      "learning_rate": 0.00019966436017605297,
      "loss": 4.7133,
      "step": 9
    },
    {
      "epoch": 0.02902757619738752,
      "grad_norm": 2.1703498363494873,
      "learning_rate": 0.00019958568425315314,
      "loss": 4.4146,
      "step": 10
    },
    {
      "epoch": 0.03193033381712627,
      "grad_norm": 1.7314109802246094,
      "learning_rate": 0.0001994987506993696,
      "loss": 4.2274,
      "step": 11
    },
    {
      "epoch": 0.03483309143686502,
      "grad_norm": 2.2317986488342285,
      "learning_rate": 0.00019940356672322037,
      "loss": 4.4908,
      "step": 12
    },
    {
      "epoch": 0.03773584905660377,
      "grad_norm": 2.3612048625946045,
      "learning_rate": 0.00019930014021734733,
      "loss": 4.2928,
      "step": 13
    },
    {
      "epoch": 0.040638606676342524,
      "grad_norm": NaN,
      "learning_rate": 0.00019930014021734733,
      "loss": 4.3084,
      "step": 14
    },
    {
      "epoch": 0.04354136429608128,
      "grad_norm": 2.0613327026367188,
      "learning_rate": 0.0001991884797578617,
      "loss": 3.9954,
      "step": 15
    },
    {
      "epoch": 0.04644412191582003,
      "grad_norm": 2.3426692485809326,
      "learning_rate": 0.00019906859460363307,
      "loss": 4.5533,
      "step": 16
    },
    {
      "epoch": 0.04934687953555878,
      "grad_norm": 2.8758199214935303,
      "learning_rate": 0.00019894049469552152,
      "loss": 3.4729,
      "step": 17
    },
    {
      "epoch": 0.05224963715529753,
      "grad_norm": 2.3996334075927734,
      "learning_rate": 0.0001988041906555533,
      "loss": 4.2112,
      "step": 18
    },
    {
      "epoch": 0.055152394775036286,
      "grad_norm": 1.8049657344818115,
      "learning_rate": 0.0001986596937860402,
      "loss": 3.4162,
      "step": 19
    },
    {
      "epoch": 0.05805515239477504,
      "grad_norm": 2.456997871398926,
      "learning_rate": 0.00019850701606864224,
      "loss": 4.234,
      "step": 20
    },
    {
      "epoch": 0.06095791001451379,
      "grad_norm": 2.200556755065918,
      "learning_rate": 0.0001983461701633742,
      "loss": 3.79,
      "step": 21
    },
    {
      "epoch": 0.06386066763425254,
      "grad_norm": 2.045299768447876,
      "learning_rate": 0.00019817716940755586,
      "loss": 4.2698,
      "step": 22
    },
    {
      "epoch": 0.06676342525399129,
      "grad_norm": 1.7035149335861206,
      "learning_rate": 0.000198000027814706,
      "loss": 3.8499,
      "step": 23
    },
    {
      "epoch": 0.06966618287373004,
      "grad_norm": 2.4540529251098633,
      "learning_rate": 0.00019781476007338058,
      "loss": 4.9429,
      "step": 24
    },
    {
      "epoch": 0.07256894049346879,
      "grad_norm": 1.9538823366165161,
      "learning_rate": 0.00019762138154595446,
      "loss": 4.2875,
      "step": 25
    },
    {
      "epoch": 0.07547169811320754,
      "grad_norm": 2.1658666133880615,
      "learning_rate": 0.00019741990826734794,
      "loss": 4.0588,
      "step": 26
    },
    {
      "epoch": 0.0783744557329463,
      "grad_norm": 1.6644055843353271,
      "learning_rate": 0.00019721035694369673,
      "loss": 3.7266,
      "step": 27
    },
    {
      "epoch": 0.08127721335268505,
      "grad_norm": 2.193331480026245,
      "learning_rate": 0.00019699274495096712,
      "loss": 3.9445,
      "step": 28
    },
    {
      "epoch": 0.0841799709724238,
      "grad_norm": 2.3478739261627197,
      "learning_rate": 0.00019676709033351482,
      "loss": 3.5157,
      "step": 29
    },
    {
      "epoch": 0.08708272859216255,
      "grad_norm": 2.0770201683044434,
      "learning_rate": 0.0001965334118025888,
      "loss": 3.5606,
      "step": 30
    },
    {
      "epoch": 0.0899854862119013,
      "grad_norm": 2.276620864868164,
      "learning_rate": 0.00019629172873477995,
      "loss": 3.7209,
      "step": 31
    },
    {
      "epoch": 0.09288824383164006,
      "grad_norm": 2.3815758228302,
      "learning_rate": 0.0001960420611704141,
      "loss": 4.2123,
      "step": 32
    },
    {
      "epoch": 0.09579100145137881,
      "grad_norm": 1.987587809562683,
      "learning_rate": 0.0001957844298118904,
      "loss": 3.7037,
      "step": 33
    },
    {
      "epoch": 0.09869375907111756,
      "grad_norm": 1.8462159633636475,
      "learning_rate": 0.0001955188560219648,
      "loss": 3.0063,
      "step": 34
    },
    {
      "epoch": 0.10159651669085631,
      "grad_norm": 1.7328358888626099,
      "learning_rate": 0.0001952453618219785,
      "loss": 4.1731,
      "step": 35
    },
    {
      "epoch": 0.10449927431059507,
      "grad_norm": 2.9112722873687744,
      "learning_rate": 0.00019496396989003193,
      "loss": 4.0481,
      "step": 36
    },
    {
      "epoch": 0.10740203193033382,
      "grad_norm": 2.2112295627593994,
      "learning_rate": 0.00019467470355910438,
      "loss": 4.5963,
      "step": 37
    },
    {
      "epoch": 0.11030478955007257,
      "grad_norm": 2.1279897689819336,
      "learning_rate": 0.0001943775868151192,
      "loss": 3.4653,
      "step": 38
    },
    {
      "epoch": 0.11320754716981132,
      "grad_norm": 2.1699769496917725,
      "learning_rate": 0.00019407264429495484,
      "loss": 4.4511,
      "step": 39
    },
    {
      "epoch": 0.11611030478955008,
      "grad_norm": 1.7325927019119263,
      "learning_rate": 0.00019375990128440204,
      "loss": 4.1323,
      "step": 40
    },
    {
      "epoch": 0.11901306240928883,
      "grad_norm": 1.8565714359283447,
      "learning_rate": 0.00019343938371606712,
      "loss": 4.0433,
      "step": 41
    },
    {
      "epoch": 0.12191582002902758,
      "grad_norm": 1.9784877300262451,
      "learning_rate": 0.0001931111181672216,
      "loss": 3.3724,
      "step": 42
    },
    {
      "epoch": 0.12481857764876633,
      "grad_norm": 1.8009449243545532,
      "learning_rate": 0.00019277513185759844,
      "loss": 4.197,
      "step": 43
    },
    {
      "epoch": 0.12772133526850507,
      "grad_norm": 3.194614887237549,
      "learning_rate": 0.0001924314526471351,
      "loss": 4.0794,
      "step": 44
    },
    {
      "epoch": 0.13062409288824384,
      "grad_norm": 3.4294867515563965,
      "learning_rate": 0.00019208010903366306,
      "loss": 4.0895,
      "step": 45
    },
    {
      "epoch": 0.13352685050798258,
      "grad_norm": 2.3046109676361084,
      "learning_rate": 0.00019172113015054532,
      "loss": 4.2159,
      "step": 46
    },
    {
      "epoch": 0.13642960812772134,
      "grad_norm": 3.2261159420013428,
      "learning_rate": 0.0001913545457642601,
      "loss": 3.2197,
      "step": 47
    },
    {
      "epoch": 0.13933236574746008,
      "grad_norm": 1.6862419843673706,
      "learning_rate": 0.00019098038627193302,
      "loss": 3.4144,
      "step": 48
    },
    {
      "epoch": 0.14223512336719885,
      "grad_norm": 2.0345373153686523,
      "learning_rate": 0.0001905986826988164,
      "loss": 3.106,
      "step": 49
    },
    {
      "epoch": 0.14513788098693758,
      "grad_norm": 2.1441516876220703,
      "learning_rate": 0.00019020946669571654,
      "loss": 3.979,
      "step": 50
    },
    {
      "epoch": 0.14804063860667635,
      "grad_norm": 2.6867835521698,
      "learning_rate": 0.0001898127705363696,
      "loss": 4.0657,
      "step": 51
    },
    {
      "epoch": 0.1509433962264151,
      "grad_norm": 2.0316073894500732,
      "learning_rate": 0.00018940862711476513,
      "loss": 3.9072,
      "step": 52
    },
    {
      "epoch": 0.15384615384615385,
      "grad_norm": 2.004814863204956,
      "learning_rate": 0.00018899706994241858,
      "loss": 4.1832,
      "step": 53
    },
    {
      "epoch": 0.1567489114658926,
      "grad_norm": 1.810863971710205,
      "learning_rate": 0.00018857813314559257,
      "loss": 3.3366,
      "step": 54
    },
    {
      "epoch": 0.15965166908563136,
      "grad_norm": 2.068857192993164,
      "learning_rate": 0.00018815185146246716,
      "loss": 4.1484,
      "step": 55
    },
    {
      "epoch": 0.1625544267053701,
      "grad_norm": 1.900846242904663,
      "learning_rate": 0.00018771826024025946,
      "loss": 3.1681,
      "step": 56
    },
    {
      "epoch": 0.16545718432510886,
      "grad_norm": 2.1605849266052246,
      "learning_rate": 0.00018727739543229231,
      "loss": 3.1671,
      "step": 57
    },
    {
      "epoch": 0.1683599419448476,
      "grad_norm": 1.944718360900879,
      "learning_rate": 0.00018682929359501338,
      "loss": 4.5958,
      "step": 58
    },
    {
      "epoch": 0.17126269956458637,
      "grad_norm": 2.9172914028167725,
      "learning_rate": 0.00018637399188496382,
      "loss": 4.122,
      "step": 59
    },
    {
      "epoch": 0.1741654571843251,
      "grad_norm": 2.346954822540283,
      "learning_rate": 0.00018591152805569715,
      "loss": 4.1201,
      "step": 60
    },
    {
      "epoch": 0.17706821480406387,
      "grad_norm": 2.2824630737304688,
      "learning_rate": 0.00018544194045464886,
      "loss": 4.2025,
      "step": 61
    },
    {
      "epoch": 0.1799709724238026,
      "grad_norm": 1.8054004907608032,
      "learning_rate": 0.0001849652680199565,
      "loss": 3.6063,
      "step": 62
    },
    {
      "epoch": 0.18287373004354138,
      "grad_norm": 2.1201300621032715,
      "learning_rate": 0.0001844815502772311,
      "loss": 3.5376,
      "step": 63
    },
    {
      "epoch": 0.18577648766328012,
      "grad_norm": 1.7177382707595825,
      "learning_rate": 0.00018399082733627965,
      "loss": 3.7342,
      "step": 64
    },
    {
      "epoch": 0.18867924528301888,
      "grad_norm": 2.9193296432495117,
      "learning_rate": 0.00018349313988777914,
      "loss": 2.5638,
      "step": 65
    },
    {
      "epoch": 0.19158200290275762,
      "grad_norm": 1.9819329977035522,
      "learning_rate": 0.00018298852919990252,
      "loss": 4.2545,
      "step": 66
    },
    {
      "epoch": 0.19448476052249636,
      "grad_norm": 1.8844672441482544,
      "learning_rate": 0.00018247703711489686,
      "loss": 3.6233,
      "step": 67
    },
    {
      "epoch": 0.19738751814223512,
      "grad_norm": 1.8098646402359009,
      "learning_rate": 0.00018195870604561365,
      "loss": 3.9222,
      "step": 68
    },
    {
      "epoch": 0.20029027576197386,
      "grad_norm": 2.1591079235076904,
      "learning_rate": 0.000181433578971992,
      "loss": 3.7097,
      "step": 69
    },
    {
      "epoch": 0.20319303338171263,
      "grad_norm": 2.3508942127227783,
      "learning_rate": 0.00018090169943749476,
      "loss": 3.7016,
      "step": 70
    },
    {
      "epoch": 0.20609579100145137,
      "grad_norm": 1.943665623664856,
      "learning_rate": 0.00018036311154549784,
      "loss": 3.5324,
      "step": 71
    },
    {
      "epoch": 0.20899854862119013,
      "grad_norm": 1.8940976858139038,
      "learning_rate": 0.00017981785995563324,
      "loss": 3.9551,
      "step": 72
    },
    {
      "epoch": 0.21190130624092887,
      "grad_norm": 2.0404138565063477,
      "learning_rate": 0.00017926598988008582,
      "loss": 3.3151,
      "step": 73
    },
    {
      "epoch": 0.21480406386066764,
      "grad_norm": 2.0190603733062744,
      "learning_rate": 0.00017870754707984443,
      "loss": 4.3073,
      "step": 74
    },
    {
      "epoch": 0.21770682148040638,
      "grad_norm": 1.989651083946228,
      "learning_rate": 0.00017814257786090719,
      "loss": 3.1581,
      "step": 75
    },
    {
      "epoch": 0.22060957910014514,
      "grad_norm": 3.1509041786193848,
      "learning_rate": 0.000177571129070442,
      "loss": 3.8772,
      "step": 76
    },
    {
      "epoch": 0.22351233671988388,
      "grad_norm": 1.903363585472107,
      "learning_rate": 0.00017699324809290193,
      "loss": 4.1305,
      "step": 77
    },
    {
      "epoch": 0.22641509433962265,
      "grad_norm": 2.1415135860443115,
      "learning_rate": 0.00017640898284609612,
      "loss": 4.2865,
      "step": 78
    },
    {
      "epoch": 0.22931785195936139,
      "grad_norm": 1.6867640018463135,
      "learning_rate": 0.0001758183817772163,
      "loss": 2.6165,
      "step": 79
    },
    {
      "epoch": 0.23222060957910015,
      "grad_norm": 1.9801138639450073,
      "learning_rate": 0.0001752214938588198,
      "loss": 4.0186,
      "step": 80
    },
    {
      "epoch": 0.2351233671988389,
      "grad_norm": 2.25994610786438,
      "learning_rate": 0.00017461836858476856,
      "loss": 3.8012,
      "step": 81
    },
    {
      "epoch": 0.23802612481857766,
      "grad_norm": 3.3158185482025146,
      "learning_rate": 0.0001740090559661252,
      "loss": 3.2479,
      "step": 82
    },
    {
      "epoch": 0.2409288824383164,
      "grad_norm": 2.139110803604126,
      "learning_rate": 0.00017339360652700604,
      "loss": 2.6925,
      "step": 83
    },
    {
      "epoch": 0.24383164005805516,
      "grad_norm": 1.8995939493179321,
      "learning_rate": 0.00017277207130039174,
      "loss": 4.1114,
      "step": 84
    },
    {
      "epoch": 0.2467343976777939,
      "grad_norm": 2.1001484394073486,
      "learning_rate": 0.00017214450182389559,
      "loss": 4.0802,
      "step": 85
    },
    {
      "epoch": 0.24963715529753266,
      "grad_norm": 1.6680461168289185,
      "learning_rate": 0.00017151095013548994,
      "loss": 3.1914,
      "step": 86
    },
    {
      "epoch": 0.2525399129172714,
      "grad_norm": 1.978389859199524,
      "learning_rate": 0.00017087146876919144,
      "loss": 3.858,
      "step": 87
    },
    {
      "epoch": 0.25544267053701014,
      "grad_norm": 1.8887652158737183,
      "learning_rate": 0.00017022611075070474,
      "loss": 3.5546,
      "step": 88
    },
    {
      "epoch": 0.25834542815674894,
      "grad_norm": 2.8925201892852783,
      "learning_rate": 0.00016957492959302558,
      "loss": 4.478,
      "step": 89
    },
    {
      "epoch": 0.2612481857764877,
      "grad_norm": 1.920861005783081,
      "learning_rate": 0.00016891797929200375,
      "loss": 4.2126,
      "step": 90
    },
    {
      "epoch": 0.2641509433962264,
      "grad_norm": 1.6321172714233398,
      "learning_rate": 0.00016825531432186543,
      "loss": 3.0669,
      "step": 91
    },
    {
      "epoch": 0.26705370101596515,
      "grad_norm": 2.127535343170166,
      "learning_rate": 0.00016758698963069643,
      "loss": 3.0706,
      "step": 92
    },
    {
      "epoch": 0.26995645863570394,
      "grad_norm": 2.0623557567596436,
      "learning_rate": 0.00016691306063588583,
      "loss": 4.0167,
      "step": 93
    },
    {
      "epoch": 0.2728592162554427,
      "grad_norm": 1.740623950958252,
      "learning_rate": 0.00016623358321953078,
      "loss": 3.4032,
      "step": 94
    },
    {
      "epoch": 0.2757619738751814,
      "grad_norm": 2.192186117172241,
      "learning_rate": 0.00016554861372380272,
      "loss": 3.6432,
      "step": 95
    },
    {
      "epoch": 0.27866473149492016,
      "grad_norm": 3.0152950286865234,
      "learning_rate": 0.0001648582089462756,
      "loss": 3.1592,
      "step": 96
    },
    {
      "epoch": 0.28156748911465895,
      "grad_norm": 1.8867627382278442,
      "learning_rate": 0.0001641624261352161,
      "loss": 3.3498,
      "step": 97
    },
    {
      "epoch": 0.2844702467343977,
      "grad_norm": 1.9052848815917969,
      "learning_rate": 0.00016346132298483676,
      "loss": 3.1272,
      "step": 98
    },
    {
      "epoch": 0.28737300435413643,
      "grad_norm": 1.7073307037353516,
      "learning_rate": 0.00016275495763051184,
      "loss": 3.206,
      "step": 99
    },
    {
      "epoch": 0.29027576197387517,
      "grad_norm": 2.7498321533203125,
      "learning_rate": 0.00016204338864395684,
      "loss": 3.2865,
      "step": 100
    },
    {
      "epoch": 0.2931785195936139,
      "grad_norm": 1.8562026023864746,
      "learning_rate": 0.00016132667502837165,
      "loss": 3.2549,
      "step": 101
    },
    {
      "epoch": 0.2960812772133527,
      "grad_norm": 1.724124789237976,
      "learning_rate": 0.00016060487621354815,
      "loss": 3.6638,
      "step": 102
    },
    {
      "epoch": 0.29898403483309144,
      "grad_norm": 1.7688038349151611,
      "learning_rate": 0.00015987805205094227,
      "loss": 2.7772,
      "step": 103
    },
    {
      "epoch": 0.3018867924528302,
      "grad_norm": 2.1941487789154053,
      "learning_rate": 0.0001591462628087109,
      "loss": 2.8096,
      "step": 104
    },
    {
      "epoch": 0.3047895500725689,
      "grad_norm": 1.7136414051055908,
      "learning_rate": 0.00015840956916671477,
      "loss": 3.4411,
      "step": 105
    },
    {
      "epoch": 0.3076923076923077,
      "grad_norm": 2.4751169681549072,
      "learning_rate": 0.00015766803221148673,
      "loss": 2.9833,
      "step": 106
    },
    {
      "epoch": 0.31059506531204645,
      "grad_norm": 2.0611205101013184,
      "learning_rate": 0.00015692171343116638,
      "loss": 2.6663,
      "step": 107
    },
    {
      "epoch": 0.3134978229317852,
      "grad_norm": 1.6866419315338135,
      "learning_rate": 0.00015617067471040174,
      "loss": 3.2627,
      "step": 108
    },
    {
      "epoch": 0.3164005805515239,
      "grad_norm": 1.548632025718689,
      "learning_rate": 0.0001554149783252175,
      "loss": 2.8767,
      "step": 109
    },
    {
      "epoch": 0.3193033381712627,
      "grad_norm": 1.8421952724456787,
      "learning_rate": 0.00015465468693785125,
      "loss": 3.7856,
      "step": 110
    },
    {
      "epoch": 0.32220609579100146,
      "grad_norm": 1.7316609621047974,
      "learning_rate": 0.00015388986359155758,
      "loss": 4.3054,
      "step": 111
    },
    {
      "epoch": 0.3251088534107402,
      "grad_norm": 2.4119129180908203,
      "learning_rate": 0.00015312057170538035,
      "loss": 3.9081,
      "step": 112
    },
    {
      "epoch": 0.32801161103047893,
      "grad_norm": 1.9937965869903564,
      "learning_rate": 0.00015234687506889428,
      "loss": 4.6076,
      "step": 113
    },
    {
      "epoch": 0.3309143686502177,
      "grad_norm": 1.777130126953125,
      "learning_rate": 0.0001515688378369152,
      "loss": 2.5866,
      "step": 114
    },
    {
      "epoch": 0.33381712626995647,
      "grad_norm": 2.239431142807007,
      "learning_rate": 0.00015078652452418063,
      "loss": 3.2308,
      "step": 115
    },
    {
      "epoch": 0.3367198838896952,
      "grad_norm": NaN,
      "learning_rate": 0.00015078652452418063,
      "loss": 3.0439,
      "step": 116
    },
    {
      "epoch": 0.33962264150943394,
      "grad_norm": 2.8517825603485107,
      "learning_rate": 0.00015000000000000001,
      "loss": 3.546,
      "step": 117
    },
    {
      "epoch": 0.34252539912917274,
      "grad_norm": 2.105649948120117,
      "learning_rate": 0.00014920932948287593,
      "loss": 3.2135,
      "step": 118
    },
    {
      "epoch": 0.3454281567489115,
      "grad_norm": NaN,
      "learning_rate": 0.00014920932948287593,
      "loss": 3.8332,
      "step": 119
    },
    {
      "epoch": 0.3483309143686502,
      "grad_norm": NaN,
      "learning_rate": 0.00014920932948287593,
      "loss": 3.4586,
      "step": 120
    },
    {
      "epoch": 0.35123367198838895,
      "grad_norm": 5.404326915740967,
      "learning_rate": 0.00014841457853509606,
      "loss": 3.331,
      "step": 121
    },
    {
      "epoch": 0.35413642960812775,
      "grad_norm": 2.6620254516601562,
      "learning_rate": 0.00014761581305729684,
      "loss": 3.9836,
      "step": 122
    },
    {
      "epoch": 0.3570391872278665,
      "grad_norm": 2.549010753631592,
      "learning_rate": 0.00014681309928299893,
      "loss": 3.7899,
      "step": 123
    },
    {
      "epoch": 0.3599419448476052,
      "grad_norm": 3.8975048065185547,
      "learning_rate": 0.00014600650377311522,
      "loss": 3.5173,
      "step": 124
    },
    {
      "epoch": 0.36284470246734396,
      "grad_norm": 5.37324857711792,
      "learning_rate": 0.00014519609341043157,
      "loss": 3.0372,
      "step": 125
    },
    {
      "epoch": 0.36574746008708275,
      "grad_norm": 1.9681342840194702,
      "learning_rate": 0.00014438193539406089,
      "loss": 3.5476,
      "step": 126
    },
    {
      "epoch": 0.3686502177068215,
      "grad_norm": 1.8248546123504639,
      "learning_rate": 0.0001435640972338709,
      "loss": 3.7966,
      "step": 127
    },
    {
      "epoch": 0.37155297532656023,
      "grad_norm": 1.9447401762008667,
      "learning_rate": 0.00014274264674488658,
      "loss": 3.7259,
      "step": 128
    },
    {
      "epoch": 0.37445573294629897,
      "grad_norm": 1.9753526449203491,
      "learning_rate": 0.00014191765204166643,
      "loss": 3.6636,
      "step": 129
    },
    {
      "epoch": 0.37735849056603776,
      "grad_norm": 1.8528705835342407,
      "learning_rate": 0.00014108918153265485,
      "loss": 3.8717,
      "step": 130
    },
    {
      "epoch": 0.3802612481857765,
      "grad_norm": 1.6633983850479126,
      "learning_rate": 0.00014025730391450947,
      "loss": 3.6534,
      "step": 131
    },
    {
      "epoch": 0.38316400580551524,
      "grad_norm": 2.0460166931152344,
      "learning_rate": 0.00013942208816640505,
      "loss": 4.3184,
      "step": 132
    },
    {
      "epoch": 0.386066763425254,
      "grad_norm": 1.5878854990005493,
      "learning_rate": 0.00013858360354431355,
      "loss": 3.1587,
      "step": 133
    },
    {
      "epoch": 0.3889695210449927,
      "grad_norm": 2.3371992111206055,
      "learning_rate": 0.00013774191957526143,
      "loss": 2.9895,
      "step": 134
    },
    {
      "epoch": 0.3918722786647315,
      "grad_norm": 1.7218937873840332,
      "learning_rate": 0.00013689710605156472,
      "loss": 3.9084,
      "step": 135
    },
    {
      "epoch": 0.39477503628447025,
      "grad_norm": 2.266514539718628,
      "learning_rate": 0.00013604923302504147,
      "loss": 3.7989,
      "step": 136
    },
    {
      "epoch": 0.397677793904209,
      "grad_norm": 1.6445748805999756,
      "learning_rate": 0.00013519837080120346,
      "loss": 3.4014,
      "step": 137
    },
    {
      "epoch": 0.4005805515239477,
      "grad_norm": 1.972373127937317,
      "learning_rate": 0.00013434458993342614,
      "loss": 3.2058,
      "step": 138
    },
    {
      "epoch": 0.4034833091436865,
      "grad_norm": 2.3418309688568115,
      "learning_rate": 0.00013348796121709862,
      "loss": 4.0443,
      "step": 139
    },
    {
      "epoch": 0.40638606676342526,
      "grad_norm": 1.811594843864441,
      "learning_rate": 0.00013262855568375317,
      "loss": 3.5496,
      "step": 140
    },
    {
      "epoch": 0.409288824383164,
      "grad_norm": 1.8474693298339844,
      "learning_rate": 0.00013176644459517528,
      "loss": 3.6035,
      "step": 141
    },
    {
      "epoch": 0.41219158200290273,
      "grad_norm": 1.9336134195327759,
      "learning_rate": 0.00013090169943749476,
      "loss": 2.7298,
      "step": 142
    },
    {
      "epoch": 0.41509433962264153,
      "grad_norm": 1.8413362503051758,
      "learning_rate": 0.00013003439191525807,
      "loss": 2.8708,
      "step": 143
    },
    {
      "epoch": 0.41799709724238027,
      "grad_norm": 2.0277211666107178,
      "learning_rate": 0.0001291645939454825,
      "loss": 3.8391,
      "step": 144
    },
    {
      "epoch": 0.420899854862119,
      "grad_norm": 1.8813992738723755,
      "learning_rate": 0.000128292377651693,
      "loss": 3.4416,
      "step": 145
    },
    {
      "epoch": 0.42380261248185774,
      "grad_norm": 2.2389297485351562,
      "learning_rate": 0.00012741781535794154,
      "loss": 3.3343,
      "step": 146
    },
    {
      "epoch": 0.42670537010159654,
      "grad_norm": 2.1361331939697266,
      "learning_rate": 0.0001265409795828101,
      "loss": 3.6481,
      "step": 147
    },
    {
      "epoch": 0.4296081277213353,
      "grad_norm": 1.7442470788955688,
      "learning_rate": 0.00012566194303339739,
      "loss": 2.8798,
      "step": 148
    },
    {
      "epoch": 0.432510885341074,
      "grad_norm": 1.9861546754837036,
      "learning_rate": 0.00012478077859929,
      "loss": 2.6437,
      "step": 149
    },
    {
      "epoch": 0.43541364296081275,
      "grad_norm": 1.9143513441085815,
      "learning_rate": 0.0001238975593465185,
      "loss": 3.0054,
      "step": 150
    },
    {
      "epoch": 0.43831640058055155,
      "grad_norm": 1.910510778427124,
      "learning_rate": 0.00012301235851149865,
      "loss": 3.073,
      "step": 151
    },
    {
      "epoch": 0.4412191582002903,
      "grad_norm": 2.015235424041748,
      "learning_rate": 0.0001221252494949588,
      "loss": 3.3852,
      "step": 152
    },
    {
      "epoch": 0.444121915820029,
      "grad_norm": 2.3909735679626465,
      "learning_rate": 0.00012123630585585333,
      "loss": 3.6159,
      "step": 153
    },
    {
      "epoch": 0.44702467343976776,
      "grad_norm": 4.09874963760376,
      "learning_rate": 0.0001203456013052634,
      "loss": 3.8377,
      "step": 154
    },
    {
      "epoch": 0.44992743105950656,
      "grad_norm": 2.008082151412964,
      "learning_rate": 0.00011945320970028461,
      "loss": 3.3051,
      "step": 155
    },
    {
      "epoch": 0.4528301886792453,
      "grad_norm": 1.7395459413528442,
      "learning_rate": 0.00011855920503790292,
      "loss": 2.8138,
      "step": 156
    },
    {
      "epoch": 0.45573294629898403,
      "grad_norm": 3.456113815307617,
      "learning_rate": 0.00011766366144885877,
      "loss": 3.8382,
      "step": 157
    },
    {
      "epoch": 0.45863570391872277,
      "grad_norm": 1.6849101781845093,
      "learning_rate": 0.0001167666531915001,
      "loss": 3.2607,
      "step": 158
    },
    {
      "epoch": 0.46153846153846156,
      "grad_norm": 2.4074480533599854,
      "learning_rate": 0.00011586825464562514,
      "loss": 3.1549,
      "step": 159
    },
    {
      "epoch": 0.4644412191582003,
      "grad_norm": 1.906053900718689,
      "learning_rate": 0.00011496854030631443,
      "loss": 3.0266,
      "step": 160
    },
    {
      "epoch": 0.46734397677793904,
      "grad_norm": 3.594622850418091,
      "learning_rate": 0.00011406758477775406,
      "loss": 2.9502,
      "step": 161
    },
    {
      "epoch": 0.4702467343976778,
      "grad_norm": 1.7513110637664795,
      "learning_rate": 0.00011316546276704924,
      "loss": 3.0875,
      "step": 162
    },
    {
      "epoch": 0.4731494920174166,
      "grad_norm": 1.782333254814148,
      "learning_rate": 0.00011226224907802985,
      "loss": 3.1332,
      "step": 163
    },
    {
      "epoch": 0.4760522496371553,
      "grad_norm": 1.809478759765625,
      "learning_rate": 0.00011135801860504749,
      "loss": 3.6647,
      "step": 164
    },
    {
      "epoch": 0.47895500725689405,
      "grad_norm": 1.8948771953582764,
      "learning_rate": 0.00011045284632676536,
      "loss": 4.1531,
      "step": 165
    },
    {
      "epoch": 0.4818577648766328,
      "grad_norm": 2.1463427543640137,
      "learning_rate": 0.00010954680729994102,
      "loss": 3.9761,
      "step": 166
    },
    {
      "epoch": 0.4847605224963715,
      "grad_norm": 3.1157124042510986,
      "learning_rate": 0.00010863997665320272,
      "loss": 3.3557,
      "step": 167
    },
    {
      "epoch": 0.4876632801161103,
      "grad_norm": 1.641317367553711,
      "learning_rate": 0.0001077324295808197,
      "loss": 2.8117,
      "step": 168
    },
    {
      "epoch": 0.49056603773584906,
      "grad_norm": 2.0440993309020996,
      "learning_rate": 0.0001068242413364671,
      "loss": 3.7747,
      "step": 169
    },
    {
      "epoch": 0.4934687953555878,
      "grad_norm": 1.8725652694702148,
      "learning_rate": 0.00010591548722698599,
      "loss": 3.5484,
      "step": 170
    },
    {
      "epoch": 0.49637155297532654,
      "grad_norm": 2.0633366107940674,
      "learning_rate": 0.00010500624260613892,
      "loss": 3.1863,
      "step": 171
    },
    {
      "epoch": 0.49927431059506533,
      "grad_norm": 1.8762496709823608,
      "learning_rate": 0.00010409658286836143,
      "loss": 3.2581,
      "step": 172
    },
    {
      "epoch": 0.502177068214804,
      "grad_norm": 2.147141695022583,
      "learning_rate": 0.00010318658344251066,
      "loss": 3.5548,
      "step": 173
    },
    {
      "epoch": 0.5050798258345428,
      "grad_norm": 1.9856010675430298,
      "learning_rate": 0.00010227631978561056,
      "loss": 3.4,
      "step": 174
    },
    {
      "epoch": 0.5079825834542816,
      "grad_norm": 4.999744892120361,
      "learning_rate": 0.0001013658673765951,
      "loss": 3.1381,
      "step": 175
    },
    {
      "epoch": 0.5108853410740203,
      "grad_norm": 1.9928354024887085,
      "learning_rate": 0.00010045530171004955,
      "loss": 2.8732,
      "step": 176
    },
    {
      "epoch": 0.5137880986937591,
      "grad_norm": 1.749778389930725,
      "learning_rate": 9.954469828995045e-05,
      "loss": 3.6324,
      "step": 177
    },
    {
      "epoch": 0.5166908563134979,
      "grad_norm": 2.014143943786621,
      "learning_rate": 9.863413262340491e-05,
      "loss": 3.073,
      "step": 178
    },
    {
      "epoch": 0.5195936139332366,
      "grad_norm": 2.1828532218933105,
      "learning_rate": 9.772368021438943e-05,
      "loss": 3.5193,
      "step": 179
    },
    {
      "epoch": 0.5224963715529753,
      "grad_norm": 1.9171918630599976,
      "learning_rate": 9.681341655748934e-05,
      "loss": 3.6872,
      "step": 180
    },
    {
      "epoch": 0.525399129172714,
      "grad_norm": 2.2952675819396973,
      "learning_rate": 9.590341713163858e-05,
      "loss": 3.7747,
      "step": 181
    },
    {
      "epoch": 0.5283018867924528,
      "grad_norm": 2.325395345687866,
      "learning_rate": 9.499375739386112e-05,
      "loss": 3.6792,
      "step": 182
    },
    {
      "epoch": 0.5312046444121916,
      "grad_norm": 1.756514072418213,
      "learning_rate": 9.4084512773014e-05,
      "loss": 2.9106,
      "step": 183
    },
    {
      "epoch": 0.5341074020319303,
      "grad_norm": 1.7968791723251343,
      "learning_rate": 9.317575866353292e-05,
      "loss": 3.4895,
      "step": 184
    },
    {
      "epoch": 0.5370101596516691,
      "grad_norm": 2.017638921737671,
      "learning_rate": 9.226757041918033e-05,
      "loss": 3.3524,
      "step": 185
    },
    {
      "epoch": 0.5399129172714079,
      "grad_norm": 1.6511162519454956,
      "learning_rate": 9.136002334679731e-05,
      "loss": 2.5666,
      "step": 186
    },
    {
      "epoch": 0.5428156748911466,
      "grad_norm": 1.884466290473938,
      "learning_rate": 9.0453192700059e-05,
      "loss": 3.2128,
      "step": 187
    },
    {
      "epoch": 0.5457184325108854,
      "grad_norm": 2.771385669708252,
      "learning_rate": 8.954715367323468e-05,
      "loss": 4.2103,
      "step": 188
    },
    {
      "epoch": 0.548621190130624,
      "grad_norm": 1.8222163915634155,
      "learning_rate": 8.86419813949525e-05,
      "loss": 4.1274,
      "step": 189
    },
    {
      "epoch": 0.5515239477503628,
      "grad_norm": 1.7211194038391113,
      "learning_rate": 8.773775092197017e-05,
      "loss": 3.0317,
      "step": 190
    },
    {
      "epoch": 0.5544267053701016,
      "grad_norm": 2.3453516960144043,
      "learning_rate": 8.683453723295074e-05,
      "loss": 3.9362,
      "step": 191
    },
    {
      "epoch": 0.5573294629898403,
      "grad_norm": 1.9646939039230347,
      "learning_rate": 8.593241522224597e-05,
      "loss": 3.1403,
      "step": 192
    },
    {
      "epoch": 0.5602322206095791,
      "grad_norm": 1.6175512075424194,
      "learning_rate": 8.503145969368562e-05,
      "loss": 3.0328,
      "step": 193
    },
    {
      "epoch": 0.5631349782293179,
      "grad_norm": 1.8903875350952148,
      "learning_rate": 8.413174535437487e-05,
      "loss": 3.1679,
      "step": 194
    },
    {
      "epoch": 0.5660377358490566,
      "grad_norm": 1.789034128189087,
      "learning_rate": 8.323334680849992e-05,
      "loss": 2.8819,
      "step": 195
    },
    {
      "epoch": 0.5689404934687954,
      "grad_norm": 2.002990245819092,
      "learning_rate": 8.233633855114127e-05,
      "loss": 3.258,
      "step": 196
    },
    {
      "epoch": 0.5718432510885341,
      "grad_norm": 2.053255796432495,
      "learning_rate": 8.14407949620971e-05,
      "loss": 3.7645,
      "step": 197
    },
    {
      "epoch": 0.5747460087082729,
      "grad_norm": 2.13325834274292,
      "learning_rate": 8.054679029971541e-05,
      "loss": 2.9198,
      "step": 198
    },
    {
      "epoch": 0.5776487663280117,
      "grad_norm": 2.154493808746338,
      "learning_rate": 7.965439869473664e-05,
      "loss": 2.9222,
      "step": 199
    },
    {
      "epoch": 0.5805515239477503,
      "grad_norm": 1.912862777709961,
      "learning_rate": 7.87636941441467e-05,
      "loss": 4.0231,
      "step": 200
    },
    {
      "epoch": 0.5834542815674891,
      "grad_norm": 1.8815771341323853,
      "learning_rate": 7.787475050504125e-05,
      "loss": 2.7792,
      "step": 201
    },
    {
      "epoch": 0.5863570391872278,
      "grad_norm": 2.248081922531128,
      "learning_rate": 7.698764148850137e-05,
      "loss": 2.6916,
      "step": 202
    },
    {
      "epoch": 0.5892597968069666,
      "grad_norm": 1.8417608737945557,
      "learning_rate": 7.610244065348153e-05,
      "loss": 2.9318,
      "step": 203
    },
    {
      "epoch": 0.5921625544267054,
      "grad_norm": 2.505697250366211,
      "learning_rate": 7.521922140071002e-05,
      "loss": 3.8375,
      "step": 204
    },
    {
      "epoch": 0.5950653120464441,
      "grad_norm": 2.0701253414154053,
      "learning_rate": 7.433805696660266e-05,
      "loss": 3.4407,
      "step": 205
    },
    {
      "epoch": 0.5979680696661829,
      "grad_norm": 2.3337976932525635,
      "learning_rate": 7.34590204171899e-05,
      "loss": 3.9581,
      "step": 206
    },
    {
      "epoch": 0.6008708272859217,
      "grad_norm": 2.4665446281433105,
      "learning_rate": 7.258218464205848e-05,
      "loss": 3.5468,
      "step": 207
    },
    {
      "epoch": 0.6037735849056604,
      "grad_norm": 1.7483268976211548,
      "learning_rate": 7.170762234830699e-05,
      "loss": 2.8491,
      "step": 208
    },
    {
      "epoch": 0.6066763425253991,
      "grad_norm": 1.9214202165603638,
      "learning_rate": 7.08354060545175e-05,
      "loss": 3.1274,
      "step": 209
    },
    {
      "epoch": 0.6095791001451378,
      "grad_norm": 2.279972553253174,
      "learning_rate": 6.996560808474195e-05,
      "loss": 3.6062,
      "step": 210
    },
    {
      "epoch": 0.6124818577648766,
      "grad_norm": 2.0444631576538086,
      "learning_rate": 6.909830056250527e-05,
      "loss": 3.5751,
      "step": 211
    },
    {
      "epoch": 0.6153846153846154,
      "grad_norm": 2.0278406143188477,
      "learning_rate": 6.823355540482475e-05,
      "loss": 2.8403,
      "step": 212
    },
    {
      "epoch": 0.6182873730043541,
      "grad_norm": 5.514923095703125,
      "learning_rate": 6.737144431624687e-05,
      "loss": 3.4911,
      "step": 213
    },
    {
      "epoch": 0.6211901306240929,
      "grad_norm": 3.965879201889038,
      "learning_rate": 6.651203878290139e-05,
      "loss": 3.5421,
      "step": 214
    },
    {
      "epoch": 0.6240928882438317,
      "grad_norm": 1.7389204502105713,
      "learning_rate": 6.565541006657387e-05,
      "loss": 2.7665,
      "step": 215
    },
    {
      "epoch": 0.6269956458635704,
      "grad_norm": 1.7644435167312622,
      "learning_rate": 6.480162919879657e-05,
      "loss": 2.3306,
      "step": 216
    },
    {
      "epoch": 0.6298984034833092,
      "grad_norm": 1.799849033355713,
      "learning_rate": 6.395076697495854e-05,
      "loss": 3.06,
      "step": 217
    },
    {
      "epoch": 0.6328011611030478,
      "grad_norm": 1.7353590726852417,
      "learning_rate": 6.310289394843528e-05,
      "loss": 3.0691,
      "step": 218
    },
    {
      "epoch": 0.6357039187227866,
      "grad_norm": 1.8332058191299438,
      "learning_rate": 6.225808042473858e-05,
      "loss": 3.4982,
      "step": 219
    },
    {
      "epoch": 0.6386066763425254,
      "grad_norm": 2.136359691619873,
      "learning_rate": 6.141639645568646e-05,
      "loss": 3.3539,
      "step": 220
    },
    {
      "epoch": 0.6415094339622641,
      "grad_norm": 2.038928508758545,
      "learning_rate": 6.057791183359496e-05,
      "loss": 2.6658,
      "step": 221
    },
    {
      "epoch": 0.6444121915820029,
      "grad_norm": 2.400620222091675,
      "learning_rate": 5.974269608549052e-05,
      "loss": 3.4144,
      "step": 222
    },
    {
      "epoch": 0.6473149492017417,
      "grad_norm": 1.9838178157806396,
      "learning_rate": 5.8910818467345185e-05,
      "loss": 3.2745,
      "step": 223
    },
    {
      "epoch": 0.6502177068214804,
      "grad_norm": 1.9232710599899292,
      "learning_rate": 5.8082347958333625e-05,
      "loss": 3.5748,
      "step": 224
    },
    {
      "epoch": 0.6531204644412192,
      "grad_norm": 2.4304771423339844,
      "learning_rate": 5.725735325511343e-05,
      "loss": 3.3168,
      "step": 225
    },
    {
      "epoch": 0.6560232220609579,
      "grad_norm": 1.825479507446289,
      "learning_rate": 5.643590276612909e-05,
      "loss": 2.7848,
      "step": 226
    },
    {
      "epoch": 0.6589259796806967,
      "grad_norm": 2.0149223804473877,
      "learning_rate": 5.561806460593917e-05,
      "loss": 3.2352,
      "step": 227
    },
    {
      "epoch": 0.6618287373004355,
      "grad_norm": 2.0452849864959717,
      "learning_rate": 5.4803906589568476e-05,
      "loss": 3.2581,
      "step": 228
    },
    {
      "epoch": 0.6647314949201741,
      "grad_norm": 1.8912854194641113,
      "learning_rate": 5.399349622688479e-05,
      "loss": 3.1843,
      "step": 229
    },
    {
      "epoch": 0.6676342525399129,
      "grad_norm": 1.9609266519546509,
      "learning_rate": 5.3186900717001095e-05,
      "loss": 2.4325,
      "step": 230
    },
    {
      "epoch": 0.6705370101596516,
      "grad_norm": 2.2313523292541504,
      "learning_rate": 5.238418694270317e-05,
      "loss": 3.4058,
      "step": 231
    },
    {
      "epoch": 0.6734397677793904,
      "grad_norm": 2.4402058124542236,
      "learning_rate": 5.1585421464903994e-05,
      "loss": 3.9064,
      "step": 232
    },
    {
      "epoch": 0.6763425253991292,
      "grad_norm": 2.100404977798462,
      "learning_rate": 5.0790670517124097e-05,
      "loss": 3.6432,
      "step": 233
    },
    {
      "epoch": 0.6792452830188679,
      "grad_norm": 2.1355984210968018,
      "learning_rate": 5.000000000000002e-05,
      "loss": 3.58,
      "step": 234
    },
    {
      "epoch": 0.6821480406386067,
      "grad_norm": 2.045910596847534,
      "learning_rate": 4.921347547581939e-05,
      "loss": 2.9068,
      "step": 235
    },
    {
      "epoch": 0.6850507982583455,
      "grad_norm": 2.569124460220337,
      "learning_rate": 4.843116216308483e-05,
      "loss": 2.9852,
      "step": 236
    },
    {
      "epoch": 0.6879535558780842,
      "grad_norm": 1.493397831916809,
      "learning_rate": 4.765312493110578e-05,
      "loss": 1.991,
      "step": 237
    },
    {
      "epoch": 0.690856313497823,
      "grad_norm": 1.9058390855789185,
      "learning_rate": 4.687942829461969e-05,
      "loss": 3.2437,
      "step": 238
    },
    {
      "epoch": 0.6937590711175616,
      "grad_norm": 2.2078254222869873,
      "learning_rate": 4.611013640844245e-05,
      "loss": 3.319,
      "step": 239
    },
    {
      "epoch": 0.6966618287373004,
      "grad_norm": 4.914015769958496,
      "learning_rate": 4.5345313062148776e-05,
      "loss": 3.2462,
      "step": 240
    },
    {
      "epoch": 0.6995645863570392,
      "grad_norm": 2.244297742843628,
      "learning_rate": 4.4585021674782534e-05,
      "loss": 3.66,
      "step": 241
    },
    {
      "epoch": 0.7024673439767779,
      "grad_norm": 2.599207639694214,
      "learning_rate": 4.38293252895983e-05,
      "loss": 2.5752,
      "step": 242
    },
    {
      "epoch": 0.7053701015965167,
      "grad_norm": 1.918351411819458,
      "learning_rate": 4.3078286568833614e-05,
      "loss": 3.905,
      "step": 243
    },
    {
      "epoch": 0.7082728592162555,
      "grad_norm": 1.9592002630233765,
      "learning_rate": 4.2331967788513295e-05,
      "loss": 2.9771,
      "step": 244
    },
    {
      "epoch": 0.7111756168359942,
      "grad_norm": 1.787062168121338,
      "learning_rate": 4.159043083328521e-05,
      "loss": 2.4677,
      "step": 245
    },
    {
      "epoch": 0.714078374455733,
      "grad_norm": 1.8812865018844604,
      "learning_rate": 4.0853737191289096e-05,
      "loss": 3.6701,
      "step": 246
    },
    {
      "epoch": 0.7169811320754716,
      "grad_norm": 1.9155601263046265,
      "learning_rate": 4.012194794905775e-05,
      "loss": 3.6807,
      "step": 247
    },
    {
      "epoch": 0.7198838896952104,
      "grad_norm": 2.015004873275757,
      "learning_rate": 3.939512378645185e-05,
      "loss": 2.9783,
      "step": 248
    },
    {
      "epoch": 0.7227866473149492,
      "grad_norm": 2.053408622741699,
      "learning_rate": 3.8673324971628357e-05,
      "loss": 2.8061,
      "step": 249
    },
    {
      "epoch": 0.7256894049346879,
      "grad_norm": 1.8491019010543823,
      "learning_rate": 3.795661135604319e-05,
      "loss": 3.2741,
      "step": 250
    },
    {
      "epoch": 0.7285921625544267,
      "grad_norm": 2.372168779373169,
      "learning_rate": 3.724504236948818e-05,
      "loss": 3.3095,
      "step": 251
    },
    {
      "epoch": 0.7314949201741655,
      "grad_norm": 2.0113255977630615,
      "learning_rate": 3.653867701516326e-05,
      "loss": 3.5256,
      "step": 252
    },
    {
      "epoch": 0.7343976777939042,
      "grad_norm": 1.9517208337783813,
      "learning_rate": 3.583757386478389e-05,
      "loss": 2.9625,
      "step": 253
    },
    {
      "epoch": 0.737300435413643,
      "grad_norm": 2.208834171295166,
      "learning_rate": 3.5141791053724405e-05,
      "loss": 3.0578,
      "step": 254
    },
    {
      "epoch": 0.7402031930333817,
      "grad_norm": 2.307220458984375,
      "learning_rate": 3.4451386276197293e-05,
      "loss": 2.9855,
      "step": 255
    },
    {
      "epoch": 0.7431059506531205,
      "grad_norm": 2.1939680576324463,
      "learning_rate": 3.3766416780469256e-05,
      "loss": 3.673,
      "step": 256
    },
    {
      "epoch": 0.7460087082728593,
      "grad_norm": 1.9280527830123901,
      "learning_rate": 3.308693936411421e-05,
      "loss": 3.0642,
      "step": 257
    },
    {
      "epoch": 0.7489114658925979,
      "grad_norm": 2.047974109649658,
      "learning_rate": 3.2413010369303584e-05,
      "loss": 3.1728,
      "step": 258
    },
    {
      "epoch": 0.7518142235123367,
      "grad_norm": 2.1966168880462646,
      "learning_rate": 3.174468567813461e-05,
      "loss": 3.2277,
      "step": 259
    },
    {
      "epoch": 0.7547169811320755,
      "grad_norm": 2.072453022003174,
      "learning_rate": 3.108202070799626e-05,
      "loss": 3.3533,
      "step": 260
    },
    {
      "epoch": 0.7576197387518142,
      "grad_norm": 1.9733140468597412,
      "learning_rate": 3.0425070406974455e-05,
      "loss": 2.9843,
      "step": 261
    },
    {
      "epoch": 0.760522496371553,
      "grad_norm": 2.302907943725586,
      "learning_rate": 2.9773889249295294e-05,
      "loss": 3.1157,
      "step": 262
    },
    {
      "epoch": 0.7634252539912917,
      "grad_norm": 1.9516576528549194,
      "learning_rate": 2.9128531230808576e-05,
      "loss": 3.4501,
      "step": 263
    },
    {
      "epoch": 0.7663280116110305,
      "grad_norm": 1.9993865489959717,
      "learning_rate": 2.8489049864510054e-05,
      "loss": 3.5931,
      "step": 264
    },
    {
      "epoch": 0.7692307692307693,
      "grad_norm": 2.091517686843872,
      "learning_rate": 2.7855498176104434e-05,
      "loss": 2.202,
      "step": 265
    },
    {
      "epoch": 0.772133526850508,
      "grad_norm": 2.672689199447632,
      "learning_rate": 2.7227928699608263e-05,
      "loss": 3.4568,
      "step": 266
    },
    {
      "epoch": 0.7750362844702468,
      "grad_norm": 2.0529282093048096,
      "learning_rate": 2.6606393472993973e-05,
      "loss": 3.4287,
      "step": 267
    },
    {
      "epoch": 0.7779390420899854,
      "grad_norm": 1.8243032693862915,
      "learning_rate": 2.599094403387481e-05,
      "loss": 2.9586,
      "step": 268
    },
    {
      "epoch": 0.7808417997097242,
      "grad_norm": 2.381425619125366,
      "learning_rate": 2.5381631415231454e-05,
      "loss": 3.6723,
      "step": 269
    },
    {
      "epoch": 0.783744557329463,
      "grad_norm": 3.504389524459839,
      "learning_rate": 2.4778506141180236e-05,
      "loss": 4.3296,
      "step": 270
    },
    {
      "epoch": 0.7866473149492017,
      "grad_norm": 1.7428265810012817,
      "learning_rate": 2.418161822278374e-05,
      "loss": 3.037,
      "step": 271
    },
    {
      "epoch": 0.7895500725689405,
      "grad_norm": 2.81032133102417,
      "learning_rate": 2.3591017153903916e-05,
      "loss": 3.1645,
      "step": 272
    },
    {
      "epoch": 0.7924528301886793,
      "grad_norm": 2.1162257194519043,
      "learning_rate": 2.300675190709809e-05,
      "loss": 3.2709,
      "step": 273
    },
    {
      "epoch": 0.795355587808418,
      "grad_norm": 1.9196466207504272,
      "learning_rate": 2.242887092955801e-05,
      "loss": 3.6456,
      "step": 274
    },
    {
      "epoch": 0.7982583454281568,
      "grad_norm": 2.2060110569000244,
      "learning_rate": 2.1857422139092865e-05,
      "loss": 3.068,
      "step": 275
    },
    {
      "epoch": 0.8011611030478955,
      "grad_norm": 1.857069492340088,
      "learning_rate": 2.1292452920155592e-05,
      "loss": 3.251,
      "step": 276
    },
    {
      "epoch": 0.8040638606676342,
      "grad_norm": 3.120304584503174,
      "learning_rate": 2.0734010119914192e-05,
      "loss": 3.1381,
      "step": 277
    },
    {
      "epoch": 0.806966618287373,
      "grad_norm": 2.208164930343628,
      "learning_rate": 2.018214004436677e-05,
      "loss": 3.0816,
      "step": 278
    },
    {
      "epoch": 0.8098693759071117,
      "grad_norm": 1.976894736289978,
      "learning_rate": 1.9636888454502178e-05,
      "loss": 2.719,
      "step": 279
    },
    {
      "epoch": 0.8127721335268505,
      "grad_norm": 2.5784201622009277,
      "learning_rate": 1.9098300562505266e-05,
      "loss": 3.1057,
      "step": 280
    },
    {
      "epoch": 0.8156748911465893,
      "grad_norm": 2.8327383995056152,
      "learning_rate": 1.8566421028008018e-05,
      "loss": 3.8255,
      "step": 281
    },
    {
      "epoch": 0.818577648766328,
      "grad_norm": 2.2047767639160156,
      "learning_rate": 1.804129395438635e-05,
      "loss": 3.244,
      "step": 282
    },
    {
      "epoch": 0.8214804063860668,
      "grad_norm": 2.8230714797973633,
      "learning_rate": 1.7522962885103145e-05,
      "loss": 3.0961,
      "step": 283
    },
    {
      "epoch": 0.8243831640058055,
      "grad_norm": 2.201507091522217,
      "learning_rate": 1.7011470800097496e-05,
      "loss": 2.6894,
      "step": 284
    },
    {
      "epoch": 0.8272859216255443,
      "grad_norm": 1.9765757322311401,
      "learning_rate": 1.65068601122209e-05,
      "loss": 3.136,
      "step": 285
    },
    {
      "epoch": 0.8301886792452831,
      "grad_norm": 2.06833815574646,
      "learning_rate": 1.600917266372035e-05,
      "loss": 3.6098,
      "step": 286
    },
    {
      "epoch": 0.8330914368650217,
      "grad_norm": 2.757883310317993,
      "learning_rate": 1.5518449722768892e-05,
      "loss": 4.0251,
      "step": 287
    },
    {
      "epoch": 0.8359941944847605,
      "grad_norm": 2.0423471927642822,
      "learning_rate": 1.5034731980043515e-05,
      "loss": 3.1681,
      "step": 288
    },
    {
      "epoch": 0.8388969521044993,
      "grad_norm": 2.4651999473571777,
      "learning_rate": 1.4558059545351143e-05,
      "loss": 3.2775,
      "step": 289
    },
    {
      "epoch": 0.841799709724238,
      "grad_norm": 1.7521671056747437,
      "learning_rate": 1.4088471944302861e-05,
      "loss": 2.355,
      "step": 290
    },
    {
      "epoch": 0.8447024673439768,
      "grad_norm": 1.9442704916000366,
      "learning_rate": 1.3626008115036181e-05,
      "loss": 3.1105,
      "step": 291
    },
    {
      "epoch": 0.8476052249637155,
      "grad_norm": 2.1002039909362793,
      "learning_rate": 1.3170706404986644e-05,
      "loss": 3.6593,
      "step": 292
    },
    {
      "epoch": 0.8505079825834543,
      "grad_norm": 1.7552965879440308,
      "learning_rate": 1.2722604567707719e-05,
      "loss": 2.6157,
      "step": 293
    },
    {
      "epoch": 0.8534107402031931,
      "grad_norm": 1.8941353559494019,
      "learning_rate": 1.2281739759740574e-05,
      "loss": 3.2914,
      "step": 294
    },
    {
      "epoch": 0.8563134978229318,
      "grad_norm": 2.11254620552063,
      "learning_rate": 1.1848148537532843e-05,
      "loss": 3.2055,
      "step": 295
    },
    {
      "epoch": 0.8592162554426706,
      "grad_norm": 2.3030812740325928,
      "learning_rate": 1.142186685440747e-05,
      "loss": 2.8077,
      "step": 296
    },
    {
      "epoch": 0.8621190130624092,
      "grad_norm": 1.9629480838775635,
      "learning_rate": 1.100293005758145e-05,
      "loss": 2.3917,
      "step": 297
    },
    {
      "epoch": 0.865021770682148,
      "grad_norm": 1.9289971590042114,
      "learning_rate": 1.0591372885234885e-05,
      "loss": 3.2658,
      "step": 298
    },
    {
      "epoch": 0.8679245283018868,
      "grad_norm": 1.8033056259155273,
      "learning_rate": 1.01872294636304e-05,
      "loss": 3.2022,
      "step": 299
    },
    {
      "epoch": 0.8708272859216255,
      "grad_norm": 1.87389075756073,
      "learning_rate": 9.790533304283478e-06,
      "loss": 2.6739,
      "step": 300
    },
    {
      "epoch": 0.8737300435413643,
      "grad_norm": 2.6886935234069824,
      "learning_rate": 9.401317301183655e-06,
      "loss": 3.1875,
      "step": 301
    },
    {
      "epoch": 0.8766328011611031,
      "grad_norm": 2.1857502460479736,
      "learning_rate": 9.019613728067e-06,
      "loss": 2.8756,
      "step": 302
    },
    {
      "epoch": 0.8795355587808418,
      "grad_norm": 2.1285061836242676,
      "learning_rate": 8.645454235739903e-06,
      "loss": 3.2116,
      "step": 303
    },
    {
      "epoch": 0.8824383164005806,
      "grad_norm": 2.7644810676574707,
      "learning_rate": 8.278869849454718e-06,
      "loss": 3.0152,
      "step": 304
    },
    {
      "epoch": 0.8853410740203193,
      "grad_norm": 1.9984538555145264,
      "learning_rate": 7.91989096633693e-06,
      "loss": 2.7286,
      "step": 305
    },
    {
      "epoch": 0.888243831640058,
      "grad_norm": 1.859739899635315,
      "learning_rate": 7.568547352864941e-06,
      "loss": 2.9108,
      "step": 306
    },
    {
      "epoch": 0.8911465892597968,
      "grad_norm": 1.783887505531311,
      "learning_rate": 7.224868142401542e-06,
      "loss": 2.7539,
      "step": 307
    },
    {
      "epoch": 0.8940493468795355,
      "grad_norm": 2.297299385070801,
      "learning_rate": 6.888881832778415e-06,
      "loss": 2.8574,
      "step": 308
    },
    {
      "epoch": 0.8969521044992743,
      "grad_norm": 2.203857898712158,
      "learning_rate": 6.560616283932897e-06,
      "loss": 3.6275,
      "step": 309
    },
    {
      "epoch": 0.8998548621190131,
      "grad_norm": 2.2782490253448486,
      "learning_rate": 6.240098715597975e-06,
      "loss": 3.1797,
      "step": 310
    },
    {
      "epoch": 0.9027576197387518,
      "grad_norm": 2.0081512928009033,
      "learning_rate": 5.927355705045179e-06,
      "loss": 3.09,
      "step": 311
    },
    {
      "epoch": 0.9056603773584906,
      "grad_norm": 2.6315252780914307,
      "learning_rate": 5.6224131848808144e-06,
      "loss": 2.8839,
      "step": 312
    },
    {
      "epoch": 0.9085631349782293,
      "grad_norm": 2.094134569168091,
      "learning_rate": 5.325296440895622e-06,
      "loss": 2.9956,
      "step": 313
    },
    {
      "epoch": 0.9114658925979681,
      "grad_norm": 2.017035484313965,
      "learning_rate": 5.036030109968082e-06,
      "loss": 2.6596,
      "step": 314
    },
    {
      "epoch": 0.9143686502177069,
      "grad_norm": 2.2012784481048584,
      "learning_rate": 4.754638178021498e-06,
      "loss": 3.1305,
      "step": 315
    },
    {
      "epoch": 0.9172714078374455,
      "grad_norm": 1.8841356039047241,
      "learning_rate": 4.481143978035196e-06,
      "loss": 3.0464,
      "step": 316
    },
    {
      "epoch": 0.9201741654571843,
      "grad_norm": 2.4728565216064453,
      "learning_rate": 4.2155701881096075e-06,
      "loss": 2.7735,
      "step": 317
    },
    {
      "epoch": 0.9230769230769231,
      "grad_norm": 2.1314468383789062,
      "learning_rate": 3.95793882958595e-06,
      "loss": 3.3511,
      "step": 318
    },
    {
      "epoch": 0.9259796806966618,
      "grad_norm": 1.9269267320632935,
      "learning_rate": 3.7082712652200867e-06,
      "loss": 3.282,
      "step": 319
    },
    {
      "epoch": 0.9288824383164006,
      "grad_norm": 1.958406925201416,
      "learning_rate": 3.4665881974112026e-06,
      "loss": 2.8489,
      "step": 320
    },
    {
      "epoch": 0.9317851959361393,
      "grad_norm": 2.2147128582000732,
      "learning_rate": 3.2329096664852064e-06,
      "loss": 3.6156,
      "step": 321
    },
    {
      "epoch": 0.9346879535558781,
      "grad_norm": 1.944659948348999,
      "learning_rate": 3.0072550490328753e-06,
      "loss": 3.2088,
      "step": 322
    },
    {
      "epoch": 0.9375907111756169,
      "grad_norm": 2.1794497966766357,
      "learning_rate": 2.7896430563032707e-06,
      "loss": 3.0827,
      "step": 323
    },
    {
      "epoch": 0.9404934687953556,
      "grad_norm": 2.2770931720733643,
      "learning_rate": 2.580091732652101e-06,
      "loss": 3.3405,
      "step": 324
    },
    {
      "epoch": 0.9433962264150944,
      "grad_norm": 2.1666173934936523,
      "learning_rate": 2.3786184540455448e-06,
      "loss": 2.7803,
      "step": 325
    },
    {
      "epoch": 0.9462989840348331,
      "grad_norm": 2.107891321182251,
      "learning_rate": 2.1852399266194314e-06,
      "loss": 2.7457,
      "step": 326
    },
    {
      "epoch": 0.9492017416545718,
      "grad_norm": 2.0576820373535156,
      "learning_rate": 1.9999721852939858e-06,
      "loss": 2.9182,
      "step": 327
    },
    {
      "epoch": 0.9521044992743106,
      "grad_norm": 3.305752992630005,
      "learning_rate": 1.822830592444147e-06,
      "loss": 3.8223,
      "step": 328
    },
    {
      "epoch": 0.9550072568940493,
      "grad_norm": 2.0414235591888428,
      "learning_rate": 1.6538298366257976e-06,
      "loss": 3.0047,
      "step": 329
    },
    {
      "epoch": 0.9579100145137881,
      "grad_norm": 2.361135721206665,
      "learning_rate": 1.4929839313577609e-06,
      "loss": 3.9007,
      "step": 330
    },
    {
      "epoch": 0.9608127721335269,
      "grad_norm": 1.976693034172058,
      "learning_rate": 1.3403062139598076e-06,
      "loss": 3.2631,
      "step": 331
    },
    {
      "epoch": 0.9637155297532656,
      "grad_norm": 1.9887497425079346,
      "learning_rate": 1.1958093444467079e-06,
      "loss": 3.5457,
      "step": 332
    },
    {
      "epoch": 0.9666182873730044,
      "grad_norm": 2.3717265129089355,
      "learning_rate": 1.059505304478503e-06,
      "loss": 3.2485,
      "step": 333
    },
    {
      "epoch": 0.969521044992743,
      "grad_norm": 1.9998297691345215,
      "learning_rate": 9.314053963669245e-07,
      "loss": 3.5795,
      "step": 334
    },
    {
      "epoch": 0.9724238026124818,
      "grad_norm": 2.2316505908966064,
      "learning_rate": 8.115202421383083e-07,
      "loss": 3.2281,
      "step": 335
    },
    {
      "epoch": 0.9753265602322206,
      "grad_norm": 1.9784519672393799,
      "learning_rate": 6.998597826526898e-07,
      "loss": 3.3497,
      "step": 336
    },
    {
      "epoch": 0.9782293178519593,
      "grad_norm": 1.9381024837493896,
      "learning_rate": 5.964332767796399e-07,
      "loss": 2.771,
      "step": 337
    },
    {
      "epoch": 0.9811320754716981,
      "grad_norm": 2.6159842014312744,
      "learning_rate": 5.012493006304131e-07,
      "loss": 3.2164,
      "step": 338
    },
    {
      "epoch": 0.9840348330914369,
      "grad_norm": 2.405776262283325,
      "learning_rate": 4.143157468468717e-07,
      "loss": 2.6788,
      "step": 339
    },
    {
      "epoch": 0.9869375907111756,
      "grad_norm": 2.2898902893066406,
      "learning_rate": 3.3563982394704266e-07,
      "loss": 4.1156,
      "step": 340
    },
    {
      "epoch": 0.9898403483309144,
      "grad_norm": 2.2498302459716797,
      "learning_rate": 2.652280557273512e-07,
      "loss": 3.3784,
      "step": 341
    },
    {
      "epoch": 0.9927431059506531,
      "grad_norm": 2.19677996635437,
      "learning_rate": 2.030862807216649e-07,
      "loss": 3.2635,
      "step": 342
    },
    {
      "epoch": 0.9956458635703919,
      "grad_norm": 2.383686065673828,
      "learning_rate": 1.4921965171720287e-07,
      "loss": 3.2177,
      "step": 343
    },
    {
      "epoch": 0.9985486211901307,
      "grad_norm": 1.5856789350509644,
      "learning_rate": 1.0363263532724432e-07,
      "loss": 2.6107,
      "step": 344
    },
    {
      "epoch": 0.9985486211901307,
      "eval_loss": 0.8033239841461182,
      "eval_runtime": 13.4743,
      "eval_samples_per_second": 21.522,
      "eval_steps_per_second": 5.418,
      "step": 344
    },
    {
      "epoch": 1.0021770682148041,
      "grad_norm": 1.5614508390426636,
      "learning_rate": 6.632901162074711e-08,
      "loss": 2.3665,
      "step": 345
    },
    {
      "epoch": 1.0021770682148041,
      "eval_loss": 0.8033127188682556,
      "eval_runtime": 13.2595,
      "eval_samples_per_second": 21.871,
      "eval_steps_per_second": 5.506,
      "step": 345
    }
  ],
  "logging_steps": 1,
  "max_steps": 345,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7.435086402578022e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}