{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 780,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01,
      "grad_norm": 3.157794332025747,
      "learning_rate": 8.974358974358974e-08,
      "loss": 0.5858,
      "step": 1
    },
    {
      "epoch": 0.01,
      "grad_norm": 3.0242493468561227,
      "learning_rate": 1.7948717948717948e-07,
      "loss": 0.6092,
      "step": 2
    },
    {
      "epoch": 0.02,
      "grad_norm": 2.7745230799899288,
      "learning_rate": 2.692307692307692e-07,
      "loss": 0.5905,
      "step": 3
    },
    {
      "epoch": 0.03,
      "grad_norm": 3.0024697741882873,
      "learning_rate": 3.5897435897435896e-07,
      "loss": 0.6548,
      "step": 4
    },
    {
      "epoch": 0.03,
      "grad_norm": 3.131554528264668,
      "learning_rate": 4.4871794871794865e-07,
      "loss": 0.6324,
      "step": 5
    },
    {
      "epoch": 0.04,
      "grad_norm": 3.6251050835532377,
      "learning_rate": 5.384615384615384e-07,
      "loss": 0.5575,
      "step": 6
    },
    {
      "epoch": 0.04,
      "grad_norm": 3.114615782713657,
      "learning_rate": 6.282051282051282e-07,
      "loss": 0.5568,
      "step": 7
    },
    {
      "epoch": 0.05,
      "grad_norm": 2.883211828526437,
      "learning_rate": 7.179487179487179e-07,
      "loss": 0.6242,
      "step": 8
    },
    {
      "epoch": 0.06,
      "grad_norm": 3.097582117144788,
      "learning_rate": 8.076923076923077e-07,
      "loss": 0.5944,
      "step": 9
    },
    {
      "epoch": 0.06,
      "grad_norm": 2.8984916101799536,
      "learning_rate": 8.974358974358973e-07,
      "loss": 0.6359,
      "step": 10
    },
    {
      "epoch": 0.07,
      "grad_norm": 2.7500845254975426,
      "learning_rate": 9.871794871794872e-07,
      "loss": 0.6004,
      "step": 11
    },
    {
      "epoch": 0.08,
      "grad_norm": 2.9464098867975723,
      "learning_rate": 1.0769230769230769e-06,
      "loss": 0.6183,
      "step": 12
    },
    {
      "epoch": 0.08,
      "grad_norm": 2.742713392529882,
      "learning_rate": 1.1666666666666666e-06,
      "loss": 0.5473,
      "step": 13
    },
    {
      "epoch": 0.09,
      "grad_norm": 2.554893065802702,
      "learning_rate": 1.2564102564102565e-06,
      "loss": 0.5389,
      "step": 14
    },
    {
      "epoch": 0.1,
      "grad_norm": 2.8440244944229423,
      "learning_rate": 1.3461538461538462e-06,
      "loss": 0.563,
      "step": 15
    },
    {
      "epoch": 0.1,
      "grad_norm": 2.3051065909946815,
      "learning_rate": 1.4358974358974359e-06,
      "loss": 0.5832,
      "step": 16
    },
    {
      "epoch": 0.11,
      "grad_norm": 2.0186256795764828,
      "learning_rate": 1.5256410256410255e-06,
      "loss": 0.5626,
      "step": 17
    },
    {
      "epoch": 0.12,
      "grad_norm": 2.3684781031146307,
      "learning_rate": 1.6153846153846154e-06,
      "loss": 0.5593,
      "step": 18
    },
    {
      "epoch": 0.12,
      "grad_norm": 2.1296922237578837,
      "learning_rate": 1.7051282051282051e-06,
      "loss": 0.5083,
      "step": 19
    },
    {
      "epoch": 0.13,
      "grad_norm": 2.2333623615611975,
      "learning_rate": 1.7948717948717946e-06,
      "loss": 0.5701,
      "step": 20
    },
    {
      "epoch": 0.13,
      "grad_norm": 2.3716536064596365,
      "learning_rate": 1.8846153846153845e-06,
      "loss": 0.7116,
      "step": 21
    },
    {
      "epoch": 0.14,
      "grad_norm": 2.6249010351941537,
      "learning_rate": 1.9743589743589744e-06,
      "loss": 0.6667,
      "step": 22
    },
    {
      "epoch": 0.15,
      "grad_norm": 2.025777944483822,
      "learning_rate": 2.064102564102564e-06,
      "loss": 0.5018,
      "step": 23
    },
    {
      "epoch": 0.15,
      "grad_norm": 2.0694837392683163,
      "learning_rate": 2.1538461538461538e-06,
      "loss": 0.5145,
      "step": 24
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.7977920012457864,
      "learning_rate": 2.243589743589744e-06,
      "loss": 0.4688,
      "step": 25
    },
    {
      "epoch": 0.17,
      "grad_norm": 2.030335865996782,
      "learning_rate": 2.333333333333333e-06,
      "loss": 0.4719,
      "step": 26
    },
    {
      "epoch": 0.17,
      "grad_norm": 2.2391443374029514,
      "learning_rate": 2.423076923076923e-06,
      "loss": 0.5303,
      "step": 27
    },
    {
      "epoch": 0.18,
      "grad_norm": 1.9912422183097178,
      "learning_rate": 2.512820512820513e-06,
      "loss": 0.5431,
      "step": 28
    },
    {
      "epoch": 0.19,
      "grad_norm": 2.0255309552484095,
      "learning_rate": 2.6025641025641026e-06,
      "loss": 0.5824,
      "step": 29
    },
    {
      "epoch": 0.19,
      "grad_norm": 1.8770215122908283,
      "learning_rate": 2.6923076923076923e-06,
      "loss": 0.5082,
      "step": 30
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.98939906714483,
      "learning_rate": 2.782051282051282e-06,
      "loss": 0.5154,
      "step": 31
    },
    {
      "epoch": 0.21,
      "grad_norm": 1.6816545538392116,
      "learning_rate": 2.8717948717948717e-06,
      "loss": 0.4833,
      "step": 32
    },
    {
      "epoch": 0.21,
      "grad_norm": 1.8708630053016435,
      "learning_rate": 2.9615384615384614e-06,
      "loss": 0.4298,
      "step": 33
    },
    {
      "epoch": 0.22,
      "grad_norm": 1.804255585106581,
      "learning_rate": 3.051282051282051e-06,
      "loss": 0.4966,
      "step": 34
    },
    {
      "epoch": 0.22,
      "grad_norm": 2.1939960340576983,
      "learning_rate": 3.141025641025641e-06,
      "loss": 0.5959,
      "step": 35
    },
    {
      "epoch": 0.23,
      "grad_norm": 2.251434624865691,
      "learning_rate": 3.230769230769231e-06,
      "loss": 0.5375,
      "step": 36
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.729304380380877,
      "learning_rate": 3.32051282051282e-06,
      "loss": 0.454,
      "step": 37
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.78866126998266,
      "learning_rate": 3.4102564102564103e-06,
      "loss": 0.4974,
      "step": 38
    },
    {
      "epoch": 0.25,
      "grad_norm": 1.8070608359204237,
      "learning_rate": 3.5e-06,
      "loss": 0.5061,
      "step": 39
    },
    {
      "epoch": 0.26,
      "grad_norm": 1.7633109832646323,
      "learning_rate": 3.5897435897435892e-06,
      "loss": 0.5171,
      "step": 40
    },
    {
      "epoch": 0.26,
      "grad_norm": 1.7502966702710157,
      "learning_rate": 3.6794871794871797e-06,
      "loss": 0.5277,
      "step": 41
    },
    {
      "epoch": 0.27,
      "grad_norm": 1.7810880679374266,
      "learning_rate": 3.769230769230769e-06,
      "loss": 0.486,
      "step": 42
    },
    {
      "epoch": 0.28,
      "grad_norm": 1.7345248362598074,
      "learning_rate": 3.858974358974359e-06,
      "loss": 0.4403,
      "step": 43
    },
    {
      "epoch": 0.28,
      "grad_norm": 1.8544499121276545,
      "learning_rate": 3.948717948717949e-06,
      "loss": 0.4559,
      "step": 44
    },
    {
      "epoch": 0.29,
      "grad_norm": 1.9281302680981884,
      "learning_rate": 4.038461538461538e-06,
      "loss": 0.4892,
      "step": 45
    },
    {
      "epoch": 0.29,
      "grad_norm": 1.823077284882776,
      "learning_rate": 4.128205128205128e-06,
      "loss": 0.4578,
      "step": 46
    },
    {
      "epoch": 0.3,
      "grad_norm": 1.7087373941464228,
      "learning_rate": 4.217948717948718e-06,
      "loss": 0.4312,
      "step": 47
    },
    {
      "epoch": 0.31,
      "grad_norm": 1.7936679868143406,
      "learning_rate": 4.3076923076923076e-06,
      "loss": 0.4268,
      "step": 48
    },
    {
      "epoch": 0.31,
      "grad_norm": 1.7913865129800892,
      "learning_rate": 4.397435897435897e-06,
      "loss": 0.4978,
      "step": 49
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.680234009753019,
      "learning_rate": 4.487179487179488e-06,
      "loss": 0.3909,
      "step": 50
    },
    {
      "epoch": 0.33,
      "grad_norm": 1.7084054193024387,
      "learning_rate": 4.576923076923077e-06,
      "loss": 0.4556,
      "step": 51
    },
    {
      "epoch": 0.33,
      "grad_norm": 2.8283052735253307,
      "learning_rate": 4.666666666666666e-06,
      "loss": 0.6066,
      "step": 52
    },
    {
      "epoch": 0.34,
      "grad_norm": 1.7700007607445167,
      "learning_rate": 4.756410256410257e-06,
      "loss": 0.4966,
      "step": 53
    },
    {
      "epoch": 0.35,
      "grad_norm": 1.695503292082949,
      "learning_rate": 4.846153846153846e-06,
      "loss": 0.5229,
      "step": 54
    },
    {
      "epoch": 0.35,
      "grad_norm": 1.7249370659175542,
      "learning_rate": 4.935897435897436e-06,
      "loss": 0.4866,
      "step": 55
    },
    {
      "epoch": 0.36,
      "grad_norm": 1.6670150319109511,
      "learning_rate": 5.025641025641026e-06,
      "loss": 0.4372,
      "step": 56
    },
    {
      "epoch": 0.37,
      "grad_norm": 1.9001530129586224,
      "learning_rate": 5.115384615384615e-06,
      "loss": 0.4961,
      "step": 57
    },
    {
      "epoch": 0.37,
      "grad_norm": 1.8138821119614343,
      "learning_rate": 5.205128205128205e-06,
      "loss": 0.4956,
      "step": 58
    },
    {
      "epoch": 0.38,
      "grad_norm": 1.7702871650778012,
      "learning_rate": 5.294871794871795e-06,
      "loss": 0.4819,
      "step": 59
    },
    {
      "epoch": 0.38,
      "grad_norm": 1.6027880830820047,
      "learning_rate": 5.384615384615385e-06,
      "loss": 0.4691,
      "step": 60
    },
    {
      "epoch": 0.39,
      "grad_norm": 1.5406345693116157,
      "learning_rate": 5.474358974358974e-06,
      "loss": 0.3892,
      "step": 61
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.4960225687619988,
      "learning_rate": 5.564102564102564e-06,
      "loss": 0.4675,
      "step": 62
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.4165399860657466,
      "learning_rate": 5.653846153846154e-06,
      "loss": 0.3659,
      "step": 63
    },
    {
      "epoch": 0.41,
      "grad_norm": 1.783351454505062,
      "learning_rate": 5.743589743589743e-06,
      "loss": 0.5025,
      "step": 64
    },
    {
      "epoch": 0.42,
      "grad_norm": 1.7461313827694103,
      "learning_rate": 5.833333333333333e-06,
      "loss": 0.4599,
      "step": 65
    },
    {
      "epoch": 0.42,
      "grad_norm": 1.7177479188522022,
      "learning_rate": 5.923076923076923e-06,
      "loss": 0.4658,
      "step": 66
    },
    {
      "epoch": 0.43,
      "grad_norm": 1.7430319738026099,
      "learning_rate": 6.0128205128205125e-06,
      "loss": 0.5034,
      "step": 67
    },
    {
      "epoch": 0.44,
      "grad_norm": 1.8694656181485532,
      "learning_rate": 6.102564102564102e-06,
      "loss": 0.5335,
      "step": 68
    },
    {
      "epoch": 0.44,
      "grad_norm": 1.5949394110724386,
      "learning_rate": 6.192307692307692e-06,
      "loss": 0.4341,
      "step": 69
    },
    {
      "epoch": 0.45,
      "grad_norm": 2.0737191543993223,
      "learning_rate": 6.282051282051282e-06,
      "loss": 0.5444,
      "step": 70
    },
    {
      "epoch": 0.46,
      "grad_norm": 1.701318366396795,
      "learning_rate": 6.371794871794871e-06,
      "loss": 0.4897,
      "step": 71
    },
    {
      "epoch": 0.46,
      "grad_norm": 1.5564495848821291,
      "learning_rate": 6.461538461538462e-06,
      "loss": 0.4486,
      "step": 72
    },
    {
      "epoch": 0.47,
      "grad_norm": 1.7515969674361473,
      "learning_rate": 6.5512820512820515e-06,
      "loss": 0.4836,
      "step": 73
    },
    {
      "epoch": 0.47,
      "grad_norm": 1.723669518863802,
      "learning_rate": 6.64102564102564e-06,
      "loss": 0.4881,
      "step": 74
    },
    {
      "epoch": 0.48,
      "grad_norm": 1.6232767387422016,
      "learning_rate": 6.730769230769231e-06,
      "loss": 0.4517,
      "step": 75
    },
    {
      "epoch": 0.49,
      "grad_norm": 1.6836950386453475,
      "learning_rate": 6.8205128205128205e-06,
      "loss": 0.453,
      "step": 76
    },
    {
      "epoch": 0.49,
      "grad_norm": 1.5970207533021141,
      "learning_rate": 6.91025641025641e-06,
      "loss": 0.4102,
      "step": 77
    },
    {
      "epoch": 0.5,
      "grad_norm": 1.806038689668669,
      "learning_rate": 7e-06,
      "loss": 0.4938,
      "step": 78
    },
    {
      "epoch": 0.51,
      "grad_norm": 1.682873077514678,
      "learning_rate": 6.9999649520318915e-06,
      "loss": 0.4656,
      "step": 79
    },
    {
      "epoch": 0.51,
      "grad_norm": 1.7622694283436713,
      "learning_rate": 6.999859808829483e-06,
      "loss": 0.3833,
      "step": 80
    },
    {
      "epoch": 0.52,
      "grad_norm": 1.7028611976071402,
      "learning_rate": 6.999684572498523e-06,
      "loss": 0.5228,
      "step": 81
    },
    {
      "epoch": 0.53,
      "grad_norm": 1.8078825168256631,
      "learning_rate": 6.999439246548541e-06,
      "loss": 0.4219,
      "step": 82
    },
    {
      "epoch": 0.53,
      "grad_norm": 1.5963387300318541,
      "learning_rate": 6.999123835892781e-06,
      "loss": 0.3838,
      "step": 83
    },
    {
      "epoch": 0.54,
      "grad_norm": 1.752439994185435,
      "learning_rate": 6.998738346848099e-06,
      "loss": 0.5353,
      "step": 84
    },
    {
      "epoch": 0.54,
      "grad_norm": 1.5765967217454646,
      "learning_rate": 6.998282787134845e-06,
      "loss": 0.4013,
      "step": 85
    },
    {
      "epoch": 0.55,
      "grad_norm": 1.64099993127281,
      "learning_rate": 6.997757165876698e-06,
      "loss": 0.5268,
      "step": 86
    },
    {
      "epoch": 0.56,
      "grad_norm": 1.6629395270016087,
      "learning_rate": 6.9971614936004935e-06,
      "loss": 0.4364,
      "step": 87
    },
    {
      "epoch": 0.56,
      "grad_norm": 1.7737143292943345,
      "learning_rate": 6.996495782236003e-06,
      "loss": 0.445,
      "step": 88
    },
    {
      "epoch": 0.57,
      "grad_norm": 1.7709235055203036,
      "learning_rate": 6.9957600451157e-06,
      "loss": 0.4809,
      "step": 89
    },
    {
      "epoch": 0.58,
      "grad_norm": 1.6973893875173904,
      "learning_rate": 6.9949542969744955e-06,
      "loss": 0.475,
      "step": 90
    },
    {
      "epoch": 0.58,
      "grad_norm": 1.6645616366762754,
      "learning_rate": 6.9940785539494385e-06,
      "loss": 0.4656,
      "step": 91
    },
    {
      "epoch": 0.59,
      "grad_norm": 1.6726681899225762,
      "learning_rate": 6.9931328335793926e-06,
      "loss": 0.436,
      "step": 92
    },
    {
      "epoch": 0.6,
      "grad_norm": 1.7236949440975784,
      "learning_rate": 6.992117154804688e-06,
      "loss": 0.474,
      "step": 93
    },
    {
      "epoch": 0.6,
      "grad_norm": 1.6637387868033124,
      "learning_rate": 6.991031537966741e-06,
      "loss": 0.421,
      "step": 94
    },
    {
      "epoch": 0.61,
      "grad_norm": 1.815724145981235,
      "learning_rate": 6.989876004807644e-06,
      "loss": 0.4889,
      "step": 95
    },
    {
      "epoch": 0.62,
      "grad_norm": 1.65090673754103,
      "learning_rate": 6.9886505784697354e-06,
      "loss": 0.4313,
      "step": 96
    },
    {
      "epoch": 0.62,
      "grad_norm": 1.7879715436757848,
      "learning_rate": 6.98735528349513e-06,
      "loss": 0.5158,
      "step": 97
    },
    {
      "epoch": 0.63,
      "grad_norm": 1.6116310457967884,
      "learning_rate": 6.985990145825233e-06,
      "loss": 0.4152,
      "step": 98
    },
    {
      "epoch": 0.63,
      "grad_norm": 1.7530216353063874,
      "learning_rate": 6.984555192800216e-06,
      "loss": 0.5415,
      "step": 99
    },
    {
      "epoch": 0.64,
      "grad_norm": 1.6028754704316188,
      "learning_rate": 6.983050453158471e-06,
      "loss": 0.4666,
      "step": 100
    },
    {
      "epoch": 0.65,
      "grad_norm": 1.6554540478959205,
      "learning_rate": 6.981475957036039e-06,
      "loss": 0.4338,
      "step": 101
    },
    {
      "epoch": 0.65,
      "grad_norm": 1.7244082720999658,
      "learning_rate": 6.979831735965997e-06,
      "loss": 0.3997,
      "step": 102
    },
    {
      "epoch": 0.66,
      "grad_norm": 1.8613328382698524,
      "learning_rate": 6.9781178228778385e-06,
      "loss": 0.4822,
      "step": 103
    },
    {
      "epoch": 0.67,
      "grad_norm": 1.7472563980348217,
      "learning_rate": 6.9763342520968e-06,
      "loss": 0.531,
      "step": 104
    },
    {
      "epoch": 0.67,
      "grad_norm": 1.7296385478020857,
      "learning_rate": 6.974481059343188e-06,
      "loss": 0.4556,
      "step": 105
    },
    {
      "epoch": 0.68,
      "grad_norm": 1.7421477412265767,
      "learning_rate": 6.972558281731655e-06,
      "loss": 0.4739,
      "step": 106
    },
    {
      "epoch": 0.69,
      "grad_norm": 1.8252883807781282,
      "learning_rate": 6.970565957770456e-06,
      "loss": 0.4603,
      "step": 107
    },
    {
      "epoch": 0.69,
      "grad_norm": 1.4893874441098909,
      "learning_rate": 6.96850412736068e-06,
      "loss": 0.4074,
      "step": 108
    },
    {
      "epoch": 0.7,
      "grad_norm": 1.706254344170796,
      "learning_rate": 6.9663728317954505e-06,
      "loss": 0.4931,
      "step": 109
    },
    {
      "epoch": 0.71,
      "grad_norm": 1.7729579454846884,
      "learning_rate": 6.9641721137591e-06,
      "loss": 0.5236,
      "step": 110
    },
    {
      "epoch": 0.71,
      "grad_norm": 1.673651236600404,
      "learning_rate": 6.961902017326311e-06,
      "loss": 0.4678,
      "step": 111
    },
    {
      "epoch": 0.72,
      "grad_norm": 1.6483821733028223,
      "learning_rate": 6.959562587961235e-06,
      "loss": 0.4539,
      "step": 112
    },
    {
      "epoch": 0.72,
      "grad_norm": 1.8334930429778364,
      "learning_rate": 6.9571538725165855e-06,
      "loss": 0.4598,
      "step": 113
    },
    {
      "epoch": 0.73,
      "grad_norm": 1.6259182099235296,
      "learning_rate": 6.9546759192326944e-06,
      "loss": 0.4618,
      "step": 114
    },
    {
      "epoch": 0.74,
      "grad_norm": 1.586763484875926,
      "learning_rate": 6.95212877773655e-06,
      "loss": 0.3916,
      "step": 115
    },
    {
      "epoch": 0.74,
      "grad_norm": 1.729278408278385,
      "learning_rate": 6.949512499040799e-06,
      "loss": 0.443,
      "step": 116
    },
    {
      "epoch": 0.75,
      "grad_norm": 1.7287260985135913,
      "learning_rate": 6.946827135542729e-06,
      "loss": 0.4058,
      "step": 117
    },
    {
      "epoch": 0.76,
      "grad_norm": 1.5105369132311874,
      "learning_rate": 6.944072741023215e-06,
      "loss": 0.3816,
      "step": 118
    },
    {
      "epoch": 0.76,
      "grad_norm": 1.7827477765014352,
      "learning_rate": 6.941249370645649e-06,
      "loss": 0.4411,
      "step": 119
    },
    {
      "epoch": 0.77,
      "grad_norm": 1.6865474969699326,
      "learning_rate": 6.938357080954826e-06,
      "loss": 0.4536,
      "step": 120
    },
    {
      "epoch": 0.78,
      "grad_norm": 1.6697703500975725,
      "learning_rate": 6.935395929875821e-06,
      "loss": 0.4773,
      "step": 121
    },
    {
      "epoch": 0.78,
      "grad_norm": 1.846095347111975,
      "learning_rate": 6.93236597671282e-06,
      "loss": 0.5273,
      "step": 122
    },
    {
      "epoch": 0.79,
      "grad_norm": 1.5599554260333042,
      "learning_rate": 6.929267282147936e-06,
      "loss": 0.4108,
      "step": 123
    },
    {
      "epoch": 0.79,
      "grad_norm": 1.5737374777092443,
      "learning_rate": 6.9260999082400014e-06,
      "loss": 0.4233,
      "step": 124
    },
    {
      "epoch": 0.8,
      "grad_norm": 1.7607156921341576,
      "learning_rate": 6.922863918423311e-06,
      "loss": 0.4391,
      "step": 125
    },
    {
      "epoch": 0.81,
      "grad_norm": 1.8081142800994836,
      "learning_rate": 6.91955937750636e-06,
      "loss": 0.5029,
      "step": 126
    },
    {
      "epoch": 0.81,
      "grad_norm": 1.6888032055738713,
      "learning_rate": 6.916186351670546e-06,
      "loss": 0.442,
      "step": 127
    },
    {
      "epoch": 0.82,
      "grad_norm": 1.59484869751713,
      "learning_rate": 6.912744908468841e-06,
      "loss": 0.4274,
      "step": 128
    },
    {
      "epoch": 0.83,
      "grad_norm": 1.5838918315915649,
      "learning_rate": 6.909235116824441e-06,
      "loss": 0.4862,
      "step": 129
    },
    {
      "epoch": 0.83,
      "grad_norm": 1.5793657946026944,
      "learning_rate": 6.905657047029383e-06,
      "loss": 0.4122,
      "step": 130
    },
    {
      "epoch": 0.84,
      "grad_norm": 1.8119234908825441,
      "learning_rate": 6.90201077074314e-06,
      "loss": 0.5385,
      "step": 131
    },
    {
      "epoch": 0.85,
      "grad_norm": 1.7221288311251346,
      "learning_rate": 6.898296360991182e-06,
      "loss": 0.4986,
      "step": 132
    },
    {
      "epoch": 0.85,
      "grad_norm": 1.6177348169551482,
      "learning_rate": 6.894513892163519e-06,
      "loss": 0.4351,
      "step": 133
    },
    {
      "epoch": 0.86,
      "grad_norm": 1.8266262713227248,
      "learning_rate": 6.890663440013204e-06,
      "loss": 0.4635,
      "step": 134
    },
    {
      "epoch": 0.87,
      "grad_norm": 1.5762526247077566,
      "learning_rate": 6.886745081654823e-06,
      "loss": 0.4404,
      "step": 135
    },
    {
      "epoch": 0.87,
      "grad_norm": 1.7127197602668496,
      "learning_rate": 6.882758895562948e-06,
      "loss": 0.4798,
      "step": 136
    },
    {
      "epoch": 0.88,
      "grad_norm": 1.7272250386199177,
      "learning_rate": 6.8787049615705635e-06,
      "loss": 0.4491,
      "step": 137
    },
    {
      "epoch": 0.88,
      "grad_norm": 1.7401938465087832,
      "learning_rate": 6.8745833608674685e-06,
      "loss": 0.513,
      "step": 138
    },
    {
      "epoch": 0.89,
      "grad_norm": 1.476494218502608,
      "learning_rate": 6.870394175998651e-06,
      "loss": 0.4126,
      "step": 139
    },
    {
      "epoch": 0.9,
      "grad_norm": 1.6735073474825053,
      "learning_rate": 6.866137490862636e-06,
      "loss": 0.4784,
      "step": 140
    },
    {
      "epoch": 0.9,
      "grad_norm": 1.61528219577023,
      "learning_rate": 6.861813390709803e-06,
      "loss": 0.3993,
      "step": 141
    },
    {
      "epoch": 0.91,
      "grad_norm": 1.7160841750089657,
      "learning_rate": 6.857421962140681e-06,
      "loss": 0.437,
      "step": 142
    },
    {
      "epoch": 0.92,
      "grad_norm": 1.6003971415765827,
      "learning_rate": 6.852963293104211e-06,
      "loss": 0.4234,
      "step": 143
    },
    {
      "epoch": 0.92,
      "grad_norm": 1.673821864143384,
      "learning_rate": 6.848437472895989e-06,
      "loss": 0.36,
      "step": 144
    },
    {
      "epoch": 0.93,
      "grad_norm": 1.5584858029998052,
      "learning_rate": 6.84384459215647e-06,
      "loss": 0.3831,
      "step": 145
    },
    {
      "epoch": 0.94,
      "grad_norm": 1.8434118860530437,
      "learning_rate": 6.839184742869166e-06,
      "loss": 0.481,
      "step": 146
    },
    {
      "epoch": 0.94,
      "grad_norm": 1.735764854231206,
      "learning_rate": 6.8344580183587866e-06,
      "loss": 0.4604,
      "step": 147
    },
    {
      "epoch": 0.95,
      "grad_norm": 1.5746871313601694,
      "learning_rate": 6.829664513289387e-06,
      "loss": 0.4481,
      "step": 148
    },
    {
      "epoch": 0.96,
      "grad_norm": 1.554650404317531,
      "learning_rate": 6.824804323662456e-06,
      "loss": 0.4246,
      "step": 149
    },
    {
      "epoch": 0.96,
      "grad_norm": 1.776675138837105,
      "learning_rate": 6.8198775468150085e-06,
      "loss": 0.505,
      "step": 150
    },
    {
      "epoch": 0.97,
      "grad_norm": 1.6556936569780774,
      "learning_rate": 6.814884281417627e-06,
      "loss": 0.4684,
      "step": 151
    },
    {
      "epoch": 0.97,
      "grad_norm": 1.6529466478362556,
      "learning_rate": 6.8098246274724835e-06,
      "loss": 0.4179,
      "step": 152
    },
    {
      "epoch": 0.98,
      "grad_norm": 1.5675006143134382,
      "learning_rate": 6.8046986863113455e-06,
      "loss": 0.3936,
      "step": 153
    },
    {
      "epoch": 0.99,
      "grad_norm": 1.543693186637755,
      "learning_rate": 6.7995065605935405e-06,
      "loss": 0.4343,
      "step": 154
    },
    {
      "epoch": 0.99,
      "grad_norm": 1.5960852724486323,
      "learning_rate": 6.7942483543039e-06,
      "loss": 0.4028,
      "step": 155
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.6707113583767172,
      "learning_rate": 6.788924172750679e-06,
      "loss": 0.4456,
      "step": 156
    },
    {
      "epoch": 1.01,
      "grad_norm": 1.592042298214689,
      "learning_rate": 6.783534122563447e-06,
      "loss": 0.3896,
      "step": 157
    },
    {
      "epoch": 1.01,
      "grad_norm": 1.594380804915327,
      "learning_rate": 6.7780783116909495e-06,
      "loss": 0.4269,
      "step": 158
    },
    {
      "epoch": 1.02,
      "grad_norm": 1.425175264752805,
      "learning_rate": 6.772556849398952e-06,
      "loss": 0.4136,
      "step": 159
    },
    {
      "epoch": 1.03,
      "grad_norm": 1.580030153001224,
      "learning_rate": 6.7669698462680434e-06,
      "loss": 0.4534,
      "step": 160
    },
    {
      "epoch": 1.03,
      "grad_norm": 1.656736559657467,
      "learning_rate": 6.761317414191428e-06,
      "loss": 0.4262,
      "step": 161
    },
    {
      "epoch": 1.04,
      "grad_norm": 1.5516325617950246,
      "learning_rate": 6.755599666372685e-06,
      "loss": 0.3525,
      "step": 162
    },
    {
      "epoch": 1.04,
      "grad_norm": 1.5964299616457083,
      "learning_rate": 6.749816717323493e-06,
      "loss": 0.3582,
      "step": 163
    },
    {
      "epoch": 1.05,
      "grad_norm": 1.5572727726198186,
      "learning_rate": 6.743968682861346e-06,
      "loss": 0.4277,
      "step": 164
    },
    {
      "epoch": 1.06,
      "grad_norm": 1.59926615487551,
      "learning_rate": 6.738055680107233e-06,
      "loss": 0.3878,
      "step": 165
    },
    {
      "epoch": 1.06,
      "grad_norm": 1.7037332135435532,
      "learning_rate": 6.7320778274832836e-06,
      "loss": 0.4137,
      "step": 166
    },
    {
      "epoch": 1.07,
      "grad_norm": 1.5513259799733465,
      "learning_rate": 6.726035244710406e-06,
      "loss": 0.4037,
      "step": 167
    },
    {
      "epoch": 1.08,
      "grad_norm": 1.6898460188300195,
      "learning_rate": 6.7199280528058844e-06,
      "loss": 0.3961,
      "step": 168
    },
    {
      "epoch": 1.08,
      "grad_norm": 1.5729523035551953,
      "learning_rate": 6.713756374080959e-06,
      "loss": 0.3434,
      "step": 169
    },
    {
      "epoch": 1.09,
      "grad_norm": 1.544572572819997,
      "learning_rate": 6.70752033213837e-06,
      "loss": 0.3416,
      "step": 170
    },
    {
      "epoch": 1.1,
      "grad_norm": 1.7321648255912703,
      "learning_rate": 6.7012200518698904e-06,
      "loss": 0.3349,
      "step": 171
    },
    {
      "epoch": 1.1,
      "grad_norm": 1.5902383819866788,
      "learning_rate": 6.6948556594538185e-06,
      "loss": 0.3897,
      "step": 172
    },
    {
      "epoch": 1.11,
      "grad_norm": 1.578037443536385,
      "learning_rate": 6.688427282352449e-06,
      "loss": 0.374,
      "step": 173
    },
    {
      "epoch": 1.12,
      "grad_norm": 1.637206599992538,
      "learning_rate": 6.681935049309533e-06,
      "loss": 0.356,
      "step": 174
    },
    {
      "epoch": 1.12,
      "grad_norm": 1.6426011449900024,
      "learning_rate": 6.6753790903476814e-06,
      "loss": 0.3204,
      "step": 175
    },
    {
      "epoch": 1.13,
      "grad_norm": 1.7335754223028719,
      "learning_rate": 6.668759536765778e-06,
      "loss": 0.3447,
      "step": 176
    },
    {
      "epoch": 1.13,
      "grad_norm": 1.8716868794333945,
      "learning_rate": 6.6620765211363376e-06,
      "loss": 0.4708,
      "step": 177
    },
    {
      "epoch": 1.14,
      "grad_norm": 2.0355113763091306,
      "learning_rate": 6.655330177302857e-06,
      "loss": 0.4357,
      "step": 178
    },
    {
      "epoch": 1.15,
      "grad_norm": 1.6067153937975995,
      "learning_rate": 6.64852064037713e-06,
      "loss": 0.3237,
      "step": 179
    },
    {
      "epoch": 1.15,
      "grad_norm": 1.566156260471038,
      "learning_rate": 6.6416480467365494e-06,
      "loss": 0.3271,
      "step": 180
    },
    {
      "epoch": 1.16,
      "grad_norm": 1.415525082706651,
      "learning_rate": 6.634712534021367e-06,
      "loss": 0.3125,
      "step": 181
    },
    {
      "epoch": 1.17,
      "grad_norm": 1.5322544502916775,
      "learning_rate": 6.627714241131943e-06,
      "loss": 0.2987,
      "step": 182
    },
    {
      "epoch": 1.17,
      "grad_norm": 1.6559751396978282,
      "learning_rate": 6.62065330822596e-06,
      "loss": 0.331,
      "step": 183
    },
    {
      "epoch": 1.18,
      "grad_norm": 1.6362651282972798,
      "learning_rate": 6.613529876715619e-06,
      "loss": 0.3522,
      "step": 184
    },
    {
      "epoch": 1.19,
      "grad_norm": 1.7432266355020243,
      "learning_rate": 6.606344089264805e-06,
      "loss": 0.3721,
      "step": 185
    },
    {
      "epoch": 1.19,
      "grad_norm": 1.6595653155086751,
      "learning_rate": 6.599096089786234e-06,
      "loss": 0.3272,
      "step": 186
    },
    {
      "epoch": 1.2,
      "grad_norm": 1.7169078900169732,
      "learning_rate": 6.591786023438565e-06,
      "loss": 0.3205,
      "step": 187
    },
    {
      "epoch": 1.21,
      "grad_norm": 1.5217863220075283,
      "learning_rate": 6.5844140366234956e-06,
      "loss": 0.3007,
      "step": 188
    },
    {
      "epoch": 1.21,
      "grad_norm": 1.5386588460515103,
      "learning_rate": 6.576980276982832e-06,
      "loss": 0.2683,
      "step": 189
    },
    {
      "epoch": 1.22,
      "grad_norm": 1.5586980831749289,
      "learning_rate": 6.569484893395527e-06,
      "loss": 0.3109,
      "step": 190
    },
    {
      "epoch": 1.22,
      "grad_norm": 1.835334084851163,
      "learning_rate": 6.5619280359747045e-06,
      "loss": 0.3659,
      "step": 191
    },
    {
      "epoch": 1.23,
      "grad_norm": 1.7303049780029398,
      "learning_rate": 6.55430985606465e-06,
      "loss": 0.3169,
      "step": 192
    },
    {
      "epoch": 1.24,
      "grad_norm": 1.573272600381468,
      "learning_rate": 6.546630506237778e-06,
      "loss": 0.2737,
      "step": 193
    },
    {
      "epoch": 1.24,
      "grad_norm": 1.5800960246726725,
      "learning_rate": 6.538890140291578e-06,
      "loss": 0.2962,
      "step": 194
    },
    {
      "epoch": 1.25,
      "grad_norm": 1.5942214142753601,
      "learning_rate": 6.531088913245536e-06,
      "loss": 0.2912,
      "step": 195
    },
    {
      "epoch": 1.26,
      "grad_norm": 1.6047857651904212,
      "learning_rate": 6.5232269813380254e-06,
      "loss": 0.3033,
      "step": 196
    },
    {
      "epoch": 1.26,
      "grad_norm": 1.6641616618876198,
      "learning_rate": 6.5153045020231855e-06,
      "loss": 0.3071,
      "step": 197
    },
    {
      "epoch": 1.27,
      "grad_norm": 1.684992093107794,
      "learning_rate": 6.507321633967758e-06,
      "loss": 0.2783,
      "step": 198
    },
    {
      "epoch": 1.28,
      "grad_norm": 1.6325774194230926,
      "learning_rate": 6.499278537047919e-06,
      "loss": 0.2527,
      "step": 199
    },
    {
      "epoch": 1.28,
      "grad_norm": 1.6121399506538092,
      "learning_rate": 6.49117537234607e-06,
      "loss": 0.2459,
      "step": 200
    },
    {
      "epoch": 1.29,
      "grad_norm": 1.7019340316973153,
      "learning_rate": 6.483012302147617e-06,
      "loss": 0.2639,
      "step": 201
    },
    {
      "epoch": 1.29,
      "grad_norm": 1.5471416195498167,
      "learning_rate": 6.474789489937715e-06,
      "loss": 0.2507,
      "step": 202
    },
    {
      "epoch": 1.3,
      "grad_norm": 1.5639059610283246,
      "learning_rate": 6.4665071003979985e-06,
      "loss": 0.2227,
      "step": 203
    },
    {
      "epoch": 1.31,
      "grad_norm": 1.6480830867630272,
      "learning_rate": 6.4581652994032816e-06,
      "loss": 0.2199,
      "step": 204
    },
    {
      "epoch": 1.31,
      "grad_norm": 1.6418221792531331,
      "learning_rate": 6.449764254018236e-06,
      "loss": 0.2676,
      "step": 205
    },
    {
      "epoch": 1.32,
      "grad_norm": 1.557597222846848,
      "learning_rate": 6.441304132494045e-06,
      "loss": 0.2057,
      "step": 206
    },
    {
      "epoch": 1.33,
      "grad_norm": 1.6032321620665173,
      "learning_rate": 6.432785104265034e-06,
      "loss": 0.2325,
      "step": 207
    },
    {
      "epoch": 1.33,
      "grad_norm": 2.068457437087488,
      "learning_rate": 6.424207339945278e-06,
      "loss": 0.3075,
      "step": 208
    },
    {
      "epoch": 1.34,
      "grad_norm": 1.6387109523391752,
      "learning_rate": 6.415571011325181e-06,
      "loss": 0.2638,
      "step": 209
    },
    {
      "epoch": 1.35,
      "grad_norm": 1.6478934557021592,
      "learning_rate": 6.406876291368041e-06,
      "loss": 0.2829,
      "step": 210
    },
    {
      "epoch": 1.35,
      "grad_norm": 1.6015206620029365,
      "learning_rate": 6.3981233542065824e-06,
      "loss": 0.2542,
      "step": 211
    },
    {
      "epoch": 1.36,
      "grad_norm": 1.6246765840899604,
      "learning_rate": 6.3893123751394695e-06,
      "loss": 0.2084,
      "step": 212
    },
    {
      "epoch": 1.37,
      "grad_norm": 1.6287901735275234,
      "learning_rate": 6.380443530627797e-06,
      "loss": 0.2424,
      "step": 213
    },
    {
      "epoch": 1.37,
      "grad_norm": 1.6578723488093603,
      "learning_rate": 6.371516998291552e-06,
      "loss": 0.2458,
      "step": 214
    },
    {
      "epoch": 1.38,
      "grad_norm": 1.6660041316351297,
      "learning_rate": 6.3625329569060595e-06,
      "loss": 0.2427,
      "step": 215
    },
    {
      "epoch": 1.38,
      "grad_norm": 1.578903354700284,
      "learning_rate": 6.3534915863984045e-06,
      "loss": 0.248,
      "step": 216
    },
    {
      "epoch": 1.39,
      "grad_norm": 1.480820395340969,
      "learning_rate": 6.344393067843825e-06,
      "loss": 0.1903,
      "step": 217
    },
    {
      "epoch": 1.4,
      "grad_norm": 1.5351995236858405,
      "learning_rate": 6.335237583462083e-06,
      "loss": 0.2444,
      "step": 218
    },
    {
      "epoch": 1.4,
      "grad_norm": 1.4430941536344153,
      "learning_rate": 6.326025316613824e-06,
      "loss": 0.1888,
      "step": 219
    },
    {
      "epoch": 1.41,
      "grad_norm": 1.705699712353775,
      "learning_rate": 6.3167564517968944e-06,
      "loss": 0.2381,
      "step": 220
    },
    {
      "epoch": 1.42,
      "grad_norm": 1.5317499613408436,
      "learning_rate": 6.307431174642653e-06,
      "loss": 0.2012,
      "step": 221
    },
    {
      "epoch": 1.42,
      "grad_norm": 1.6439194933028012,
      "learning_rate": 6.2980496719122544e-06,
      "loss": 0.2213,
      "step": 222
    },
    {
      "epoch": 1.43,
      "grad_norm": 1.5931209129358592,
      "learning_rate": 6.288612131492901e-06,
      "loss": 0.2418,
      "step": 223
    },
    {
      "epoch": 1.44,
      "grad_norm": 1.6370343275461072,
      "learning_rate": 6.279118742394089e-06,
      "loss": 0.2256,
      "step": 224
    },
    {
      "epoch": 1.44,
      "grad_norm": 1.4744567429276645,
      "learning_rate": 6.2695696947438165e-06,
      "loss": 0.2009,
      "step": 225
    },
    {
      "epoch": 1.45,
      "grad_norm": 1.8325833367980933,
      "learning_rate": 6.25996517978478e-06,
      "loss": 0.2345,
      "step": 226
    },
    {
      "epoch": 1.46,
      "grad_norm": 1.6355207967953467,
      "learning_rate": 6.2503053898705416e-06,
      "loss": 0.2232,
      "step": 227
    },
    {
      "epoch": 1.46,
      "grad_norm": 1.5592250115768092,
      "learning_rate": 6.2405905184616776e-06,
      "loss": 0.2144,
      "step": 228
    },
    {
      "epoch": 1.47,
      "grad_norm": 1.5359450179179217,
      "learning_rate": 6.230820760121904e-06,
      "loss": 0.2025,
      "step": 229
    },
    {
      "epoch": 1.47,
      "grad_norm": 1.6594758906536755,
      "learning_rate": 6.220996310514181e-06,
      "loss": 0.2248,
      "step": 230
    },
    {
      "epoch": 1.48,
      "grad_norm": 1.614953425314903,
      "learning_rate": 6.21111736639679e-06,
      "loss": 0.2072,
      "step": 231
    },
    {
      "epoch": 1.49,
      "grad_norm": 1.5868322471592755,
      "learning_rate": 6.201184125619403e-06,
      "loss": 0.1954,
      "step": 232
    },
    {
      "epoch": 1.49,
      "grad_norm": 1.5846521300517467,
      "learning_rate": 6.191196787119104e-06,
      "loss": 0.1872,
      "step": 233
    },
    {
      "epoch": 1.5,
      "grad_norm": 1.5858305578529914,
      "learning_rate": 6.181155550916423e-06,
      "loss": 0.2173,
      "step": 234
    },
    {
      "epoch": 1.51,
      "grad_norm": 1.5685831511245525,
      "learning_rate": 6.171060618111317e-06,
      "loss": 0.2035,
      "step": 235
    },
    {
      "epoch": 1.51,
      "grad_norm": 1.416118500599548,
      "learning_rate": 6.160912190879146e-06,
      "loss": 0.1546,
      "step": 236
    },
    {
      "epoch": 1.52,
      "grad_norm": 1.6574554300523399,
      "learning_rate": 6.15071047246663e-06,
      "loss": 0.2404,
      "step": 237
    },
    {
      "epoch": 1.53,
      "grad_norm": 1.555316260098642,
      "learning_rate": 6.140455667187765e-06,
      "loss": 0.1578,
      "step": 238
    },
    {
      "epoch": 1.53,
      "grad_norm": 1.4540496982385553,
      "learning_rate": 6.13014798041975e-06,
      "loss": 0.1595,
      "step": 239
    },
    {
      "epoch": 1.54,
      "grad_norm": 1.6687470479114859,
      "learning_rate": 6.119787618598854e-06,
      "loss": 0.2505,
      "step": 240
    },
    {
      "epoch": 1.54,
      "grad_norm": 1.5334245760102851,
      "learning_rate": 6.109374789216296e-06,
      "loss": 0.1838,
      "step": 241
    },
    {
      "epoch": 1.55,
      "grad_norm": 1.542810949224468,
      "learning_rate": 6.098909700814082e-06,
      "loss": 0.241,
      "step": 242
    },
    {
      "epoch": 1.56,
      "grad_norm": 1.72874768735861,
      "learning_rate": 6.08839256298083e-06,
      "loss": 0.2054,
      "step": 243
    },
    {
      "epoch": 1.56,
      "grad_norm": 1.5566017473024847,
      "learning_rate": 6.077823586347579e-06,
      "loss": 0.1827,
      "step": 244
    },
    {
      "epoch": 1.57,
      "grad_norm": 1.5454330143421287,
      "learning_rate": 6.06720298258356e-06,
      "loss": 0.1951,
      "step": 245
    },
    {
      "epoch": 1.58,
      "grad_norm": 1.5699376808386447,
      "learning_rate": 6.056530964391961e-06,
      "loss": 0.217,
      "step": 246
    },
    {
      "epoch": 1.58,
      "grad_norm": 1.4790843551806858,
      "learning_rate": 6.0458077455056704e-06,
      "loss": 0.2034,
      "step": 247
    },
    {
      "epoch": 1.59,
      "grad_norm": 1.5292597992204815,
      "learning_rate": 6.035033540682993e-06,
      "loss": 0.1917,
      "step": 248
    },
    {
      "epoch": 1.6,
      "grad_norm": 1.5536070824489197,
      "learning_rate": 6.024208565703351e-06,
      "loss": 0.2102,
      "step": 249
    },
    {
      "epoch": 1.6,
      "grad_norm": 1.519765331872237,
      "learning_rate": 6.013333037362959e-06,
      "loss": 0.1775,
      "step": 250
    },
    {
      "epoch": 1.61,
      "grad_norm": 1.664352545830077,
      "learning_rate": 6.002407173470486e-06,
      "loss": 0.2252,
      "step": 251
    },
    {
      "epoch": 1.62,
      "grad_norm": 1.6289900634763281,
      "learning_rate": 5.991431192842692e-06,
      "loss": 0.1929,
      "step": 252
    },
    {
      "epoch": 1.62,
      "grad_norm": 1.650116318598881,
      "learning_rate": 5.980405315300045e-06,
      "loss": 0.2363,
      "step": 253
    },
    {
      "epoch": 1.63,
      "grad_norm": 1.55668794790212,
      "learning_rate": 5.969329761662319e-06,
      "loss": 0.19,
      "step": 254
    },
    {
      "epoch": 1.63,
      "grad_norm": 1.6677234649563633,
      "learning_rate": 5.9582047537441716e-06,
      "loss": 0.2332,
      "step": 255
    },
    {
      "epoch": 1.64,
      "grad_norm": 1.5661036857828248,
      "learning_rate": 5.9470305143507e-06,
      "loss": 0.2237,
      "step": 256
    },
    {
      "epoch": 1.65,
      "grad_norm": 1.4888010043108841,
      "learning_rate": 5.9358072672729845e-06,
      "loss": 0.2087,
      "step": 257
    },
    {
      "epoch": 1.65,
      "grad_norm": 1.5244864099257356,
      "learning_rate": 5.924535237283598e-06,
      "loss": 0.1658,
      "step": 258
    },
    {
      "epoch": 1.66,
      "grad_norm": 1.6137026161987666,
      "learning_rate": 5.913214650132112e-06,
      "loss": 0.1901,
      "step": 259
    },
    {
      "epoch": 1.67,
      "grad_norm": 1.6772522686974038,
      "learning_rate": 5.901845732540568e-06,
      "loss": 0.258,
      "step": 260
    },
    {
      "epoch": 1.67,
      "grad_norm": 1.5097022747012565,
      "learning_rate": 5.8904287121989455e-06,
      "loss": 0.1804,
      "step": 261
    },
    {
      "epoch": 1.68,
      "grad_norm": 1.695764383503543,
      "learning_rate": 5.878963817760597e-06,
      "loss": 0.2051,
      "step": 262
    },
    {
      "epoch": 1.69,
      "grad_norm": 1.6150072622733846,
      "learning_rate": 5.867451278837666e-06,
      "loss": 0.1778,
      "step": 263
    },
    {
      "epoch": 1.69,
      "grad_norm": 1.4156360311668714,
      "learning_rate": 5.855891325996495e-06,
      "loss": 0.1941,
      "step": 264
    },
    {
      "epoch": 1.7,
      "grad_norm": 1.626327498057916,
      "learning_rate": 5.8442841907530035e-06,
      "loss": 0.2307,
      "step": 265
    },
    {
      "epoch": 1.71,
      "grad_norm": 2.0421635487814385,
      "learning_rate": 5.83263010556805e-06,
      "loss": 0.2468,
      "step": 266
    },
    {
      "epoch": 1.71,
      "grad_norm": 1.670954185746529,
      "learning_rate": 5.820929303842783e-06,
      "loss": 0.2244,
      "step": 267
    },
    {
      "epoch": 1.72,
      "grad_norm": 1.6749192214251005,
      "learning_rate": 5.809182019913959e-06,
      "loss": 0.2079,
      "step": 268
    },
    {
      "epoch": 1.72,
      "grad_norm": 1.567908091990771,
      "learning_rate": 5.797388489049253e-06,
      "loss": 0.2012,
      "step": 269
    },
    {
      "epoch": 1.73,
      "grad_norm": 1.558652423776024,
      "learning_rate": 5.785548947442547e-06,
      "loss": 0.2136,
      "step": 270
    },
    {
      "epoch": 1.74,
      "grad_norm": 1.5035278794955325,
      "learning_rate": 5.7736636322092016e-06,
      "loss": 0.1752,
      "step": 271
    },
    {
      "epoch": 1.74,
      "grad_norm": 1.6085107962690712,
      "learning_rate": 5.7617327813813e-06,
      "loss": 0.1841,
      "step": 272
    },
    {
      "epoch": 1.75,
      "grad_norm": 1.5748033314990677,
      "learning_rate": 5.749756633902887e-06,
      "loss": 0.1564,
      "step": 273
    },
    {
      "epoch": 1.76,
      "grad_norm": 1.4911389890937712,
      "learning_rate": 5.7377354296251855e-06,
      "loss": 0.1852,
      "step": 274
    },
    {
      "epoch": 1.76,
      "grad_norm": 1.6116400263763508,
      "learning_rate": 5.725669409301782e-06,
      "loss": 0.1648,
      "step": 275
    },
    {
      "epoch": 1.77,
      "grad_norm": 1.6655723963052835,
      "learning_rate": 5.71355881458382e-06,
      "loss": 0.2038,
      "step": 276
    },
    {
      "epoch": 1.78,
      "grad_norm": 1.6292883296504315,
      "learning_rate": 5.701403888015149e-06,
      "loss": 0.2151,
      "step": 277
    },
    {
      "epoch": 1.78,
      "grad_norm": 1.7652594110330915,
      "learning_rate": 5.689204873027471e-06,
      "loss": 0.2306,
      "step": 278
    },
    {
      "epoch": 1.79,
      "grad_norm": 1.5674611629004112,
      "learning_rate": 5.676962013935464e-06,
      "loss": 0.1986,
      "step": 279
    },
    {
      "epoch": 1.79,
      "grad_norm": 1.5432658298998765,
      "learning_rate": 5.664675555931892e-06,
      "loss": 0.1961,
      "step": 280
    },
    {
      "epoch": 1.8,
      "grad_norm": 1.7003413063407276,
      "learning_rate": 5.652345745082691e-06,
      "loss": 0.1903,
      "step": 281
    },
    {
      "epoch": 1.81,
      "grad_norm": 1.7383657441416522,
      "learning_rate": 5.639972828322043e-06,
      "loss": 0.2251,
      "step": 282
    },
    {
      "epoch": 1.81,
      "grad_norm": 1.6827488344063495,
      "learning_rate": 5.627557053447427e-06,
      "loss": 0.1928,
      "step": 283
    },
    {
      "epoch": 1.82,
      "grad_norm": 1.516962697761087,
      "learning_rate": 5.615098669114664e-06,
      "loss": 0.1969,
      "step": 284
    },
    {
      "epoch": 1.83,
      "grad_norm": 1.5820218859618687,
      "learning_rate": 5.6025979248329265e-06,
      "loss": 0.2507,
      "step": 285
    },
    {
      "epoch": 1.83,
      "grad_norm": 1.4664214189683091,
      "learning_rate": 5.590055070959752e-06,
      "loss": 0.1823,
      "step": 286
    },
    {
      "epoch": 1.84,
      "grad_norm": 1.6412853370022484,
      "learning_rate": 5.577470358696021e-06,
      "loss": 0.2569,
      "step": 287
    },
    {
      "epoch": 1.85,
      "grad_norm": 1.5791518887916571,
      "learning_rate": 5.564844040080931e-06,
      "loss": 0.2248,
      "step": 288
    },
    {
      "epoch": 1.85,
      "grad_norm": 1.5129411232044963,
      "learning_rate": 5.5521763679869445e-06,
      "loss": 0.2014,
      "step": 289
    },
    {
      "epoch": 1.86,
      "grad_norm": 1.6335198834860136,
      "learning_rate": 5.53946759611473e-06,
      "loss": 0.1881,
      "step": 290
    },
    {
      "epoch": 1.87,
      "grad_norm": 1.4274726284837593,
      "learning_rate": 5.526717978988076e-06,
      "loss": 0.2049,
      "step": 291
    },
    {
      "epoch": 1.87,
      "grad_norm": 1.766839389365021,
      "learning_rate": 5.513927771948798e-06,
      "loss": 0.2144,
      "step": 292
    },
    {
      "epoch": 1.88,
      "grad_norm": 1.555081227625382,
      "learning_rate": 5.5010972311516184e-06,
      "loss": 0.1938,
      "step": 293
    },
    {
      "epoch": 1.88,
      "grad_norm": 1.657103095692518,
      "learning_rate": 5.488226613559045e-06,
      "loss": 0.2392,
      "step": 294
    },
    {
      "epoch": 1.89,
      "grad_norm": 1.4598629682187232,
      "learning_rate": 5.475316176936217e-06,
      "loss": 0.2095,
      "step": 295
    },
    {
      "epoch": 1.9,
      "grad_norm": 1.5819596345489928,
      "learning_rate": 5.462366179845746e-06,
      "loss": 0.2112,
      "step": 296
    },
    {
      "epoch": 1.9,
      "grad_norm": 1.6175196309898452,
      "learning_rate": 5.449376881642537e-06,
      "loss": 0.1802,
      "step": 297
    },
    {
      "epoch": 1.91,
      "grad_norm": 1.5465731746504927,
      "learning_rate": 5.436348542468598e-06,
      "loss": 0.1841,
      "step": 298
    },
    {
      "epoch": 1.92,
      "grad_norm": 1.572217945594608,
      "learning_rate": 5.423281423247821e-06,
      "loss": 0.1845,
      "step": 299
    },
    {
      "epoch": 1.92,
      "grad_norm": 1.4536353215576343,
      "learning_rate": 5.4101757856807655e-06,
      "loss": 0.1327,
      "step": 300
    },
    {
      "epoch": 1.93,
      "grad_norm": 1.4814309250755353,
      "learning_rate": 5.397031892239414e-06,
      "loss": 0.1659,
      "step": 301
    },
    {
      "epoch": 1.94,
      "grad_norm": 1.7529052634887694,
      "learning_rate": 5.383850006161913e-06,
      "loss": 0.1938,
      "step": 302
    },
    {
      "epoch": 1.94,
      "grad_norm": 1.6512796709873832,
      "learning_rate": 5.370630391447303e-06,
      "loss": 0.1961,
      "step": 303
    },
    {
      "epoch": 1.95,
      "grad_norm": 1.7490980362206077,
      "learning_rate": 5.357373312850236e-06,
      "loss": 0.2206,
      "step": 304
    },
    {
      "epoch": 1.96,
      "grad_norm": 1.5498728220026698,
      "learning_rate": 5.3440790358756615e-06,
      "loss": 0.2085,
      "step": 305
    },
    {
      "epoch": 1.96,
      "grad_norm": 1.7873875096307177,
      "learning_rate": 5.330747826773522e-06,
      "loss": 0.2229,
      "step": 306
    },
    {
      "epoch": 1.97,
      "grad_norm": 1.6231900940290174,
      "learning_rate": 5.317379952533411e-06,
      "loss": 0.2133,
      "step": 307
    },
    {
      "epoch": 1.97,
      "grad_norm": 1.6076562253029956,
      "learning_rate": 5.303975680879232e-06,
      "loss": 0.189,
      "step": 308
    },
    {
      "epoch": 1.98,
      "grad_norm": 1.5127508023725418,
      "learning_rate": 5.290535280263835e-06,
      "loss": 0.179,
      "step": 309
    },
    {
      "epoch": 1.99,
      "grad_norm": 1.5724862709076817,
      "learning_rate": 5.277059019863637e-06,
      "loss": 0.1939,
      "step": 310
    },
    {
      "epoch": 1.99,
      "grad_norm": 1.4830237205662076,
      "learning_rate": 5.263547169573235e-06,
      "loss": 0.1816,
      "step": 311
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.5159324470099858,
      "learning_rate": 5.25e-06,
      "loss": 0.1886,
      "step": 312
    },
    {
      "epoch": 2.01,
      "grad_norm": 1.555949366187299,
      "learning_rate": 5.236417782458656e-06,
      "loss": 0.1648,
      "step": 313
    },
    {
      "epoch": 2.01,
      "grad_norm": 1.4779915210345689,
      "learning_rate": 5.222800788965847e-06,
      "loss": 0.1949,
      "step": 314
    },
    {
      "epoch": 2.02,
      "grad_norm": 1.3981863685885323,
      "learning_rate": 5.2091492922346894e-06,
      "loss": 0.201,
      "step": 315
    },
    {
      "epoch": 2.03,
      "grad_norm": 1.4384702865295196,
      "learning_rate": 5.195463565669309e-06,
      "loss": 0.1997,
      "step": 316
    },
    {
      "epoch": 2.03,
      "grad_norm": 1.417972528504501,
      "learning_rate": 5.18174388335937e-06,
      "loss": 0.1696,
      "step": 317
    },
    {
      "epoch": 2.04,
      "grad_norm": 1.42232835008326,
      "learning_rate": 5.167990520074577e-06,
      "loss": 0.1399,
      "step": 318
    },
    {
      "epoch": 2.04,
      "grad_norm": 1.4639848529175286,
      "learning_rate": 5.154203751259183e-06,
      "loss": 0.1462,
      "step": 319
    },
    {
      "epoch": 2.05,
      "grad_norm": 1.5626053893272325,
      "learning_rate": 5.140383853026463e-06,
      "loss": 0.1969,
      "step": 320
    },
    {
      "epoch": 2.06,
      "grad_norm": 1.4260156641561081,
      "learning_rate": 5.12653110215319e-06,
      "loss": 0.1604,
      "step": 321
    },
    {
      "epoch": 2.06,
      "grad_norm": 1.4958379917355866,
      "learning_rate": 5.11264577607409e-06,
      "loss": 0.1677,
      "step": 322
    },
    {
      "epoch": 2.07,
      "grad_norm": 1.5033492694128057,
      "learning_rate": 5.098728152876287e-06,
      "loss": 0.1747,
      "step": 323
    },
    {
      "epoch": 2.08,
      "grad_norm": 1.603601503409923,
      "learning_rate": 5.084778511293731e-06,
      "loss": 0.1426,
      "step": 324
    },
    {
      "epoch": 2.08,
      "grad_norm": 1.4112657690685506,
      "learning_rate": 5.070797130701618e-06,
      "loss": 0.1251,
      "step": 325
    },
    {
      "epoch": 2.09,
      "grad_norm": 1.5494589174434166,
      "learning_rate": 5.056784291110794e-06,
      "loss": 0.1271,
      "step": 326
    },
    {
      "epoch": 2.1,
      "grad_norm": 1.4343244322587299,
      "learning_rate": 5.04274027316215e-06,
      "loss": 0.1072,
      "step": 327
    },
    {
      "epoch": 2.1,
      "grad_norm": 1.5882705358537534,
      "learning_rate": 5.028665358120995e-06,
      "loss": 0.1503,
      "step": 328
    },
    {
      "epoch": 2.11,
      "grad_norm": 1.5363658563678113,
      "learning_rate": 5.014559827871426e-06,
      "loss": 0.1303,
      "step": 329
    },
    {
      "epoch": 2.12,
      "grad_norm": 1.5525555773630861,
      "learning_rate": 5.00042396491069e-06,
      "loss": 0.1231,
      "step": 330
    },
    {
      "epoch": 2.12,
      "grad_norm": 1.5224615240989583,
      "learning_rate": 4.9862580523435116e-06,
      "loss": 0.0949,
      "step": 331
    },
    {
      "epoch": 2.13,
      "grad_norm": 1.630845815321759,
      "learning_rate": 4.972062373876435e-06,
      "loss": 0.0923,
      "step": 332
    },
    {
      "epoch": 2.13,
      "grad_norm": 1.7508423838612863,
      "learning_rate": 4.95783721381214e-06,
      "loss": 0.1557,
      "step": 333
    },
    {
      "epoch": 2.14,
      "grad_norm": 1.9071964194859563,
      "learning_rate": 4.943582857043742e-06,
      "loss": 0.1315,
      "step": 334
    },
    {
      "epoch": 2.15,
      "grad_norm": 1.605736114180956,
      "learning_rate": 4.9292995890490945e-06,
      "loss": 0.1017,
      "step": 335
    },
    {
      "epoch": 2.15,
      "grad_norm": 1.5908541224269843,
      "learning_rate": 4.914987695885067e-06,
      "loss": 0.1097,
      "step": 336
    },
    {
      "epoch": 2.16,
      "grad_norm": 1.4636955088177965,
      "learning_rate": 4.900647464181817e-06,
      "loss": 0.1237,
      "step": 337
    },
    {
      "epoch": 2.17,
      "grad_norm": 1.4495226641748151,
      "learning_rate": 4.886279181137049e-06,
      "loss": 0.0968,
      "step": 338
    },
    {
      "epoch": 2.17,
      "grad_norm": 1.4675886550946595,
      "learning_rate": 4.871883134510263e-06,
      "loss": 0.1011,
      "step": 339
    },
    {
      "epoch": 2.18,
      "grad_norm": 1.6564244575191527,
      "learning_rate": 4.8574596126169925e-06,
      "loss": 0.1273,
      "step": 340
    },
    {
      "epoch": 2.19,
      "grad_norm": 1.533227037458805,
      "learning_rate": 4.843008904323029e-06,
      "loss": 0.1228,
      "step": 341
    },
    {
      "epoch": 2.19,
      "grad_norm": 1.4820663768576567,
      "learning_rate": 4.828531299038638e-06,
      "loss": 0.1099,
      "step": 342
    },
    {
      "epoch": 2.2,
      "grad_norm": 1.6027220866794623,
      "learning_rate": 4.81402708671276e-06,
      "loss": 0.0969,
      "step": 343
    },
    {
      "epoch": 2.21,
      "grad_norm": 1.5893012168299308,
      "learning_rate": 4.799496557827208e-06,
      "loss": 0.1102,
      "step": 344
    },
    {
      "epoch": 2.21,
      "grad_norm": 1.5207909320236375,
      "learning_rate": 4.7849400033908465e-06,
      "loss": 0.1002,
      "step": 345
    },
    {
      "epoch": 2.22,
      "grad_norm": 1.4497038100819872,
      "learning_rate": 4.770357714933765e-06,
      "loss": 0.1122,
      "step": 346
    },
    {
      "epoch": 2.22,
      "grad_norm": 1.5568347840805732,
      "learning_rate": 4.755749984501437e-06,
      "loss": 0.1132,
      "step": 347
    },
    {
      "epoch": 2.23,
      "grad_norm": 1.4518839890957804,
      "learning_rate": 4.741117104648874e-06,
      "loss": 0.1007,
      "step": 348
    },
    {
      "epoch": 2.24,
      "grad_norm": 1.3028020064942243,
      "learning_rate": 4.726459368434768e-06,
      "loss": 0.0843,
      "step": 349
    },
    {
      "epoch": 2.24,
      "grad_norm": 1.297097957128824,
      "learning_rate": 4.711777069415615e-06,
      "loss": 0.0967,
      "step": 350
    },
    {
      "epoch": 2.25,
      "grad_norm": 1.4228303278540766,
      "learning_rate": 4.697070501639841e-06,
      "loss": 0.089,
      "step": 351
    },
    {
      "epoch": 2.26,
      "grad_norm": 1.3898888054417786,
      "learning_rate": 4.682339959641915e-06,
      "loss": 0.0903,
      "step": 352
    },
    {
      "epoch": 2.26,
      "grad_norm": 1.5233221859336192,
      "learning_rate": 4.667585738436448e-06,
      "loss": 0.0964,
      "step": 353
    },
    {
      "epoch": 2.27,
      "grad_norm": 1.5295654378299237,
      "learning_rate": 4.652808133512279e-06,
      "loss": 0.0848,
      "step": 354
    },
    {
      "epoch": 2.28,
      "grad_norm": 1.6437228500672438,
      "learning_rate": 4.638007440826568e-06,
      "loss": 0.0804,
      "step": 355
    },
    {
      "epoch": 2.28,
      "grad_norm": 1.5383689907196065,
      "learning_rate": 4.62318395679886e-06,
      "loss": 0.0709,
      "step": 356
    },
    {
      "epoch": 2.29,
      "grad_norm": 1.648388694748838,
      "learning_rate": 4.6083379783051545e-06,
      "loss": 0.0858,
      "step": 357
    },
    {
      "epoch": 2.29,
      "grad_norm": 1.4723706715778246,
      "learning_rate": 4.593469802671951e-06,
      "loss": 0.077,
      "step": 358
    },
    {
      "epoch": 2.3,
      "grad_norm": 1.3850116104824282,
      "learning_rate": 4.5785797276703075e-06,
      "loss": 0.0559,
      "step": 359
    },
    {
      "epoch": 2.31,
      "grad_norm": 1.5055414247850987,
      "learning_rate": 4.563668051509864e-06,
      "loss": 0.058,
      "step": 360
    },
    {
      "epoch": 2.31,
      "grad_norm": 1.5049490521822901,
      "learning_rate": 4.548735072832879e-06,
      "loss": 0.0809,
      "step": 361
    },
    {
      "epoch": 2.32,
      "grad_norm": 1.4324576412777454,
      "learning_rate": 4.533781090708244e-06,
      "loss": 0.0639,
      "step": 362
    },
    {
      "epoch": 2.33,
      "grad_norm": 1.3643187351188586,
      "learning_rate": 4.518806404625495e-06,
      "loss": 0.0694,
      "step": 363
    },
    {
      "epoch": 2.33,
      "grad_norm": 1.6866178990377743,
      "learning_rate": 4.503811314488816e-06,
      "loss": 0.0985,
      "step": 364
    },
    {
      "epoch": 2.34,
      "grad_norm": 1.4991148902916245,
      "learning_rate": 4.48879612061103e-06,
      "loss": 0.079,
      "step": 365
    },
    {
      "epoch": 2.35,
      "grad_norm": 1.4720035797123598,
      "learning_rate": 4.473761123707584e-06,
      "loss": 0.0921,
      "step": 366
    },
    {
      "epoch": 2.35,
      "grad_norm": 1.44832360985762,
      "learning_rate": 4.458706624890534e-06,
      "loss": 0.0786,
      "step": 367
    },
    {
      "epoch": 2.36,
      "grad_norm": 1.4275898527484465,
      "learning_rate": 4.443632925662504e-06,
      "loss": 0.0626,
      "step": 368
    },
    {
      "epoch": 2.37,
      "grad_norm": 1.5167475621388269,
      "learning_rate": 4.428540327910652e-06,
      "loss": 0.0723,
      "step": 369
    },
    {
      "epoch": 2.37,
      "grad_norm": 1.4931710488796675,
      "learning_rate": 4.41342913390063e-06,
      "loss": 0.0789,
      "step": 370
    },
    {
      "epoch": 2.38,
      "grad_norm": 1.6494646753660156,
      "learning_rate": 4.398299646270518e-06,
      "loss": 0.0779,
      "step": 371
    },
    {
      "epoch": 2.38,
      "grad_norm": 1.4632009250956346,
      "learning_rate": 4.3831521680247765e-06,
      "loss": 0.0874,
      "step": 372
    },
    {
      "epoch": 2.39,
      "grad_norm": 1.3624276184155342,
      "learning_rate": 4.3679870025281644e-06,
      "loss": 0.0595,
      "step": 373
    },
    {
      "epoch": 2.4,
      "grad_norm": 1.4387226841227507,
      "learning_rate": 4.352804453499677e-06,
      "loss": 0.0799,
      "step": 374
    },
    {
      "epoch": 2.4,
      "grad_norm": 1.4191511182239602,
      "learning_rate": 4.3376048250064525e-06,
      "loss": 0.0676,
      "step": 375
    },
    {
      "epoch": 2.41,
      "grad_norm": 1.5148194681863063,
      "learning_rate": 4.322388421457687e-06,
      "loss": 0.0804,
      "step": 376
    },
    {
      "epoch": 2.42,
      "grad_norm": 1.4992240453279386,
      "learning_rate": 4.30715554759854e-06,
      "loss": 0.0663,
      "step": 377
    },
    {
      "epoch": 2.42,
      "grad_norm": 1.4940181433918933,
      "learning_rate": 4.2919065085040285e-06,
      "loss": 0.0729,
      "step": 378
    },
    {
      "epoch": 2.43,
      "grad_norm": 1.4292506822415902,
      "learning_rate": 4.276641609572911e-06,
      "loss": 0.077,
      "step": 379
    },
    {
      "epoch": 2.44,
      "grad_norm": 1.3880559130653471,
      "learning_rate": 4.261361156521586e-06,
      "loss": 0.0621,
      "step": 380
    },
    {
      "epoch": 2.44,
      "grad_norm": 1.3224698912209505,
      "learning_rate": 4.246065455377956e-06,
      "loss": 0.0664,
      "step": 381
    },
    {
      "epoch": 2.45,
      "grad_norm": 1.6220854664403273,
      "learning_rate": 4.230754812475306e-06,
      "loss": 0.075,
      "step": 382
    },
    {
      "epoch": 2.46,
      "grad_norm": 1.3904399822037876,
      "learning_rate": 4.215429534446161e-06,
      "loss": 0.068,
      "step": 383
    },
    {
      "epoch": 2.46,
      "grad_norm": 1.394224095253479,
      "learning_rate": 4.200089928216156e-06,
      "loss": 0.0694,
      "step": 384
    },
    {
      "epoch": 2.47,
      "grad_norm": 1.574441933806699,
      "learning_rate": 4.1847363009978776e-06,
      "loss": 0.0682,
      "step": 385
    },
    {
      "epoch": 2.47,
      "grad_norm": 1.5729152887669275,
      "learning_rate": 4.169368960284718e-06,
      "loss": 0.0737,
      "step": 386
    },
    {
      "epoch": 2.48,
      "grad_norm": 1.4927966019088243,
      "learning_rate": 4.153988213844717e-06,
      "loss": 0.0654,
      "step": 387
    },
    {
      "epoch": 2.49,
      "grad_norm": 1.450195896852466,
      "learning_rate": 4.138594369714394e-06,
      "loss": 0.0642,
      "step": 388
    },
    {
      "epoch": 2.49,
      "grad_norm": 1.490138365122018,
      "learning_rate": 4.123187736192583e-06,
      "loss": 0.0606,
      "step": 389
    },
    {
      "epoch": 2.5,
      "grad_norm": 1.4999776523444637,
      "learning_rate": 4.107768621834257e-06,
      "loss": 0.0649,
      "step": 390
    },
    {
      "epoch": 2.51,
      "grad_norm": 1.4426594028410584,
      "learning_rate": 4.092337335444343e-06,
      "loss": 0.059,
      "step": 391
    },
    {
      "epoch": 2.51,
      "grad_norm": 1.2377723587830607,
      "learning_rate": 4.076894186071548e-06,
      "loss": 0.0486,
      "step": 392
    },
    {
      "epoch": 2.52,
      "grad_norm": 1.6147108783025788,
      "learning_rate": 4.061439483002161e-06,
      "loss": 0.0776,
      "step": 393
    },
    {
      "epoch": 2.53,
      "grad_norm": 1.4665221548831489,
      "learning_rate": 4.045973535753863e-06,
      "loss": 0.0485,
      "step": 394
    },
    {
      "epoch": 2.53,
      "grad_norm": 1.3979103729080165,
      "learning_rate": 4.030496654069524e-06,
      "loss": 0.0534,
      "step": 395
    },
    {
      "epoch": 2.54,
      "grad_norm": 1.5024378161971237,
      "learning_rate": 4.015009147911007e-06,
      "loss": 0.0795,
      "step": 396
    },
    {
      "epoch": 2.54,
      "grad_norm": 1.503457250091824,
      "learning_rate": 3.9995113274529506e-06,
      "loss": 0.0589,
      "step": 397
    },
    {
      "epoch": 2.55,
      "grad_norm": 1.3018136990746167,
      "learning_rate": 3.984003503076566e-06,
      "loss": 0.0751,
      "step": 398
    },
    {
      "epoch": 2.56,
      "grad_norm": 1.439789169382561,
      "learning_rate": 3.968485985363416e-06,
      "loss": 0.0671,
      "step": 399
    },
    {
      "epoch": 2.56,
      "grad_norm": 1.2600931675117872,
      "learning_rate": 3.952959085089193e-06,
      "loss": 0.0481,
      "step": 400
    },
    {
      "epoch": 2.57,
      "grad_norm": 1.3902574629104398,
      "learning_rate": 3.937423113217505e-06,
      "loss": 0.0605,
      "step": 401
    },
    {
      "epoch": 2.58,
      "grad_norm": 1.3343472976969517,
      "learning_rate": 3.92187838089363e-06,
      "loss": 0.0709,
      "step": 402
    },
    {
      "epoch": 2.58,
      "grad_norm": 1.260320342311725,
      "learning_rate": 3.9063251994383055e-06,
      "loss": 0.0646,
      "step": 403
    },
    {
      "epoch": 2.59,
      "grad_norm": 1.3456955333019955,
      "learning_rate": 3.8907638803414774e-06,
      "loss": 0.063,
      "step": 404
    },
    {
      "epoch": 2.6,
      "grad_norm": 1.3089048629072635,
      "learning_rate": 3.875194735256067e-06,
      "loss": 0.0663,
      "step": 405
    },
    {
      "epoch": 2.6,
      "grad_norm": 1.458165867482108,
      "learning_rate": 3.859618075991735e-06,
      "loss": 0.0592,
      "step": 406
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 1.549636534322225, |
|
"learning_rate": 3.844034214508625e-06, |
|
"loss": 0.0773, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 1.4213891673540038, |
|
"learning_rate": 3.828443462911128e-06, |
|
"loss": 0.0628, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 1.4478063303906572, |
|
"learning_rate": 3.8128461334416223e-06, |
|
"loss": 0.0746, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 1.371728736392384, |
|
"learning_rate": 3.7972425384742264e-06, |
|
"loss": 0.0592, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 1.550899976232104, |
|
"learning_rate": 3.781632990508541e-06, |
|
"loss": 0.0771, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 1.4139524918953215, |
|
"learning_rate": 3.766017802163386e-06, |
|
"loss": 0.0687, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 1.3375701401404378, |
|
"learning_rate": 3.7503972861705478e-06, |
|
"loss": 0.0699, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 1.272959839235587, |
|
"learning_rate": 3.7347717553685084e-06, |
|
"loss": 0.0469, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 1.423066310160438, |
|
"learning_rate": 3.7191415226961867e-06, |
|
"loss": 0.0557, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 1.481943036415743, |
|
"learning_rate": 3.703506901186665e-06, |
|
"loss": 0.0861, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 1.282314760769859, |
|
"learning_rate": 3.6878682039609253e-06, |
|
"loss": 0.0473, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 1.3737402610119698, |
|
"learning_rate": 3.6722257442215736e-06, |
|
"loss": 0.063, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 1.297558587148971, |
|
"learning_rate": 3.6565798352465697e-06, |
|
"loss": 0.0479, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 1.2272283470980672, |
|
"learning_rate": 3.640930790382953e-06, |
|
"loss": 0.0614, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 1.396522035608687, |
|
"learning_rate": 3.625278923040567e-06, |
|
"loss": 0.0757, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 1.4743070354454288, |
|
"learning_rate": 3.6096245466857808e-06, |
|
"loss": 0.0835, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 1.462951272372007, |
|
"learning_rate": 3.5939679748352146e-06, |
|
"loss": 0.0771, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 1.6323533328578446, |
|
"learning_rate": 3.578309521049456e-06, |
|
"loss": 0.0658, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 1.4279696560264612, |
|
"learning_rate": 3.562649498926785e-06, |
|
"loss": 0.0603, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 1.5206763046785612, |
|
"learning_rate": 3.546988222096891e-06, |
|
"loss": 0.0688, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 1.5407667167480086, |
|
"learning_rate": 3.531326004214592e-06, |
|
"loss": 0.055, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 1.6834449307372186, |
|
"learning_rate": 3.515663158953552e-06, |
|
"loss": 0.0649, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 1.46259189275902, |
|
"learning_rate": 3.5e-06, |
|
"loss": 0.0509, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 1.3866087835219674, |
|
"learning_rate": 3.484336841046448e-06, |
|
"loss": 0.0618, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 1.3649733562457513, |
|
"learning_rate": 3.468673995785409e-06, |
|
"loss": 0.0479, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 1.389982768703036, |
|
"learning_rate": 3.4530117779031096e-06, |
|
"loss": 0.0623, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 1.324763468165176, |
|
"learning_rate": 3.4373505010732152e-06, |
|
"loss": 0.0654, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 1.4641723601034753, |
|
"learning_rate": 3.4216904789505444e-06, |
|
"loss": 0.0628, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 1.2958642357583487, |
|
"learning_rate": 3.4060320251647866e-06, |
|
"loss": 0.0621, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 1.2985540091619692, |
|
"learning_rate": 3.3903754533142195e-06, |
|
"loss": 0.0632, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 1.5315659150226084, |
|
"learning_rate": 3.374721076959433e-06, |
|
"loss": 0.0577, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 1.5259417180562567, |
|
"learning_rate": 3.359069209617048e-06, |
|
"loss": 0.0714, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 1.3901162153354532, |
|
"learning_rate": 3.3434201647534306e-06, |
|
"loss": 0.0552, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 1.3074942111445218, |
|
"learning_rate": 3.3277742557784263e-06, |
|
"loss": 0.0597, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 1.4421771168227446, |
|
"learning_rate": 3.312131796039074e-06, |
|
"loss": 0.0888, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 1.217310142638148, |
|
"learning_rate": 3.296493098813335e-06, |
|
"loss": 0.0533, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 1.541068920563886, |
|
"learning_rate": 3.280858477303813e-06, |
|
"loss": 0.0866, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 1.469329265275979, |
|
"learning_rate": 3.265228244631491e-06, |
|
"loss": 0.0746, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 1.3663935959813323, |
|
"learning_rate": 3.2496027138294534e-06, |
|
"loss": 0.062, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 1.4233076114764764, |
|
"learning_rate": 3.2339821978366144e-06, |
|
"loss": 0.0549, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 1.4125981717018385, |
|
"learning_rate": 3.2183670094914596e-06, |
|
"loss": 0.0785, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 1.2443476277494099, |
|
"learning_rate": 3.2027574615257726e-06, |
|
"loss": 0.0594, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 1.3649462287414904, |
|
"learning_rate": 3.1871538665583784e-06, |
|
"loss": 0.0643, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 1.4442414248699849, |
|
"learning_rate": 3.171556537088873e-06, |
|
"loss": 0.0838, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 1.3162685413703437, |
|
"learning_rate": 3.155965785491375e-06, |
|
"loss": 0.0748, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 1.3094037910120564, |
|
"learning_rate": 3.140381924008266e-06, |
|
"loss": 0.0634, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 1.430140211858692, |
|
"learning_rate": 3.1248052647439327e-06, |
|
"loss": 0.0585, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 1.3430910234317202, |
|
"learning_rate": 3.109236119658523e-06, |
|
"loss": 0.0545, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 1.32857879641887, |
|
"learning_rate": 3.0936748005616936e-06, |
|
"loss": 0.0548, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 1.2304110525061114, |
|
"learning_rate": 3.0781216191063695e-06, |
|
"loss": 0.0367, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 1.2175987934274146, |
|
"learning_rate": 3.0625768867824957e-06, |
|
"loss": 0.0481, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 1.4251000510540908, |
|
"learning_rate": 3.047040914910806e-06, |
|
"loss": 0.0607, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 1.2232338433186838, |
|
"learning_rate": 3.0315140146365854e-06, |
|
"loss": 0.0508, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 1.8361121743484377, |
|
"learning_rate": 3.015996496923435e-06, |
|
"loss": 0.101, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 1.4585021415736545, |
|
"learning_rate": 3.00048867254705e-06, |
|
"loss": 0.0739, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 1.5678316587634316, |
|
"learning_rate": 2.9849908520889936e-06, |
|
"loss": 0.0732, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 1.5229061347267796, |
|
"learning_rate": 2.9695033459304766e-06, |
|
"loss": 0.0728, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 1.452042874894508, |
|
"learning_rate": 2.954026464246138e-06, |
|
"loss": 0.0566, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 1.4256233554820992, |
|
"learning_rate": 2.9385605169978387e-06, |
|
"loss": 0.0527, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 1.3611842237197957, |
|
"learning_rate": 2.923105813928453e-06, |
|
"loss": 0.0513, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 1.2023007795378646, |
|
"learning_rate": 2.907662664555658e-06, |
|
"loss": 0.0481, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 1.5067980607363667, |
|
"learning_rate": 2.8922313781657437e-06, |
|
"loss": 0.0554, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 1.1875974890545655, |
|
"learning_rate": 2.876812263807417e-06, |
|
"loss": 0.0455, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 1.4615326175716894, |
|
"learning_rate": 2.861405630285606e-06, |
|
"loss": 0.0653, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 1.2370763939930607, |
|
"learning_rate": 2.8460117861552833e-06, |
|
"loss": 0.0683, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 1.2836073017492384, |
|
"learning_rate": 2.8306310397152817e-06, |
|
"loss": 0.0638, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 1.2609833632520322, |
|
"learning_rate": 2.815263699002124e-06, |
|
"loss": 0.0469, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 1.1616499807754248, |
|
"learning_rate": 2.799910071783845e-06, |
|
"loss": 0.0408, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 1.1766220146156734, |
|
"learning_rate": 2.7845704655538383e-06, |
|
"loss": 0.0447, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 1.3126347990175358, |
|
"learning_rate": 2.7692451875246956e-06, |
|
"loss": 0.0644, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 1.1740389147488146, |
|
"learning_rate": 2.7539345446220444e-06, |
|
"loss": 0.0472, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 1.16464372281638, |
|
"learning_rate": 2.7386388434784143e-06, |
|
"loss": 0.0537, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"grad_norm": 1.3253535370867997, |
|
"learning_rate": 2.723358390427089e-06, |
|
"loss": 0.0589, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 1.2251011192581163, |
|
"learning_rate": 2.708093491495973e-06, |
|
"loss": 0.0443, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 1.0091161261068804, |
|
"learning_rate": 2.6928444524014595e-06, |
|
"loss": 0.0359, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 1.146439960851428, |
|
"learning_rate": 2.6776115785423123e-06, |
|
"loss": 0.0365, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 1.0769495975019767, |
|
"learning_rate": 2.6623951749935487e-06, |
|
"loss": 0.0327, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 1.2241733458914144, |
|
"learning_rate": 2.6471955465003237e-06, |
|
"loss": 0.0478, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"grad_norm": 1.3735061216517803, |
|
"learning_rate": 2.6320129974718355e-06, |
|
"loss": 0.0465, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 1.2436263847431321, |
|
"learning_rate": 2.616847831975224e-06, |
|
"loss": 0.0372, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 1.155193565242057, |
|
"learning_rate": 2.601700353729481e-06, |
|
"loss": 0.028, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"grad_norm": 1.388487580103464, |
|
"learning_rate": 2.58657086609937e-06, |
|
"loss": 0.0359, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"grad_norm": 1.3725232569394905, |
|
"learning_rate": 2.5714596720893473e-06, |
|
"loss": 0.0528, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 3.14, |
|
"grad_norm": 1.7184133842279257, |
|
"learning_rate": 2.5563670743374973e-06, |
|
"loss": 0.0514, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 1.3252001006412628, |
|
"learning_rate": 2.5412933751094662e-06, |
|
"loss": 0.0327, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 1.2819895992542942, |
|
"learning_rate": 2.5262388762924157e-06, |
|
"loss": 0.036, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 1.1604966130662278, |
|
"learning_rate": 2.5112038793889706e-06, |
|
"loss": 0.0412, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 1.3519367739400956, |
|
"learning_rate": 2.496188685511185e-06, |
|
"loss": 0.0302, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 1.2596321452487949, |
|
"learning_rate": 2.481193595374505e-06, |
|
"loss": 0.0295, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 1.3630477349522254, |
|
"learning_rate": 2.4662189092917563e-06, |
|
"loss": 0.0471, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 1.279411209620521, |
|
"learning_rate": 2.4512649271671214e-06, |
|
"loss": 0.0358, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 1.324111956217256, |
|
"learning_rate": 2.436331948490136e-06, |
|
"loss": 0.0379, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 1.3781397599646155, |
|
"learning_rate": 2.4214202723296924e-06, |
|
"loss": 0.0312, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"grad_norm": 1.2268909864908872, |
|
"learning_rate": 2.4065301973280486e-06, |
|
"loss": 0.039, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"grad_norm": 1.2617157160234982, |
|
"learning_rate": 2.391662021694847e-06, |
|
"loss": 0.0348, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"grad_norm": 1.354583484533038, |
|
"learning_rate": 2.3768160432011395e-06, |
|
"loss": 0.045, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"grad_norm": 1.4745147513254047, |
|
"learning_rate": 2.3619925591734323e-06, |
|
"loss": 0.0336, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"grad_norm": 1.1138377432924618, |
|
"learning_rate": 2.3471918664877217e-06, |
|
"loss": 0.0327, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 1.1310391517344176, |
|
"learning_rate": 2.332414261563553e-06, |
|
"loss": 0.0322, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 1.1441571285331864, |
|
"learning_rate": 2.317660040358085e-06, |
|
"loss": 0.0366, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 1.1413826309676043, |
|
"learning_rate": 2.3029294983601598e-06, |
|
"loss": 0.0318, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 1.021725022217316, |
|
"learning_rate": 2.2882229305843866e-06, |
|
"loss": 0.0313, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 1.0432467561700225, |
|
"learning_rate": 2.2735406315652323e-06, |
|
"loss": 0.0313, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"grad_norm": 1.0667155748112274, |
|
"learning_rate": 2.258882895351125e-06, |
|
"loss": 0.0276, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 1.076264156121743, |
|
"learning_rate": 2.2442500154985643e-06, |
|
"loss": 0.0314, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 1.2337389573803583, |
|
"learning_rate": 2.229642285066236e-06, |
|
"loss": 0.0276, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"grad_norm": 1.1832754817783542, |
|
"learning_rate": 2.215059996609154e-06, |
|
"loss": 0.04, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"grad_norm": 1.0855122122885388, |
|
"learning_rate": 2.200503442172792e-06, |
|
"loss": 0.0307, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"grad_norm": 0.9598119147789916, |
|
"learning_rate": 2.185972913287241e-06, |
|
"loss": 0.0188, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 0.9791085255254713, |
|
"learning_rate": 2.1714687009613628e-06, |
|
"loss": 0.0177, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 1.0477977537384306, |
|
"learning_rate": 2.156991095676971e-06, |
|
"loss": 0.0306, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 1.0696773212764674, |
|
"learning_rate": 2.1425403873830083e-06, |
|
"loss": 0.0245, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 0.9827891858254738, |
|
"learning_rate": 2.1281168654897376e-06, |
|
"loss": 0.0242, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 1.0998427460091256, |
|
"learning_rate": 2.113720818862951e-06, |
|
"loss": 0.0324, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"grad_norm": 1.1371184865733825, |
|
"learning_rate": 2.099352535818182e-06, |
|
"loss": 0.0292, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 0.9408745922040473, |
|
"learning_rate": 2.085012304114933e-06, |
|
"loss": 0.0287, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 0.9922924631524535, |
|
"learning_rate": 2.070700410950906e-06, |
|
"loss": 0.0278, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 1.2103088874857002, |
|
"learning_rate": 2.0564171429562587e-06, |
|
"loss": 0.0253, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 1.1353549204106046, |
|
"learning_rate": 2.042162786187862e-06, |
|
"loss": 0.0256, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 1.1954879688124018, |
|
"learning_rate": 2.027937626123565e-06, |
|
"loss": 0.0281, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 1.1145609650353256, |
|
"learning_rate": 2.0137419476564896e-06, |
|
"loss": 0.0254, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 0.9904281919589162, |
|
"learning_rate": 1.9995760350893098e-06, |
|
"loss": 0.0288, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"grad_norm": 0.916410698775379, |
|
"learning_rate": 1.985440172128573e-06, |
|
"loss": 0.0192, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 0.9135967072547435, |
|
"learning_rate": 1.9713346418790058e-06, |
|
"loss": 0.0243, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 1.0987171731854852, |
|
"learning_rate": 1.957259726837849e-06, |
|
"loss": 0.0237, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"grad_norm": 1.0651180981741601, |
|
"learning_rate": 1.9432157088892064e-06, |
|
"loss": 0.0231, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"grad_norm": 1.3470623952115612, |
|
"learning_rate": 1.9292028692983824e-06, |
|
"loss": 0.0234, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"grad_norm": 1.1189511726531576, |
|
"learning_rate": 1.91522148870627e-06, |
|
"loss": 0.0223, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 3.43, |
|
"grad_norm": 0.9537039081471808, |
|
"learning_rate": 1.9012718471237144e-06, |
|
"loss": 0.0252, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 1.3303345811221687, |
|
"learning_rate": 1.887354223925911e-06, |
|
"loss": 0.0227, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 0.9577549623126181, |
|
"learning_rate": 1.87346889784681e-06, |
|
"loss": 0.0235, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 3.45, |
|
"grad_norm": 1.103057703050364, |
|
"learning_rate": 1.8596161469735374e-06, |
|
"loss": 0.0251, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 0.9698969241730576, |
|
"learning_rate": 1.8457962487408175e-06, |
|
"loss": 0.021, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 0.9735505582853365, |
|
"learning_rate": 1.8320094799254222e-06, |
|
"loss": 0.024, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 3.47, |
|
"grad_norm": 1.0579516176062307, |
|
"learning_rate": 1.8182561166406308e-06, |
|
"loss": 0.0252, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 3.47, |
|
"grad_norm": 1.3591760479632482, |
|
"learning_rate": 1.8045364343306915e-06, |
|
"loss": 0.0232, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 0.9698743028005155, |
|
"learning_rate": 1.7908507077653124e-06, |
|
"loss": 0.0228, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"grad_norm": 1.0221498824263056, |
|
"learning_rate": 1.7771992110341533e-06, |
|
"loss": 0.0203, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"grad_norm": 0.9962335713738956, |
|
"learning_rate": 1.7635822175413446e-06, |
|
"loss": 0.0208, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.9025864381589136, |
|
"learning_rate": 1.7500000000000008e-06, |
|
"loss": 0.0196, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"grad_norm": 1.0627981287441168, |
|
"learning_rate": 1.7364528304267646e-06, |
|
"loss": 0.0216, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"grad_norm": 0.8998289761019697, |
|
"learning_rate": 1.7229409801363635e-06, |
|
"loss": 0.0177, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 1.1032294275464554, |
|
"learning_rate": 1.7094647197361656e-06, |
|
"loss": 0.0252, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"grad_norm": 1.0706133630954544, |
|
"learning_rate": 1.6960243191207686e-06, |
|
"loss": 0.0169, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"grad_norm": 1.0878049572672386, |
|
"learning_rate": 1.6826200474665891e-06, |
|
"loss": 0.0218, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 1.0854933564945395, |
|
"learning_rate": 1.669252173226479e-06, |
|
"loss": 0.0276, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 1.1327704937861105, |
|
"learning_rate": 1.6559209641243388e-06, |
|
"loss": 0.0211, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 0.9110425525754956, |
|
"learning_rate": 1.642626687149765e-06, |
|
"loss": 0.0279, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 1.021236405302058, |
|
"learning_rate": 1.629369608552696e-06, |
|
"loss": 0.0186, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 0.9240338884131678, |
|
"learning_rate": 1.6161499938380873e-06, |
|
"loss": 0.0156, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"grad_norm": 0.890168680945506, |
|
"learning_rate": 1.6029681077605864e-06, |
|
"loss": 0.0205, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 1.066699979491714, |
|
"learning_rate": 1.5898242143192336e-06, |
|
"loss": 0.0232, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 0.916678367435508, |
|
"learning_rate": 1.576718576752179e-06, |
|
"loss": 0.0195, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"grad_norm": 0.8932364526829346, |
|
"learning_rate": 1.5636514575314024e-06, |
|
"loss": 0.0183, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 0.8546476261334689, |
|
"learning_rate": 1.550623118357463e-06, |
|
"loss": 0.0187, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 0.9062797963432817, |
|
"learning_rate": 1.5376338201542538e-06, |
|
"loss": 0.0189, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"grad_norm": 0.9603196879330562, |
|
"learning_rate": 1.5246838230637831e-06, |
|
"loss": 0.0218, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 0.8140801991948985, |
|
"learning_rate": 1.511773386440955e-06, |
|
"loss": 0.0154, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 0.8292223643786333, |
|
"learning_rate": 1.4989027688483808e-06, |
|
"loss": 0.023, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"grad_norm": 1.172191159788987, |
|
"learning_rate": 1.4860722280512022e-06, |
|
"loss": 0.0229, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"grad_norm": 1.0524048010257814, |
|
"learning_rate": 1.473282021011924e-06, |
|
"loss": 0.0237, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 0.8884909829231797, |
|
"learning_rate": 1.4605324038852707e-06, |
|
"loss": 0.0184, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"grad_norm": 0.9152406178203973, |
|
"learning_rate": 1.4478236320130554e-06, |
|
"loss": 0.0215, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"grad_norm": 0.8117379173139481, |
|
"learning_rate": 1.4351559599190708e-06, |
|
"loss": 0.0133, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 1.4449523252498588, |
|
"learning_rate": 1.4225296413039794e-06, |
|
"loss": 0.0154, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"grad_norm": 0.965819888705113, |
|
"learning_rate": 1.4099449290402492e-06, |
|
"loss": 0.0242, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"grad_norm": 0.753534692636084, |
|
"learning_rate": 1.3974020751670734e-06, |
|
"loss": 0.0147, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 1.04736497011196, |
|
"learning_rate": 1.3849013308853369e-06, |
|
"loss": 0.0254, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 0.8503127295336432, |
|
"learning_rate": 1.3724429465525733e-06, |
|
"loss": 0.0125, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 0.8142674930830482, |
|
"learning_rate": 1.360027171677957e-06, |
|
"loss": 0.0169, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"grad_norm": 0.8572587753256264, |
|
"learning_rate": 1.3476542549173097e-06, |
|
"loss": 0.0238, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 0.9520723281018718, |
|
"learning_rate": 1.335324444068108e-06, |
|
"loss": 0.0261, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 0.9394209110837798, |
|
"learning_rate": 1.3230379860645363e-06, |
|
"loss": 0.0219, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"grad_norm": 0.9794515331971883, |
|
"learning_rate": 1.3107951269725286e-06, |
|
"loss": 0.0167, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"grad_norm": 0.839665023620838, |
|
"learning_rate": 1.2985961119848508e-06, |
|
"loss": 0.0154, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 0.8596538810285997, |
|
"learning_rate": 1.28644118541618e-06, |
|
"loss": 0.0217, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"grad_norm": 0.9648126172460845, |
|
"learning_rate": 1.2743305906982184e-06, |
|
"loss": 0.0185, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"grad_norm": 0.9764156247033144, |
|
"learning_rate": 1.2622645703748163e-06, |
|
"loss": 0.018, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 0.8055243584840867, |
|
"learning_rate": 1.2502433660971122e-06, |
|
"loss": 0.012, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 0.7744512728259326, |
|
"learning_rate": 1.2382672186187003e-06, |
|
"loss": 0.0161, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 0.7743427992830083, |
|
"learning_rate": 1.2263363677907975e-06, |
|
"loss": 0.0137, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"grad_norm": 0.7684669655309241, |
|
"learning_rate": 1.214451052557453e-06, |
|
"loss": 0.0151, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"grad_norm": 0.7858334522617288, |
|
"learning_rate": 1.202611510950747e-06, |
|
"loss": 0.0174, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"grad_norm": 0.9474020449844386, |
|
"learning_rate": 1.1908179800860415e-06, |
|
"loss": 0.0179, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"grad_norm": 0.7981094796922926, |
|
"learning_rate": 1.1790706961572176e-06, |
|
"loss": 0.0156, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"grad_norm": 0.8306780864601689, |
|
"learning_rate": 1.167369894431949e-06, |
|
"loss": 0.0218, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 0.8809681256095874, |
|
"learning_rate": 1.1557158092469968e-06, |
|
"loss": 0.015, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 0.805530152973458, |
|
"learning_rate": 1.1441086740035036e-06, |
|
"loss": 0.017, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 0.8917842406873597, |
|
"learning_rate": 1.1325487211623343e-06, |
|
"loss": 0.0162, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 1.0306467922307088, |
|
"learning_rate": 1.121036182239403e-06, |
|
"loss": 0.0136, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 0.9249608665631512, |
|
"learning_rate": 1.1095712878010542e-06, |
|
"loss": 0.0245, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 0.8900342673768057, |
|
"learning_rate": 1.0981542674594327e-06, |
|
"loss": 0.0188, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 1.0239966323535892, |
|
"learning_rate": 1.08678534986789e-06, |
|
"loss": 0.0247, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 0.895563506776096, |
|
"learning_rate": 1.0754647627164022e-06, |
|
"loss": 0.0245, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 0.92433246398148, |
|
"learning_rate": 1.064192732727016e-06, |
|
"loss": 0.0161, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 3.86, |
|
"grad_norm": 0.9108465392412978, |
|
"learning_rate": 1.0529694856493002e-06, |
|
"loss": 0.0166, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 0.8310135807305727, |
|
"learning_rate": 1.0417952462558286e-06, |
|
"loss": 0.0223, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 0.8255760949970268, |
|
"learning_rate": 1.0306702383376813e-06, |
|
"loss": 0.0195, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 0.7324462030084228, |
|
"learning_rate": 1.0195946846999551e-06, |
|
"loss": 0.0169, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 0.7994877527959611, |
|
"learning_rate": 1.0085688071573086e-06, |
|
"loss": 0.0225, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 3.89, |
|
"grad_norm": 0.931125141975873, |
|
"learning_rate": 9.97592826529514e-07, |
|
"loss": 0.0252, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 0.9471536735997285, |
|
"learning_rate": 9.866669626370412e-07, |
|
"loss": 0.0205, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 0.9266292677740381, |
|
"learning_rate": 9.757914342966495e-07, |
|
"loss": 0.015, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"grad_norm": 0.7337228958953126, |
|
"learning_rate": 9.649664593170062e-07, |
|
"loss": 0.0146, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 0.8563548759407368, |
|
"learning_rate": 9.541922544943295e-07, |
|
"loss": 0.0181, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 0.8954454947897389, |
|
"learning_rate": 9.434690356080394e-07, |
|
"loss": 0.0131, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"grad_norm": 0.7629698438514336, |
|
"learning_rate": 9.327970174164409e-07, |
|
"loss": 0.0147, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"grad_norm": 0.7699002072470454, |
|
"learning_rate": 9.221764136524202e-07, |
|
"loss": 0.0171, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"grad_norm": 0.7224366715433516, |
|
"learning_rate": 9.116074370191705e-07, |
|
"loss": 0.0141, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 3.95, |
|
"grad_norm": 0.7827602493253287, |
|
"learning_rate": 9.010902991859196e-07, |
|
"loss": 0.0333, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 0.8407830538126995, |
|
"learning_rate": 8.906252107837054e-07, |
|
"loss": 0.0229, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 0.9271058679893376, |
|
"learning_rate": 8.802123814011458e-07, |
|
"loss": 0.0212, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"grad_norm": 0.8249118909539904, |
|
"learning_rate": 8.698520195802499e-07, |
|
"loss": 0.0178, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"grad_norm": 0.9062535578279738, |
|
"learning_rate": 8.595443328122345e-07, |
|
"loss": 0.0151, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"grad_norm": 0.9007400301918379, |
|
"learning_rate": 8.492895275333705e-07, |
|
"loss": 0.0149, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"grad_norm": 0.6906468788571227, |
|
"learning_rate": 8.390878091208544e-07, |
|
"loss": 0.0121, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"grad_norm": 0.8191320475767269, |
|
"learning_rate": 8.289393818886837e-07, |
|
"loss": 0.0159, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 1.0171382420120563, |
|
"learning_rate": 8.188444490835774e-07, |
|
"loss": 0.0182, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 4.01, |
|
"grad_norm": 0.6877168878883905, |
|
"learning_rate": 8.088032128808952e-07, |
|
"loss": 0.0115, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 4.01, |
|
"grad_norm": 0.8301412318523347, |
|
"learning_rate": 7.988158743805973e-07, |
|
"loss": 0.0192, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 4.02, |
|
"grad_norm": 0.7473939831637764, |
|
"learning_rate": 7.888826336032093e-07, |
|
"loss": 0.02, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"grad_norm": 0.6891353595867798, |
|
"learning_rate": 7.790036894858198e-07, |
|
"loss": 0.0197, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"grad_norm": 0.6608127967094858, |
|
"learning_rate": 7.691792398780962e-07, |
|
"loss": 0.0122, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"grad_norm": 0.6801259718598915, |
|
"learning_rate": 7.594094815383223e-07, |
|
"loss": 0.0102, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"grad_norm": 0.6607722356860742, |
|
"learning_rate": 7.496946101294585e-07, |
|
"loss": 0.0124, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 4.05, |
|
"grad_norm": 0.6764859766385257, |
|
"learning_rate": 7.400348202152192e-07, |
|
"loss": 0.0186, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 4.06, |
|
"grad_norm": 0.5890171229802623, |
|
"learning_rate": 7.304303052561841e-07, |
|
"loss": 0.0121, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 4.06, |
|
"grad_norm": 0.7586985486515609, |
|
"learning_rate": 7.208812576059113e-07, |
|
"loss": 0.0164, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 4.07, |
|
"grad_norm": 0.7284236571177901, |
|
"learning_rate": 7.113878685070994e-07, |
|
"loss": 0.0165, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"grad_norm": 0.72155125556459, |
|
"learning_rate": 7.019503280877466e-07, |
|
"loss": 0.0105, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"grad_norm": 0.7008419280912078, |
|
"learning_rate": 6.925688253573465e-07, |
|
"loss": 0.0125, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"grad_norm": 0.5900754345006012, |
|
"learning_rate": 6.832435482031064e-07, |
|
"loss": 0.01, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"grad_norm": 0.5236568988968674, |
|
"learning_rate": 6.73974683386176e-07, |
|
"loss": 0.0093, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"grad_norm": 0.5402196308668088, |
|
"learning_rate": 6.647624165379173e-07, |
|
"loss": 0.012, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 4.11, |
|
"grad_norm": 0.7153834377519408, |
|
"learning_rate": 6.55606932156175e-07, |
|
"loss": 0.0125, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 4.12, |
|
"grad_norm": 0.7035478419818489, |
|
"learning_rate": 6.465084136015951e-07, |
|
"loss": 0.0095, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 4.12, |
|
"grad_norm": 0.450441289716298, |
|
"learning_rate": 6.374670430939404e-07, |
|
"loss": 0.0063, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 4.13, |
|
"grad_norm": 0.6704283408089174, |
|
"learning_rate": 6.284830017084488e-07, |
|
"loss": 0.0074, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 4.13, |
|
"grad_norm": 0.5795149303845601, |
|
"learning_rate": 6.195564693722027e-07, |
|
"loss": 0.014, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"grad_norm": 0.6404441767465421, |
|
"learning_rate": 6.106876248605299e-07, |
|
"loss": 0.0126, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 4.15, |
|
"grad_norm": 0.49124988946928483, |
|
"learning_rate": 6.018766457934177e-07, |
|
"loss": 0.0054, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 4.15, |
|
"grad_norm": 0.6895422647706185, |
|
"learning_rate": 5.931237086319592e-07, |
|
"loss": 0.009, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"grad_norm": 0.7133260869582704, |
|
"learning_rate": 5.844289886748196e-07, |
|
"loss": 0.0136, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 4.17, |
|
"grad_norm": 0.6485862251152251, |
|
"learning_rate": 5.757926600547231e-07, |
|
"loss": 0.0076, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 4.17, |
|
"grad_norm": 0.7007270882813788, |
|
"learning_rate": 5.672148957349661e-07, |
|
"loss": 0.0095, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 4.18, |
|
"grad_norm": 0.5453225501286796, |
|
"learning_rate": 5.586958675059548e-07, |
|
"loss": 0.0137, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"grad_norm": 0.46759765618604715, |
|
"learning_rate": 5.502357459817639e-07, |
|
"loss": 0.01, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"grad_norm": 0.40298536871030216, |
|
"learning_rate": 5.418347005967189e-07, |
|
"loss": 0.0074, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"grad_norm": 0.7139987824205981, |
|
"learning_rate": 5.334928996020013e-07, |
|
"loss": 0.0109, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 4.21, |
|
"grad_norm": 0.4688415136809408, |
|
"learning_rate": 5.252105100622848e-07, |
|
"loss": 0.0077, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 4.21, |
|
"grad_norm": 0.5717898546159872, |
|
"learning_rate": 5.169876978523828e-07, |
|
"loss": 0.0105, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 4.22, |
|
"grad_norm": 0.5863304725088486, |
|
"learning_rate": 5.088246276539292e-07, |
|
"loss": 0.0127, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 4.22, |
|
"grad_norm": 0.748734639494948, |
|
"learning_rate": 5.0072146295208e-07, |
|
"loss": 0.0083, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 4.23, |
|
"grad_norm": 0.5106657629410736, |
|
"learning_rate": 4.926783660322411e-07, |
|
"loss": 0.0093, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 4.24, |
|
"grad_norm": 0.5249256431071327, |
|
"learning_rate": 4.846954979768149e-07, |
|
"loss": 0.009, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 4.24, |
|
"grad_norm": 0.5837936537112237, |
|
"learning_rate": 4.7677301866197455e-07, |
|
"loss": 0.0111, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 0.5436208344015698, |
|
"learning_rate": 4.6891108675446453e-07, |
|
"loss": 0.0081, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 4.26, |
|
"grad_norm": 0.45668925332803584, |
|
"learning_rate": 4.611098597084226e-07, |
|
"loss": 0.0086, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 4.26, |
|
"grad_norm": 0.5234369553141341, |
|
"learning_rate": 4.533694937622227e-07, |
|
"loss": 0.0098, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 4.27, |
|
"grad_norm": 0.43090051877682023, |
|
"learning_rate": 4.456901439353499e-07, |
|
"loss": 0.0069, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"grad_norm": 0.5065114847080426, |
|
"learning_rate": 4.3807196402529535e-07, |
|
"loss": 0.0078, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"grad_norm": 0.3681825929403806, |
|
"learning_rate": 4.3051510660447336e-07, |
|
"loss": 0.0046, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 4.29, |
|
"grad_norm": 0.5198397329641357, |
|
"learning_rate": 4.2301972301716934e-07, |
|
"loss": 0.0091, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 4.29, |
|
"grad_norm": 0.9754162271751636, |
|
"learning_rate": 4.155859633765044e-07, |
|
"loss": 0.0081, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"grad_norm": 0.3500513380364396, |
|
"learning_rate": 4.0821397656143503e-07, |
|
"loss": 0.005, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 4.31, |
|
"grad_norm": 0.5822401487852554, |
|
"learning_rate": 4.009039102137657e-07, |
|
"loss": 0.0043, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 4.31, |
|
"grad_norm": 0.41873061343417606, |
|
"learning_rate": 3.9365591073519387e-07, |
|
"loss": 0.0078, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 4.32, |
|
"grad_norm": 0.49737057792780415, |
|
"learning_rate": 3.8647012328438085e-07, |
|
"loss": 0.0063, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"grad_norm": 0.5416348114415329, |
|
"learning_rate": 3.793466917740402e-07, |
|
"loss": 0.0069, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"grad_norm": 0.5946803237802584, |
|
"learning_rate": 3.7228575886805744e-07, |
|
"loss": 0.0101, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 4.34, |
|
"grad_norm": 0.5792466423496542, |
|
"learning_rate": 3.6528746597863283e-07, |
|
"loss": 0.0091, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"grad_norm": 0.5149239013215101, |
|
"learning_rate": 3.583519532634516e-07, |
|
"loss": 0.0094, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"grad_norm": 0.5108129659609797, |
|
"learning_rate": 3.514793596228702e-07, |
|
"loss": 0.0079, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"grad_norm": 0.5884548387452648, |
|
"learning_rate": 3.44669822697144e-07, |
|
"loss": 0.0065, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 4.37, |
|
"grad_norm": 0.45883688525931454, |
|
"learning_rate": 3.3792347886366265e-07, |
|
"loss": 0.0086, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 4.37, |
|
"grad_norm": 0.5151424364772744, |
|
"learning_rate": 3.31240463234221e-07, |
|
"loss": 0.0075, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"grad_norm": 0.5951741308945779, |
|
"learning_rate": 3.2462090965231767e-07, |
|
"loss": 0.0055, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"grad_norm": 0.4910893940517418, |
|
"learning_rate": 3.180649506904667e-07, |
|
"loss": 0.0094, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 4.39, |
|
"grad_norm": 0.3032369959343678, |
|
"learning_rate": 3.1157271764755085e-07, |
|
"loss": 0.0056, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 0.46113289164382765, |
|
"learning_rate": 3.0514434054618216e-07, |
|
"loss": 0.0063, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 0.649482604619634, |
|
"learning_rate": 2.987799481301091e-07, |
|
"loss": 0.0062, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 4.41, |
|
"grad_norm": 0.56089990875009, |
|
"learning_rate": 2.924796678616297e-07, |
|
"loss": 0.0069, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 4.42, |
|
"grad_norm": 0.4098789353635756, |
|
"learning_rate": 2.862436259190414e-07, |
|
"loss": 0.0072, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 4.42, |
|
"grad_norm": 0.42421611582010255, |
|
"learning_rate": 2.800719471941152e-07, |
|
"loss": 0.0055, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 4.43, |
|
"grad_norm": 0.46243164776878504, |
|
"learning_rate": 2.739647552895949e-07, |
|
"loss": 0.0101, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 0.506707466448847, |
|
"learning_rate": 2.6792217251671744e-07, |
|
"loss": 0.0049, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 0.3925167921918842, |
|
"learning_rate": 2.619443198927677e-07, |
|
"loss": 0.0072, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 4.45, |
|
"grad_norm": 0.3420002956507376, |
|
"learning_rate": 2.5603131713865374e-07, |
|
"loss": 0.0076, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 0.29143440480895755, |
|
"learning_rate": 2.50183282676508e-07, |
|
"loss": 0.0053, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 0.35335225088438177, |
|
"learning_rate": 2.444003336273163e-07, |
|
"loss": 0.0075, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 4.47, |
|
"grad_norm": 0.7377895380554025, |
|
"learning_rate": 2.3868258580857164e-07, |
|
"loss": 0.0079, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 4.47, |
|
"grad_norm": 0.3732383566357089, |
|
"learning_rate": 2.3303015373195713e-07, |
|
"loss": 0.0079, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"grad_norm": 0.30406904337344, |
|
"learning_rate": 2.2744315060104846e-07, |
|
"loss": 0.0058, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 4.49, |
|
"grad_norm": 0.2979113383173518, |
|
"learning_rate": 2.2192168830904963e-07, |
|
"loss": 0.0046, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 4.49, |
|
"grad_norm": 0.26022505864795753, |
|
"learning_rate": 2.1646587743655287e-07, |
|
"loss": 0.0045, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 0.38203960053911995, |
|
"learning_rate": 2.1107582724932088e-07, |
|
"loss": 0.0067, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 4.51, |
|
"grad_norm": 0.4611969431592941, |
|
"learning_rate": 2.0575164569610016e-07, |
|
"loss": 0.0058, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 4.51, |
|
"grad_norm": 0.48964538570578786, |
|
"learning_rate": 2.0049343940645935e-07, |
|
"loss": 0.0058, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"grad_norm": 0.3748977896181229, |
|
"learning_rate": 1.953013136886541e-07, |
|
"loss": 0.0079, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 4.53, |
|
"grad_norm": 0.3684735348004529, |
|
"learning_rate": 1.901753725275166e-07, |
|
"loss": 0.0034, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 4.53, |
|
"grad_norm": 0.45813437926806905, |
|
"learning_rate": 1.8511571858237357e-07, |
|
"loss": 0.0049, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 4.54, |
|
"grad_norm": 0.38090807447648417, |
|
"learning_rate": 1.801224531849908e-07, |
|
"loss": 0.0081, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 4.54, |
|
"grad_norm": 0.7300382830511903, |
|
"learning_rate": 1.7519567633754352e-07, |
|
"loss": 0.0067, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"grad_norm": 0.4521252420340857, |
|
"learning_rate": 1.70335486710614e-07, |
|
"loss": 0.0107, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 4.56, |
|
"grad_norm": 0.3290046038461478, |
|
"learning_rate": 1.6554198164121265e-07, |
|
"loss": 0.0056, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 4.56, |
|
"grad_norm": 0.23389758856912887, |
|
"learning_rate": 1.6081525713083428e-07, |
|
"loss": 0.004, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 4.57, |
|
"grad_norm": 0.4187101993019587, |
|
"learning_rate": 1.561554078435296e-07, |
|
"loss": 0.0062, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 4.58, |
|
"grad_norm": 0.43647005515014636, |
|
"learning_rate": 1.5156252710401207e-07, |
|
"loss": 0.0076, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 4.58, |
|
"grad_norm": 0.3727002889542965, |
|
"learning_rate": 1.4703670689578884e-07, |
|
"loss": 0.0066, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 4.59, |
|
"grad_norm": 0.39326688282125816, |
|
"learning_rate": 1.4257803785931926e-07, |
|
"loss": 0.0069, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 0.31355113796472983, |
|
"learning_rate": 1.3818660929019717e-07, |
|
"loss": 0.0058, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 0.3582549090050492, |
|
"learning_rate": 1.3386250913736408e-07, |
|
"loss": 0.0055, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 4.61, |
|
"grad_norm": 0.3557691839899451, |
|
"learning_rate": 1.296058240013491e-07, |
|
"loss": 0.007, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"grad_norm": 0.27232509097291063, |
|
"learning_rate": 1.2541663913253191e-07, |
|
"loss": 0.0054, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"grad_norm": 0.29919984285618584, |
|
"learning_rate": 1.2129503842943645e-07, |
|
"loss": 0.0073, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 4.63, |
|
"grad_norm": 0.32031918250676045, |
|
"learning_rate": 1.1724110443705115e-07, |
|
"loss": 0.0062, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 4.63, |
|
"grad_norm": 0.32462046027184965, |
|
"learning_rate": 1.1325491834517676e-07, |
|
"loss": 0.0069, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"grad_norm": 0.3703162253466964, |
|
"learning_rate": 1.0933655998679653e-07, |
|
"loss": 0.0049, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 4.65, |
|
"grad_norm": 0.3830835431373469, |
|
"learning_rate": 1.0548610783648199e-07, |
|
"loss": 0.0077, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 4.65, |
|
"grad_norm": 0.3042237914246762, |
|
"learning_rate": 1.0170363900881795e-07, |
|
"loss": 0.0039, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 4.66, |
|
"grad_norm": 0.2615931115864544, |
|
"learning_rate": 9.798922925685994e-08, |
|
"loss": 0.0042, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 4.67, |
|
"grad_norm": 0.35403759124068945, |
|
"learning_rate": 9.434295297061668e-08, |
|
"loss": 0.0083, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 4.67, |
|
"grad_norm": 0.44091888309449573, |
|
"learning_rate": 9.076488317555886e-08, |
|
"loss": 0.0058, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"grad_norm": 0.405216970933536, |
|
"learning_rate": 8.725509153115918e-08, |
|
"loss": 0.0092, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 4.69, |
|
"grad_norm": 0.28509534429106465, |
|
"learning_rate": 8.38136483294546e-08, |
|
"loss": 0.0037, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 4.69, |
|
"grad_norm": 0.2940024574327348, |
|
"learning_rate": 8.044062249364048e-08, |
|
"loss": 0.0054, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 4.7, |
|
"grad_norm": 0.36361039302741816, |
|
"learning_rate": 7.713608157668921e-08, |
|
"loss": 0.0081, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 4.71, |
|
"grad_norm": 0.39672091027135914, |
|
"learning_rate": 7.390009175999835e-08, |
|
"loss": 0.0094, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 4.71, |
|
"grad_norm": 0.40191807872522295, |
|
"learning_rate": 7.073271785206314e-08, |
|
"loss": 0.0078, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"grad_norm": 0.2901495174113024, |
|
"learning_rate": 6.763402328718116e-08, |
|
"loss": 0.0052, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"grad_norm": 0.2532854567420693, |
|
"learning_rate": 6.460407012417918e-08, |
|
"loss": 0.004, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"grad_norm": 0.3225727935088736, |
|
"learning_rate": 6.164291904517333e-08, |
|
"loss": 0.0083, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 4.74, |
|
"grad_norm": 0.3069170665460628, |
|
"learning_rate": 5.875062935435121e-08, |
|
"loss": 0.0052, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 4.74, |
|
"grad_norm": 0.2612538982055557, |
|
"learning_rate": 5.592725897678446e-08, |
|
"loss": 0.0057, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 0.24090688702155097, |
|
"learning_rate": 5.3172864457271926e-08, |
|
"loss": 0.0035, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 4.76, |
|
"grad_norm": 0.36351641612749136, |
|
"learning_rate": 5.048750095920151e-08, |
|
"loss": 0.0062, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 4.76, |
|
"grad_norm": 0.43612520351048434, |
|
"learning_rate": 4.787122226345014e-08, |
|
"loss": 0.005, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 4.77, |
|
"grad_norm": 0.29747843114335953, |
|
"learning_rate": 4.532408076730504e-08, |
|
"loss": 0.0057, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 4.78, |
|
"grad_norm": 0.3032920639088825, |
|
"learning_rate": 4.2846127483414206e-08, |
|
"loss": 0.0061, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 4.78, |
|
"grad_norm": 0.33781142844587425, |
|
"learning_rate": 4.043741203876483e-08, |
|
"loss": 0.0048, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"grad_norm": 0.2791313716105004, |
|
"learning_rate": 3.80979826736893e-08, |
|
"loss": 0.0054, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"grad_norm": 0.424062860990937, |
|
"learning_rate": 3.58278862409e-08, |
|
"loss": 0.0076, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"grad_norm": 0.35602806799069403, |
|
"learning_rate": 3.3627168204549306e-08, |
|
"loss": 0.0044, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 4.81, |
|
"grad_norm": 0.2967466134268081, |
|
"learning_rate": 3.1495872639320357e-08, |
|
"loss": 0.0059, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 4.81, |
|
"grad_norm": 0.29343066100283227, |
|
"learning_rate": 2.9434042229544543e-08, |
|
"loss": 0.0067, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 4.82, |
|
"grad_norm": 0.25925792441623674, |
|
"learning_rate": 2.7441718268344737e-08, |
|
"loss": 0.0055, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"grad_norm": 0.33458705494389285, |
|
"learning_rate": 2.5518940656811095e-08, |
|
"loss": 0.0088, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"grad_norm": 0.32973000192844754, |
|
"learning_rate": 2.3665747903199418e-08, |
|
"loss": 0.0076, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"grad_norm": 0.3978241736214976, |
|
"learning_rate": 2.1882177122162173e-08, |
|
"loss": 0.0083, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 4.85, |
|
"grad_norm": 0.3846684615361078, |
|
"learning_rate": 2.0168264034002404e-08, |
|
"loss": 0.0107, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 4.85, |
|
"grad_norm": 0.3281597086771846, |
|
"learning_rate": 1.8524042963961095e-08, |
|
"loss": 0.0057, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 4.86, |
|
"grad_norm": 0.3098674134494085, |
|
"learning_rate": 1.6949546841528607e-08, |
|
"loss": 0.0055, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 4.87, |
|
"grad_norm": 0.3827204692079133, |
|
"learning_rate": 1.544480719978447e-08, |
|
"loss": 0.0086, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 4.87, |
|
"grad_norm": 0.332400995275947, |
|
"learning_rate": 1.4009854174767521e-08, |
|
"loss": 0.0093, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"grad_norm": 0.29676359757111426, |
|
"learning_rate": 1.2644716504870091e-08, |
|
"loss": 0.0073, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"grad_norm": 0.4005292260275929, |
|
"learning_rate": 1.1349421530265246e-08, |
|
"loss": 0.0094, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 4.89, |
|
"grad_norm": 0.3811448231039689, |
|
"learning_rate": 1.0123995192356183e-08, |
|
"loss": 0.0099, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 4.9, |
|
"grad_norm": 0.3593225301938938, |
|
"learning_rate": 8.968462033259405e-09, |
|
"loss": 0.0086, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 4.9, |
|
"grad_norm": 0.3738117335803207, |
|
"learning_rate": 7.882845195312016e-09, |
|
"loss": 0.0052, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 4.91, |
|
"grad_norm": 0.2792515305128813, |
|
"learning_rate": 6.8671664206073625e-09, |
|
"loss": 0.0048, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 0.325832261539929, |
|
"learning_rate": 5.921446050561386e-09, |
|
"loss": 0.0083, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 0.3060573357618961, |
|
"learning_rate": 5.0457030255038334e-09, |
|
"loss": 0.0043, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 4.93, |
|
"grad_norm": 0.29897144167044903, |
|
"learning_rate": 4.239954884299401e-09, |
|
"loss": 0.0068, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 4.94, |
|
"grad_norm": 0.33960368756040765, |
|
"learning_rate": 3.5042177639972304e-09, |
|
"loss": 0.0086, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 4.94, |
|
"grad_norm": 0.31535432780159894, |
|
"learning_rate": 2.838506399506446e-09, |
|
"loss": 0.007, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"grad_norm": 0.4802190663449603, |
|
"learning_rate": 2.2428341233012294e-09, |
|
"loss": 0.0199, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 4.96, |
|
"grad_norm": 0.42217488555827554, |
|
"learning_rate": 1.7172128651554152e-09, |
|
"loss": 0.0108, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 4.96, |
|
"grad_norm": 0.39472643041620875, |
|
"learning_rate": 1.2616531519011874e-09, |
|
"loss": 0.0081, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 4.97, |
|
"grad_norm": 0.29963966680365745, |
|
"learning_rate": 8.761641072196346e-10, |
|
"loss": 0.0072, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 4.97, |
|
"grad_norm": 0.3767724810022492, |
|
"learning_rate": 5.607534514585066e-10, |
|
"loss": 0.0057, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 4.98, |
|
"grad_norm": 0.2953142117019799, |
|
"learning_rate": 3.1542750147639517e-10, |
|
"loss": 0.0055, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 4.99, |
|
"grad_norm": 0.28636138519269905, |
|
"learning_rate": 1.401911705168346e-10, |
|
"loss": 0.0056, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 4.99, |
|
"grad_norm": 0.2993184966019565, |
|
"learning_rate": 3.5047968109214176e-11, |
|
"loss": 0.0063, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.3531925803337977, |
|
"learning_rate": 0.0, |
|
"loss": 0.0076, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"step": 780, |
|
"total_flos": 0.0, |
|
"train_loss": 0.17006511208034145, |
|
"train_runtime": 1554.1821, |
|
"train_samples_per_second": 16.086, |
|
"train_steps_per_second": 0.502 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 780, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
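
The object above is the standard trainer state that the Hugging Face Transformers Trainer writes out as trainer_state.json: "log_history" holds one entry per logged optimizer step, and its final entry carries run-level aggregates ("train_loss", "train_runtime", "train_samples_per_second") in place of a per-step "loss". As a minimal sketch of how such a file can be inspected, the Python snippet below parses the log and plots training loss and learning rate against the step counter. The file name "trainer_state.json" and the use of matplotlib are assumptions made for illustration; neither is recorded in the state itself.

# A minimal sketch, assuming the state above is saved as "trainer_state.json"
# next to this script and that matplotlib is installed.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the per-step entries; the trailing summary entry has no "loss" key.
history = [entry for entry in state["log_history"] if "loss" in entry]

steps = [entry["step"] for entry in history]
losses = [entry["loss"] for entry in history]
lrs = [entry["learning_rate"] for entry in history]

fig, (loss_ax, lr_ax) = plt.subplots(2, 1, sharex=True)
loss_ax.plot(steps, losses)
loss_ax.set_ylabel("training loss")
lr_ax.plot(steps, lrs)
lr_ax.set_ylabel("learning rate")
lr_ax.set_xlabel("step")
fig.tight_layout()
plt.show()

Filtering on the "loss" key is what keeps that summary entry out of the plotted series; the same pattern works for "grad_norm" or any other per-step field logged here.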