|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 5.0, |
|
"eval_steps": 1000, |
|
"global_step": 100000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.8100409507751465, |
|
"learning_rate": 5.9999999999999995e-05, |
|
"loss": 1.816, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.0648651123046875, |
|
"learning_rate": 0.00011999999999999999, |
|
"loss": 1.5968, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.9017549753189087, |
|
"learning_rate": 0.00017999999999999998, |
|
"loss": 1.56, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.5334885120391846, |
|
"learning_rate": 0.00023999999999999998, |
|
"loss": 1.587, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.3036648035049438, |
|
"learning_rate": 0.0003, |
|
"loss": 1.6182, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.709660530090332, |
|
"learning_rate": 0.00029969849246231153, |
|
"loss": 1.6102, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.5684775114059448, |
|
"learning_rate": 0.0002993969849246231, |
|
"loss": 1.6094, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.3330438137054443, |
|
"learning_rate": 0.00029909547738693465, |
|
"loss": 1.6118, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.1563549041748047, |
|
"learning_rate": 0.0002987939698492462, |
|
"loss": 1.6596, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.4043567180633545, |
|
"learning_rate": 0.00029849547738693464, |
|
"loss": 1.6071, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 1.585342288017273, |
|
"eval_runtime": 37.6462, |
|
"eval_samples_per_second": 26.563, |
|
"eval_steps_per_second": 3.32, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.3647234439849854, |
|
"learning_rate": 0.0002981939698492462, |
|
"loss": 1.611, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.3917016983032227, |
|
"learning_rate": 0.00029789246231155776, |
|
"loss": 1.6003, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.7931370735168457, |
|
"learning_rate": 0.0002975909547738693, |
|
"loss": 1.5789, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.542971611022949, |
|
"learning_rate": 0.0002972894472361809, |
|
"loss": 1.5435, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.8555421829223633, |
|
"learning_rate": 0.00029698793969849243, |
|
"loss": 1.5513, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.9988830089569092, |
|
"learning_rate": 0.000296686432160804, |
|
"loss": 1.5763, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.5328696966171265, |
|
"learning_rate": 0.00029638492462311555, |
|
"loss": 1.5529, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.442533254623413, |
|
"learning_rate": 0.0002960834170854271, |
|
"loss": 1.5581, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.4188216924667358, |
|
"learning_rate": 0.00029578190954773867, |
|
"loss": 1.5598, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.700873851776123, |
|
"learning_rate": 0.00029548040201005023, |
|
"loss": 1.6091, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"eval_loss": 1.5680323839187622, |
|
"eval_runtime": 37.9632, |
|
"eval_samples_per_second": 26.341, |
|
"eval_steps_per_second": 3.293, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.415462493896484, |
|
"learning_rate": 0.0002951788944723618, |
|
"loss": 1.5435, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.5002624988555908, |
|
"learning_rate": 0.00029487738693467335, |
|
"loss": 1.5485, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.8552610874176025, |
|
"learning_rate": 0.0002945758793969849, |
|
"loss": 1.5687, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.6914422512054443, |
|
"learning_rate": 0.00029427437185929647, |
|
"loss": 1.5549, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.5994210243225098, |
|
"learning_rate": 0.00029397286432160803, |
|
"loss": 1.5541, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.9448769092559814, |
|
"learning_rate": 0.0002936713567839196, |
|
"loss": 1.5348, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 2.3909597396850586, |
|
"learning_rate": 0.00029336984924623115, |
|
"loss": 1.5629, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.4517822265625, |
|
"learning_rate": 0.0002930683417085427, |
|
"loss": 1.4946, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.7407867908477783, |
|
"learning_rate": 0.0002927668341708542, |
|
"loss": 1.568, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.3732205629348755, |
|
"learning_rate": 0.0002924653266331658, |
|
"loss": 1.4928, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"eval_loss": 1.5172981023788452, |
|
"eval_runtime": 37.8358, |
|
"eval_samples_per_second": 26.43, |
|
"eval_steps_per_second": 3.304, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.9255911111831665, |
|
"learning_rate": 0.0002921638190954774, |
|
"loss": 1.5208, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.7328695058822632, |
|
"learning_rate": 0.00029186231155778895, |
|
"loss": 1.5442, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 2.286285400390625, |
|
"learning_rate": 0.00029156080402010045, |
|
"loss": 1.5071, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 2.426595687866211, |
|
"learning_rate": 0.000291259296482412, |
|
"loss": 1.5424, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.8213595151901245, |
|
"learning_rate": 0.0002909577889447236, |
|
"loss": 1.487, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 2.4181461334228516, |
|
"learning_rate": 0.000290659296482412, |
|
"loss": 1.5083, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.4696974754333496, |
|
"learning_rate": 0.0002903577889447236, |
|
"loss": 1.5204, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.285097360610962, |
|
"learning_rate": 0.00029005628140703517, |
|
"loss": 1.515, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 2.7307722568511963, |
|
"learning_rate": 0.00028975477386934673, |
|
"loss": 1.5283, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 2.5405428409576416, |
|
"learning_rate": 0.00028945326633165823, |
|
"loss": 1.4657, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 1.4836663007736206, |
|
"eval_runtime": 37.7733, |
|
"eval_samples_per_second": 26.474, |
|
"eval_steps_per_second": 3.309, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 2.2221779823303223, |
|
"learning_rate": 0.00028915175879396985, |
|
"loss": 1.4936, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 2.700119733810425, |
|
"learning_rate": 0.0002888502512562814, |
|
"loss": 1.446, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 2.11588716506958, |
|
"learning_rate": 0.0002885487437185929, |
|
"loss": 1.4789, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 2.144611358642578, |
|
"learning_rate": 0.00028824723618090447, |
|
"loss": 1.4913, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.7891815900802612, |
|
"learning_rate": 0.0002879457286432161, |
|
"loss": 1.4693, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 2.2549595832824707, |
|
"learning_rate": 0.0002876442211055276, |
|
"loss": 1.4957, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 2.4034409523010254, |
|
"learning_rate": 0.00028734271356783915, |
|
"loss": 1.4909, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.4686906337738037, |
|
"learning_rate": 0.0002870412060301507, |
|
"loss": 1.4989, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 2.1314849853515625, |
|
"learning_rate": 0.0002867396984924623, |
|
"loss": 1.4899, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.703493595123291, |
|
"learning_rate": 0.00028643819095477383, |
|
"loss": 1.4897, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_loss": 1.5144654512405396, |
|
"eval_runtime": 38.0015, |
|
"eval_samples_per_second": 26.315, |
|
"eval_steps_per_second": 3.289, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 2.8537943363189697, |
|
"learning_rate": 0.0002861366834170854, |
|
"loss": 1.4702, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.885312557220459, |
|
"learning_rate": 0.00028583517587939695, |
|
"loss": 1.4918, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 2.6149489879608154, |
|
"learning_rate": 0.0002855336683417085, |
|
"loss": 1.4867, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.8222806453704834, |
|
"learning_rate": 0.00028523216080402007, |
|
"loss": 1.4894, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 2.105160713195801, |
|
"learning_rate": 0.0002849306532663316, |
|
"loss": 1.4865, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.9180357456207275, |
|
"learning_rate": 0.0002846291457286432, |
|
"loss": 1.4365, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.4675670862197876, |
|
"learning_rate": 0.00028432763819095474, |
|
"loss": 1.4323, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 3.664919376373291, |
|
"learning_rate": 0.0002840261306532663, |
|
"loss": 1.4605, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.5559368133544922, |
|
"learning_rate": 0.00028372462311557786, |
|
"loss": 1.4799, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 2.0738680362701416, |
|
"learning_rate": 0.0002834261306532663, |
|
"loss": 1.4923, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"eval_loss": 1.4727822542190552, |
|
"eval_runtime": 38.2425, |
|
"eval_samples_per_second": 26.149, |
|
"eval_steps_per_second": 3.269, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.9228754043579102, |
|
"learning_rate": 0.00028312462311557785, |
|
"loss": 1.4127, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 2.0438356399536133, |
|
"learning_rate": 0.0002828231155778894, |
|
"loss": 1.4835, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 2.734626293182373, |
|
"learning_rate": 0.00028252160804020097, |
|
"loss": 1.4489, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 2.1490132808685303, |
|
"learning_rate": 0.0002822201005025125, |
|
"loss": 1.4684, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 2.1819868087768555, |
|
"learning_rate": 0.0002819185929648241, |
|
"loss": 1.4416, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.5763262510299683, |
|
"learning_rate": 0.00028161708542713565, |
|
"loss": 1.4532, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.9584680795669556, |
|
"learning_rate": 0.0002813155778894472, |
|
"loss": 1.4558, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 2.6148059368133545, |
|
"learning_rate": 0.00028101407035175876, |
|
"loss": 1.4588, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.5689460039138794, |
|
"learning_rate": 0.0002807125628140703, |
|
"loss": 1.4352, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 2.145756483078003, |
|
"learning_rate": 0.0002804110552763819, |
|
"loss": 1.4207, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"eval_loss": 1.4386738538742065, |
|
"eval_runtime": 38.107, |
|
"eval_samples_per_second": 26.242, |
|
"eval_steps_per_second": 3.28, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 4.316162586212158, |
|
"learning_rate": 0.00028010954773869344, |
|
"loss": 1.4085, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 2.0866541862487793, |
|
"learning_rate": 0.000279808040201005, |
|
"loss": 1.4634, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 3.0577406883239746, |
|
"learning_rate": 0.00027950653266331656, |
|
"loss": 1.4515, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.723168969154358, |
|
"learning_rate": 0.0002792050251256281, |
|
"loss": 1.4372, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 2.8033313751220703, |
|
"learning_rate": 0.0002789035175879397, |
|
"loss": 1.4844, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 2.051619529724121, |
|
"learning_rate": 0.00027860201005025124, |
|
"loss": 1.4352, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.4199312925338745, |
|
"learning_rate": 0.0002783005025125628, |
|
"loss": 1.4641, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 2.3949058055877686, |
|
"learning_rate": 0.00027799899497487436, |
|
"loss": 1.4592, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 2.8449528217315674, |
|
"learning_rate": 0.0002776974874371859, |
|
"loss": 1.4196, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 3.709972858428955, |
|
"learning_rate": 0.0002773959798994975, |
|
"loss": 1.4375, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 1.4270827770233154, |
|
"eval_runtime": 38.3346, |
|
"eval_samples_per_second": 26.086, |
|
"eval_steps_per_second": 3.261, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.7984100580215454, |
|
"learning_rate": 0.00027709447236180904, |
|
"loss": 1.3943, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 2.1693639755249023, |
|
"learning_rate": 0.00027679597989949746, |
|
"loss": 1.4636, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.8211654424667358, |
|
"learning_rate": 0.000276494472361809, |
|
"loss": 1.4539, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 2.11051869392395, |
|
"learning_rate": 0.0002761929648241206, |
|
"loss": 1.4214, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.5553231239318848, |
|
"learning_rate": 0.00027589145728643214, |
|
"loss": 1.4475, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 2.0080809593200684, |
|
"learning_rate": 0.0002755899497487437, |
|
"loss": 1.4024, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 2.6698598861694336, |
|
"learning_rate": 0.00027528844221105526, |
|
"loss": 1.4159, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 2.2336277961730957, |
|
"learning_rate": 0.0002749869346733668, |
|
"loss": 1.437, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.7006186246871948, |
|
"learning_rate": 0.0002746854271356784, |
|
"loss": 1.4465, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.934051513671875, |
|
"learning_rate": 0.0002743839195979899, |
|
"loss": 1.4319, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"eval_loss": 1.4331704378128052, |
|
"eval_runtime": 37.9595, |
|
"eval_samples_per_second": 26.344, |
|
"eval_steps_per_second": 3.293, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 2.549532890319824, |
|
"learning_rate": 0.0002740824120603015, |
|
"loss": 1.4018, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.9921625852584839, |
|
"learning_rate": 0.00027378090452261306, |
|
"loss": 1.4354, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.5784940719604492, |
|
"learning_rate": 0.0002734793969849246, |
|
"loss": 1.4515, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.9822384119033813, |
|
"learning_rate": 0.0002731778894472361, |
|
"loss": 1.4784, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 3.0514814853668213, |
|
"learning_rate": 0.00027287638190954774, |
|
"loss": 1.4235, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.5947296619415283, |
|
"learning_rate": 0.0002725748743718593, |
|
"loss": 1.4325, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 2.838723659515381, |
|
"learning_rate": 0.0002722733668341708, |
|
"loss": 1.4318, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 2.7525815963745117, |
|
"learning_rate": 0.00027197185929648236, |
|
"loss": 1.4323, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 2.186182975769043, |
|
"learning_rate": 0.000271670351758794, |
|
"loss": 1.4122, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.5111092329025269, |
|
"learning_rate": 0.00027136884422110553, |
|
"loss": 1.4278, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 1.4226535558700562, |
|
"eval_runtime": 37.925, |
|
"eval_samples_per_second": 26.368, |
|
"eval_steps_per_second": 3.296, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.4402307271957397, |
|
"learning_rate": 0.00027106733668341704, |
|
"loss": 1.4775, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 4.803475379943848, |
|
"learning_rate": 0.0002707658291457286, |
|
"loss": 1.4434, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 2.159541606903076, |
|
"learning_rate": 0.0002704643216080402, |
|
"loss": 1.4505, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.613765835762024, |
|
"learning_rate": 0.0002701658291457286, |
|
"loss": 1.4336, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 3.0653555393218994, |
|
"learning_rate": 0.0002698643216080402, |
|
"loss": 1.4238, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 2.0688183307647705, |
|
"learning_rate": 0.00026956281407035176, |
|
"loss": 1.4048, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 2.271068572998047, |
|
"learning_rate": 0.0002692613065326633, |
|
"loss": 1.4412, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.7365072965621948, |
|
"learning_rate": 0.0002689597989949748, |
|
"loss": 1.3864, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.7095474004745483, |
|
"learning_rate": 0.00026865829145728643, |
|
"loss": 1.4509, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 2.595015287399292, |
|
"learning_rate": 0.000268356783919598, |
|
"loss": 1.4068, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"eval_loss": 1.4620698690414429, |
|
"eval_runtime": 37.8254, |
|
"eval_samples_per_second": 26.437, |
|
"eval_steps_per_second": 3.305, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.6796025037765503, |
|
"learning_rate": 0.0002680552763819095, |
|
"loss": 1.4059, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 2.259477376937866, |
|
"learning_rate": 0.00026775376884422106, |
|
"loss": 1.4112, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 4.8005051612854, |
|
"learning_rate": 0.00026745226130653267, |
|
"loss": 1.367, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 2.824021577835083, |
|
"learning_rate": 0.00026715075376884423, |
|
"loss": 1.4156, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 2.4818904399871826, |
|
"learning_rate": 0.00026684924623115574, |
|
"loss": 1.3846, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 2.6064958572387695, |
|
"learning_rate": 0.0002665477386934673, |
|
"loss": 1.4062, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.8354562520980835, |
|
"learning_rate": 0.00026624623115577886, |
|
"loss": 1.3761, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 3.094172477722168, |
|
"learning_rate": 0.0002659447236180904, |
|
"loss": 1.3576, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 2.000718832015991, |
|
"learning_rate": 0.000265643216080402, |
|
"loss": 1.401, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 2.301866054534912, |
|
"learning_rate": 0.00026534170854271353, |
|
"loss": 1.4267, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"eval_loss": 1.4072773456573486, |
|
"eval_runtime": 37.8474, |
|
"eval_samples_per_second": 26.422, |
|
"eval_steps_per_second": 3.303, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.8116004467010498, |
|
"learning_rate": 0.0002650402010050251, |
|
"loss": 1.4141, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.7951298952102661, |
|
"learning_rate": 0.00026473869346733665, |
|
"loss": 1.4006, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.9248169660568237, |
|
"learning_rate": 0.0002644371859296482, |
|
"loss": 1.4143, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 3.0492172241210938, |
|
"learning_rate": 0.00026413567839195977, |
|
"loss": 1.3808, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.3698550462722778, |
|
"learning_rate": 0.00026383417085427133, |
|
"loss": 1.339, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 2.8333966732025146, |
|
"learning_rate": 0.0002635326633165829, |
|
"loss": 1.3977, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 2.5511767864227295, |
|
"learning_rate": 0.0002632341708542713, |
|
"loss": 1.4027, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.912987470626831, |
|
"learning_rate": 0.0002629326633165829, |
|
"loss": 1.4062, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.8692814111709595, |
|
"learning_rate": 0.00026263115577889444, |
|
"loss": 1.3901, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 2.620612859725952, |
|
"learning_rate": 0.000262329648241206, |
|
"loss": 1.3992, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"eval_loss": 1.3693994283676147, |
|
"eval_runtime": 38.004, |
|
"eval_samples_per_second": 26.313, |
|
"eval_steps_per_second": 3.289, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 3.1771810054779053, |
|
"learning_rate": 0.00026202814070351756, |
|
"loss": 1.3733, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 2.4650421142578125, |
|
"learning_rate": 0.0002617266331658291, |
|
"loss": 1.399, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 2.9789535999298096, |
|
"learning_rate": 0.0002614251256281407, |
|
"loss": 1.4291, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.4404784440994263, |
|
"learning_rate": 0.00026112361809045223, |
|
"loss": 1.3833, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 2.0667450428009033, |
|
"learning_rate": 0.0002608221105527638, |
|
"loss": 1.3884, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 2.014460563659668, |
|
"learning_rate": 0.00026052060301507535, |
|
"loss": 1.3819, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 2.360121965408325, |
|
"learning_rate": 0.0002602190954773869, |
|
"loss": 1.3695, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.6982303857803345, |
|
"learning_rate": 0.00025991758793969847, |
|
"loss": 1.3864, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 2.2350399494171143, |
|
"learning_rate": 0.00025961608040201003, |
|
"loss": 1.4096, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.4647042751312256, |
|
"learning_rate": 0.0002593145728643216, |
|
"loss": 1.3915, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"eval_loss": 1.3878337144851685, |
|
"eval_runtime": 37.7254, |
|
"eval_samples_per_second": 26.507, |
|
"eval_steps_per_second": 3.313, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 2.002542734146118, |
|
"learning_rate": 0.00025901306532663315, |
|
"loss": 1.4214, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.9857007265090942, |
|
"learning_rate": 0.0002587115577889447, |
|
"loss": 1.3636, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 2.4016737937927246, |
|
"learning_rate": 0.00025841005025125627, |
|
"loss": 1.4259, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 3.929931879043579, |
|
"learning_rate": 0.0002581085427135678, |
|
"loss": 1.3937, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.6266632080078125, |
|
"learning_rate": 0.0002578070351758794, |
|
"loss": 1.3678, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 2.905378580093384, |
|
"learning_rate": 0.00025750552763819095, |
|
"loss": 1.3526, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 2.535842180252075, |
|
"learning_rate": 0.0002572040201005025, |
|
"loss": 1.4062, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.5988209247589111, |
|
"learning_rate": 0.000256902512562814, |
|
"loss": 1.3915, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.5643303394317627, |
|
"learning_rate": 0.0002566010050251256, |
|
"loss": 1.3783, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.4297415018081665, |
|
"learning_rate": 0.0002562994974874372, |
|
"loss": 1.3782, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_loss": 1.405114769935608, |
|
"eval_runtime": 37.9898, |
|
"eval_samples_per_second": 26.323, |
|
"eval_steps_per_second": 3.29, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.6650172472000122, |
|
"learning_rate": 0.0002559979899497487, |
|
"loss": 1.3387, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 2.118579864501953, |
|
"learning_rate": 0.00025569648241206025, |
|
"loss": 1.393, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.74748694896698, |
|
"learning_rate": 0.00025539497487437186, |
|
"loss": 1.3353, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.794631004333496, |
|
"learning_rate": 0.0002550934673366834, |
|
"loss": 1.3942, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 2.7065675258636475, |
|
"learning_rate": 0.00025479195979899493, |
|
"loss": 1.3962, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 3.389014720916748, |
|
"learning_rate": 0.0002544904522613065, |
|
"loss": 1.3758, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.534252405166626, |
|
"learning_rate": 0.0002541889447236181, |
|
"loss": 1.3526, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.7374197244644165, |
|
"learning_rate": 0.0002538874371859296, |
|
"loss": 1.3577, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 3.1230342388153076, |
|
"learning_rate": 0.00025358592964824117, |
|
"loss": 1.3548, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 3.261570692062378, |
|
"learning_rate": 0.0002532844221105527, |
|
"loss": 1.3932, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 1.3275749683380127, |
|
"eval_runtime": 37.9493, |
|
"eval_samples_per_second": 26.351, |
|
"eval_steps_per_second": 3.294, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 3.0108933448791504, |
|
"learning_rate": 0.00025298291457286434, |
|
"loss": 1.3445, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 3.536722421646118, |
|
"learning_rate": 0.00025268140703517584, |
|
"loss": 1.364, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.637465238571167, |
|
"learning_rate": 0.0002523829145728643, |
|
"loss": 1.376, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 2.8907904624938965, |
|
"learning_rate": 0.0002520814070351759, |
|
"loss": 1.3623, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 2.4385364055633545, |
|
"learning_rate": 0.0002517798994974874, |
|
"loss": 1.318, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.9113733768463135, |
|
"learning_rate": 0.00025147839195979895, |
|
"loss": 1.3906, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 5.8118414878845215, |
|
"learning_rate": 0.00025117688442211056, |
|
"loss": 1.3336, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.9629586935043335, |
|
"learning_rate": 0.0002508753768844221, |
|
"loss": 1.3959, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 2.0420243740081787, |
|
"learning_rate": 0.0002505738693467336, |
|
"loss": 1.3523, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 2.0758414268493652, |
|
"learning_rate": 0.0002502723618090452, |
|
"loss": 1.3747, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"eval_loss": 1.3606867790222168, |
|
"eval_runtime": 37.9681, |
|
"eval_samples_per_second": 26.338, |
|
"eval_steps_per_second": 3.292, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 2.486980438232422, |
|
"learning_rate": 0.00024997085427135675, |
|
"loss": 1.3402, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 2.211982250213623, |
|
"learning_rate": 0.0002496693467336683, |
|
"loss": 1.3419, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 2.3362228870391846, |
|
"learning_rate": 0.00024936783919597986, |
|
"loss": 1.3748, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.515100121498108, |
|
"learning_rate": 0.0002490663316582914, |
|
"loss": 1.3747, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 2.1747968196868896, |
|
"learning_rate": 0.000248764824120603, |
|
"loss": 1.3458, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 2.6045758724212646, |
|
"learning_rate": 0.00024846331658291454, |
|
"loss": 1.3623, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.5456433296203613, |
|
"learning_rate": 0.0002481618090452261, |
|
"loss": 1.3107, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.5310312509536743, |
|
"learning_rate": 0.00024786030150753766, |
|
"loss": 1.3541, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 3.2094223499298096, |
|
"learning_rate": 0.0002475587939698492, |
|
"loss": 1.3445, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 2.7595880031585693, |
|
"learning_rate": 0.0002472572864321608, |
|
"loss": 1.3537, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"eval_loss": 1.3503804206848145, |
|
"eval_runtime": 37.8049, |
|
"eval_samples_per_second": 26.452, |
|
"eval_steps_per_second": 3.306, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 5.4382781982421875, |
|
"learning_rate": 0.00024695577889447234, |
|
"loss": 1.3584, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 2.7903175354003906, |
|
"learning_rate": 0.0002466542713567839, |
|
"loss": 1.3272, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.6171114444732666, |
|
"learning_rate": 0.00024635276381909546, |
|
"loss": 1.3601, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 2.9426279067993164, |
|
"learning_rate": 0.000246051256281407, |
|
"loss": 1.3782, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 2.36596941947937, |
|
"learning_rate": 0.0002457497487437186, |
|
"loss": 1.3307, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.3205448389053345, |
|
"learning_rate": 0.00024544824120603014, |
|
"loss": 1.3929, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.9464951753616333, |
|
"learning_rate": 0.0002451467336683417, |
|
"loss": 1.3415, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.7700294256210327, |
|
"learning_rate": 0.00024484522613065326, |
|
"loss": 1.3473, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 2.687060832977295, |
|
"learning_rate": 0.0002445437185929648, |
|
"loss": 1.3606, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 2.02754282951355, |
|
"learning_rate": 0.0002442422110552764, |
|
"loss": 1.3799, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"eval_loss": 1.365315556526184, |
|
"eval_runtime": 37.6707, |
|
"eval_samples_per_second": 26.546, |
|
"eval_steps_per_second": 3.318, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 2.187087059020996, |
|
"learning_rate": 0.0002439407035175879, |
|
"loss": 1.3585, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 3.8181040287017822, |
|
"learning_rate": 0.00024363919597989947, |
|
"loss": 1.3723, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.6949020624160767, |
|
"learning_rate": 0.00024333768844221105, |
|
"loss": 1.3074, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 2.716754913330078, |
|
"learning_rate": 0.00024303618090452259, |
|
"loss": 1.3589, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.5216838121414185, |
|
"learning_rate": 0.00024273467336683415, |
|
"loss": 1.3398, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.7370058298110962, |
|
"learning_rate": 0.0002424331658291457, |
|
"loss": 1.3546, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 2.0907745361328125, |
|
"learning_rate": 0.00024213165829145726, |
|
"loss": 1.3161, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 2.9564626216888428, |
|
"learning_rate": 0.00024183015075376882, |
|
"loss": 1.3623, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 2.6082723140716553, |
|
"learning_rate": 0.00024152864321608038, |
|
"loss": 1.3158, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.0046592950820923, |
|
"learning_rate": 0.00024122713567839192, |
|
"loss": 1.3366, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.3484834432601929, |
|
"eval_runtime": 37.9475, |
|
"eval_samples_per_second": 26.352, |
|
"eval_steps_per_second": 3.294, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 2.5935070514678955, |
|
"learning_rate": 0.0002409256281407035, |
|
"loss": 1.3512, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 3.790050506591797, |
|
"learning_rate": 0.00024062412060301506, |
|
"loss": 1.3272, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 1.3440461158752441, |
|
"learning_rate": 0.00024032562814070351, |
|
"loss": 1.333, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 6.51857852935791, |
|
"learning_rate": 0.00024002412060301505, |
|
"loss": 1.3334, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 1.882919192314148, |
|
"learning_rate": 0.0002397226130653266, |
|
"loss": 1.3241, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 1.361558198928833, |
|
"learning_rate": 0.00023942110552763817, |
|
"loss": 1.3207, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 2.0967071056365967, |
|
"learning_rate": 0.00023911959798994975, |
|
"loss": 1.2993, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 2.2517688274383545, |
|
"learning_rate": 0.00023881809045226128, |
|
"loss": 1.3353, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 7.7647480964660645, |
|
"learning_rate": 0.00023851658291457284, |
|
"loss": 1.3326, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 2.0270638465881348, |
|
"learning_rate": 0.0002382180904522613, |
|
"loss": 1.3046, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"eval_loss": 1.3456777334213257, |
|
"eval_runtime": 38.0868, |
|
"eval_samples_per_second": 26.256, |
|
"eval_steps_per_second": 3.282, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 1.9642785787582397, |
|
"learning_rate": 0.00023791658291457283, |
|
"loss": 1.3131, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 2.517357587814331, |
|
"learning_rate": 0.0002376150753768844, |
|
"loss": 1.3627, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 1.4660860300064087, |
|
"learning_rate": 0.00023731356783919598, |
|
"loss": 1.2805, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 3.102552652359009, |
|
"learning_rate": 0.00023701206030150753, |
|
"loss": 1.339, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 2.017504930496216, |
|
"learning_rate": 0.00023671055276381907, |
|
"loss": 1.3307, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 1.4260824918746948, |
|
"learning_rate": 0.00023640904522613063, |
|
"loss": 1.3216, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 4.0052361488342285, |
|
"learning_rate": 0.0002361075376884422, |
|
"loss": 1.3544, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 3.664625883102417, |
|
"learning_rate": 0.00023580603015075375, |
|
"loss": 1.3508, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 2.1044421195983887, |
|
"learning_rate": 0.0002355045226130653, |
|
"loss": 1.3205, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 1.6608549356460571, |
|
"learning_rate": 0.00023520301507537686, |
|
"loss": 1.3373, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"eval_loss": 1.319154977798462, |
|
"eval_runtime": 37.7789, |
|
"eval_samples_per_second": 26.47, |
|
"eval_steps_per_second": 3.309, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 2.131612777709961, |
|
"learning_rate": 0.00023490150753768845, |
|
"loss": 1.3244, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 2.0854969024658203, |
|
"learning_rate": 0.00023459999999999998, |
|
"loss": 1.3357, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 2.3622310161590576, |
|
"learning_rate": 0.0002343075376884422, |
|
"loss": 1.4118, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 2.5198066234588623, |
|
"learning_rate": 0.00023400603015075376, |
|
"loss": 1.319, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 2.4654555320739746, |
|
"learning_rate": 0.00023370452261306532, |
|
"loss": 1.3055, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 2.53120756149292, |
|
"learning_rate": 0.00023340301507537685, |
|
"loss": 1.3763, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 2.199324131011963, |
|
"learning_rate": 0.00023310150753768843, |
|
"loss": 1.3148, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 2.951871633529663, |
|
"learning_rate": 0.0002328, |
|
"loss": 1.3234, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 2.5513529777526855, |
|
"learning_rate": 0.00023249849246231153, |
|
"loss": 1.302, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 5.096097469329834, |
|
"learning_rate": 0.00023219698492462309, |
|
"loss": 1.3102, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"eval_loss": 1.3704819679260254, |
|
"eval_runtime": 37.8283, |
|
"eval_samples_per_second": 26.435, |
|
"eval_steps_per_second": 3.304, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 1.3565678596496582, |
|
"learning_rate": 0.00023189547738693467, |
|
"loss": 1.3182, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 3.1972274780273438, |
|
"learning_rate": 0.00023159396984924623, |
|
"loss": 1.316, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 2.4728245735168457, |
|
"learning_rate": 0.00023129246231155776, |
|
"loss": 1.2934, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 1.917893648147583, |
|
"learning_rate": 0.00023099095477386932, |
|
"loss": 1.3241, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 2.30876088142395, |
|
"learning_rate": 0.00023068944723618086, |
|
"loss": 1.3031, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 2.5653178691864014, |
|
"learning_rate": 0.00023038793969849244, |
|
"loss": 1.2819, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 3.500821352005005, |
|
"learning_rate": 0.000230086432160804, |
|
"loss": 1.2829, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 1.6564580202102661, |
|
"learning_rate": 0.00022978492462311556, |
|
"loss": 1.3209, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 2.6477315425872803, |
|
"learning_rate": 0.0002294834170854271, |
|
"loss": 1.2991, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 2.9583780765533447, |
|
"learning_rate": 0.00022918190954773868, |
|
"loss": 1.3011, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"eval_loss": 1.3160556554794312, |
|
"eval_runtime": 37.7643, |
|
"eval_samples_per_second": 26.48, |
|
"eval_steps_per_second": 3.31, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 2.3997368812561035, |
|
"learning_rate": 0.00022888040201005024, |
|
"loss": 1.2866, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 2.5909266471862793, |
|
"learning_rate": 0.00022857889447236177, |
|
"loss": 1.3133, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 1.9457557201385498, |
|
"learning_rate": 0.00022827738693467333, |
|
"loss": 1.2716, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 2.85856032371521, |
|
"learning_rate": 0.00022797587939698492, |
|
"loss": 1.2932, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 3.180671215057373, |
|
"learning_rate": 0.00022767437185929648, |
|
"loss": 1.317, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 1.630612850189209, |
|
"learning_rate": 0.000227372864321608, |
|
"loss": 1.3176, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 2.159804582595825, |
|
"learning_rate": 0.00022707135678391957, |
|
"loss": 1.3288, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 1.314036250114441, |
|
"learning_rate": 0.00022676984924623116, |
|
"loss": 1.3157, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 2.718198776245117, |
|
"learning_rate": 0.0002264683417085427, |
|
"loss": 1.2915, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 2.3423640727996826, |
|
"learning_rate": 0.00022616683417085425, |
|
"loss": 1.2976, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"eval_loss": 1.3594353199005127, |
|
"eval_runtime": 37.7829, |
|
"eval_samples_per_second": 26.467, |
|
"eval_steps_per_second": 3.308, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 2.3341753482818604, |
|
"learning_rate": 0.0002258653266331658, |
|
"loss": 1.322, |
|
"step": 25100 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 2.0798075199127197, |
|
"learning_rate": 0.0002255638190954774, |
|
"loss": 1.3182, |
|
"step": 25200 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 1.5256847143173218, |
|
"learning_rate": 0.00022526231155778893, |
|
"loss": 1.3102, |
|
"step": 25300 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 2.4831185340881348, |
|
"learning_rate": 0.00022496080402010049, |
|
"loss": 1.3183, |
|
"step": 25400 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 9.853681564331055, |
|
"learning_rate": 0.00022465929648241204, |
|
"loss": 1.2963, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 2.833552837371826, |
|
"learning_rate": 0.00022435778894472358, |
|
"loss": 1.3226, |
|
"step": 25600 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 2.7486400604248047, |
|
"learning_rate": 0.00022405628140703516, |
|
"loss": 1.2742, |
|
"step": 25700 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 1.3708908557891846, |
|
"learning_rate": 0.00022375477386934672, |
|
"loss": 1.2878, |
|
"step": 25800 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 3.6677916049957275, |
|
"learning_rate": 0.00022345326633165826, |
|
"loss": 1.3113, |
|
"step": 25900 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 2.7909395694732666, |
|
"learning_rate": 0.00022315175879396981, |
|
"loss": 1.3221, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"eval_loss": 1.313453197479248, |
|
"eval_runtime": 37.7782, |
|
"eval_samples_per_second": 26.47, |
|
"eval_steps_per_second": 3.309, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 2.592221736907959, |
|
"learning_rate": 0.0002228502512562814, |
|
"loss": 1.2918, |
|
"step": 26100 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 2.911118984222412, |
|
"learning_rate": 0.00022254874371859296, |
|
"loss": 1.3392, |
|
"step": 26200 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 2.15328049659729, |
|
"learning_rate": 0.0002222472361809045, |
|
"loss": 1.261, |
|
"step": 26300 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 3.0731029510498047, |
|
"learning_rate": 0.00022194572864321605, |
|
"loss": 1.289, |
|
"step": 26400 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 3.032560348510742, |
|
"learning_rate": 0.00022164422110552764, |
|
"loss": 1.3186, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 5.388736724853516, |
|
"learning_rate": 0.00022134271356783917, |
|
"loss": 1.3214, |
|
"step": 26600 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 2.6400022506713867, |
|
"learning_rate": 0.00022104120603015073, |
|
"loss": 1.2936, |
|
"step": 26700 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 3.9355711936950684, |
|
"learning_rate": 0.0002207396984924623, |
|
"loss": 1.3039, |
|
"step": 26800 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 1.6818647384643555, |
|
"learning_rate": 0.00022043819095477388, |
|
"loss": 1.2992, |
|
"step": 26900 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 2.2356157302856445, |
|
"learning_rate": 0.0002201366834170854, |
|
"loss": 1.3011, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"eval_loss": 1.3157364130020142, |
|
"eval_runtime": 37.9238, |
|
"eval_samples_per_second": 26.369, |
|
"eval_steps_per_second": 3.296, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 2.158803701400757, |
|
"learning_rate": 0.00021983517587939697, |
|
"loss": 1.308, |
|
"step": 27100 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 1.4748259782791138, |
|
"learning_rate": 0.0002195336683417085, |
|
"loss": 1.2873, |
|
"step": 27200 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 2.382047653198242, |
|
"learning_rate": 0.0002192321608040201, |
|
"loss": 1.2795, |
|
"step": 27300 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 1.8785953521728516, |
|
"learning_rate": 0.00021893065326633165, |
|
"loss": 1.3101, |
|
"step": 27400 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 2.4842770099639893, |
|
"learning_rate": 0.0002186291457286432, |
|
"loss": 1.3124, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 1.7258535623550415, |
|
"learning_rate": 0.00021832763819095474, |
|
"loss": 1.3315, |
|
"step": 27600 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 2.157860517501831, |
|
"learning_rate": 0.00021802613065326633, |
|
"loss": 1.2848, |
|
"step": 27700 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 3.1965837478637695, |
|
"learning_rate": 0.00021772462311557788, |
|
"loss": 1.3105, |
|
"step": 27800 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 3.141603708267212, |
|
"learning_rate": 0.00021742311557788942, |
|
"loss": 1.3197, |
|
"step": 27900 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 2.0368692874908447, |
|
"learning_rate": 0.00021712160804020098, |
|
"loss": 1.3113, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"eval_loss": 1.3079107999801636, |
|
"eval_runtime": 37.8037, |
|
"eval_samples_per_second": 26.452, |
|
"eval_steps_per_second": 3.307, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 3.013373851776123, |
|
"learning_rate": 0.00021682010050251254, |
|
"loss": 1.2892, |
|
"step": 28100 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 2.766491651535034, |
|
"learning_rate": 0.00021651859296482412, |
|
"loss": 1.3414, |
|
"step": 28200 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 1.6288301944732666, |
|
"learning_rate": 0.00021621708542713566, |
|
"loss": 1.3156, |
|
"step": 28300 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 2.3904545307159424, |
|
"learning_rate": 0.00021591557788944721, |
|
"loss": 1.2905, |
|
"step": 28400 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 2.263744831085205, |
|
"learning_rate": 0.00021561407035175877, |
|
"loss": 1.2961, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 1.985129714012146, |
|
"learning_rate": 0.00021531256281407033, |
|
"loss": 1.2703, |
|
"step": 28600 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 2.4574270248413086, |
|
"learning_rate": 0.0002150110552763819, |
|
"loss": 1.2793, |
|
"step": 28700 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 2.312525510787964, |
|
"learning_rate": 0.00021470954773869345, |
|
"loss": 1.2669, |
|
"step": 28800 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 1.5253132581710815, |
|
"learning_rate": 0.00021440804020100498, |
|
"loss": 1.3187, |
|
"step": 28900 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 1.7550122737884521, |
|
"learning_rate": 0.00021410653266331657, |
|
"loss": 1.3154, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"eval_loss": 1.2937275171279907, |
|
"eval_runtime": 37.9639, |
|
"eval_samples_per_second": 26.341, |
|
"eval_steps_per_second": 3.293, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 2.492000102996826, |
|
"learning_rate": 0.00021380502512562813, |
|
"loss": 1.2868, |
|
"step": 29100 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 4.013311862945557, |
|
"learning_rate": 0.00021350351758793966, |
|
"loss": 1.2578, |
|
"step": 29200 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 3.991748809814453, |
|
"learning_rate": 0.00021320201005025122, |
|
"loss": 1.3347, |
|
"step": 29300 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 4.655180931091309, |
|
"learning_rate": 0.0002129005025125628, |
|
"loss": 1.2935, |
|
"step": 29400 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 1.9497921466827393, |
|
"learning_rate": 0.00021259899497487437, |
|
"loss": 1.248, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 3.372061252593994, |
|
"learning_rate": 0.0002122974874371859, |
|
"loss": 1.2877, |
|
"step": 29600 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 2.1920547485351562, |
|
"learning_rate": 0.00021199597989949746, |
|
"loss": 1.2407, |
|
"step": 29700 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 3.5231897830963135, |
|
"learning_rate": 0.0002116974874371859, |
|
"loss": 1.2296, |
|
"step": 29800 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 4.537712097167969, |
|
"learning_rate": 0.00021139597989949745, |
|
"loss": 1.2704, |
|
"step": 29900 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 3.12864351272583, |
|
"learning_rate": 0.00021109447236180903, |
|
"loss": 1.3093, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"eval_loss": 1.2697720527648926, |
|
"eval_runtime": 37.8104, |
|
"eval_samples_per_second": 26.448, |
|
"eval_steps_per_second": 3.306, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 1.9532142877578735, |
|
"learning_rate": 0.0002107929648241206, |
|
"loss": 1.2892, |
|
"step": 30100 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 1.9121806621551514, |
|
"learning_rate": 0.00021049145728643215, |
|
"loss": 1.282, |
|
"step": 30200 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 1.2597557306289673, |
|
"learning_rate": 0.00021018994974874368, |
|
"loss": 1.2793, |
|
"step": 30300 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 1.7637083530426025, |
|
"learning_rate": 0.00020988844221105527, |
|
"loss": 1.3253, |
|
"step": 30400 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 3.788984775543213, |
|
"learning_rate": 0.00020958693467336683, |
|
"loss": 1.249, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 3.1422038078308105, |
|
"learning_rate": 0.00020928542713567836, |
|
"loss": 1.2429, |
|
"step": 30600 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 1.995868444442749, |
|
"learning_rate": 0.00020898391959798992, |
|
"loss": 1.2827, |
|
"step": 30700 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 2.3635036945343018, |
|
"learning_rate": 0.00020868241206030148, |
|
"loss": 1.2653, |
|
"step": 30800 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 2.0892832279205322, |
|
"learning_rate": 0.00020838090452261307, |
|
"loss": 1.2814, |
|
"step": 30900 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 2.8766140937805176, |
|
"learning_rate": 0.0002080793969849246, |
|
"loss": 1.2809, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"eval_loss": 1.2703502178192139, |
|
"eval_runtime": 37.818, |
|
"eval_samples_per_second": 26.442, |
|
"eval_steps_per_second": 3.305, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 2.5487587451934814, |
|
"learning_rate": 0.00020777788944723616, |
|
"loss": 1.2811, |
|
"step": 31100 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 2.325295925140381, |
|
"learning_rate": 0.00020747638190954772, |
|
"loss": 1.2769, |
|
"step": 31200 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 1.741773009300232, |
|
"learning_rate": 0.00020717487437185928, |
|
"loss": 1.2741, |
|
"step": 31300 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 5.916422367095947, |
|
"learning_rate": 0.00020687336683417084, |
|
"loss": 1.2567, |
|
"step": 31400 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 2.166018009185791, |
|
"learning_rate": 0.0002065718592964824, |
|
"loss": 1.2491, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 1.7622108459472656, |
|
"learning_rate": 0.00020627035175879393, |
|
"loss": 1.2815, |
|
"step": 31600 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 2.2861111164093018, |
|
"learning_rate": 0.00020596884422110552, |
|
"loss": 1.2485, |
|
"step": 31700 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 2.8738324642181396, |
|
"learning_rate": 0.00020566733668341708, |
|
"loss": 1.2747, |
|
"step": 31800 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 1.920782208442688, |
|
"learning_rate": 0.00020536582914572863, |
|
"loss": 1.3094, |
|
"step": 31900 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 2.591792345046997, |
|
"learning_rate": 0.00020506432160804017, |
|
"loss": 1.3178, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"eval_loss": 1.2383744716644287, |
|
"eval_runtime": 37.8786, |
|
"eval_samples_per_second": 26.4, |
|
"eval_steps_per_second": 3.3, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 3.4940438270568848, |
|
"learning_rate": 0.00020476281407035175, |
|
"loss": 1.2755, |
|
"step": 32100 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 2.377112627029419, |
|
"learning_rate": 0.0002044613065326633, |
|
"loss": 1.2667, |
|
"step": 32200 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 2.5229716300964355, |
|
"learning_rate": 0.00020415979899497485, |
|
"loss": 1.2695, |
|
"step": 32300 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 2.469883441925049, |
|
"learning_rate": 0.0002038582914572864, |
|
"loss": 1.3089, |
|
"step": 32400 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 1.9299498796463013, |
|
"learning_rate": 0.000203556783919598, |
|
"loss": 1.2835, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 2.486790895462036, |
|
"learning_rate": 0.00020325527638190955, |
|
"loss": 1.2531, |
|
"step": 32600 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 3.485691785812378, |
|
"learning_rate": 0.00020295376884422108, |
|
"loss": 1.2568, |
|
"step": 32700 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 1.674727201461792, |
|
"learning_rate": 0.00020265226130653264, |
|
"loss": 1.2739, |
|
"step": 32800 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 4.50739049911499, |
|
"learning_rate": 0.00020235075376884417, |
|
"loss": 1.211, |
|
"step": 32900 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 11.218056678771973, |
|
"learning_rate": 0.00020204924623115576, |
|
"loss": 1.2891, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"eval_loss": 1.2705625295639038, |
|
"eval_runtime": 37.8291, |
|
"eval_samples_per_second": 26.435, |
|
"eval_steps_per_second": 3.304, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 1.9991952180862427, |
|
"learning_rate": 0.00020174773869346732, |
|
"loss": 1.2636, |
|
"step": 33100 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 3.0366969108581543, |
|
"learning_rate": 0.00020144623115577888, |
|
"loss": 1.2903, |
|
"step": 33200 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 1.7985395193099976, |
|
"learning_rate": 0.0002011447236180904, |
|
"loss": 1.2437, |
|
"step": 33300 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 3.8208954334259033, |
|
"learning_rate": 0.000200843216080402, |
|
"loss": 1.244, |
|
"step": 33400 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 3.2836215496063232, |
|
"learning_rate": 0.00020054170854271356, |
|
"loss": 1.2837, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 3.15663480758667, |
|
"learning_rate": 0.0002002402010050251, |
|
"loss": 1.2253, |
|
"step": 33600 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 1.6871391534805298, |
|
"learning_rate": 0.00019993869346733665, |
|
"loss": 1.2564, |
|
"step": 33700 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 2.3701913356781006, |
|
"learning_rate": 0.00019963718592964824, |
|
"loss": 1.2925, |
|
"step": 33800 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 2.9534804821014404, |
|
"learning_rate": 0.0001993356783919598, |
|
"loss": 1.2613, |
|
"step": 33900 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 2.273113489151001, |
|
"learning_rate": 0.00019903417085427133, |
|
"loss": 1.29, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"eval_loss": 1.2713490724563599, |
|
"eval_runtime": 37.9786, |
|
"eval_samples_per_second": 26.331, |
|
"eval_steps_per_second": 3.291, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 2.1708054542541504, |
|
"learning_rate": 0.0001987326633165829, |
|
"loss": 1.2775, |
|
"step": 34100 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 2.242708683013916, |
|
"learning_rate": 0.00019843115577889447, |
|
"loss": 1.2561, |
|
"step": 34200 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 2.0170931816101074, |
|
"learning_rate": 0.000198129648241206, |
|
"loss": 1.2168, |
|
"step": 34300 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 2.094848871231079, |
|
"learning_rate": 0.00019782814070351757, |
|
"loss": 1.2588, |
|
"step": 34400 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 2.1762752532958984, |
|
"learning_rate": 0.00019752663316582913, |
|
"loss": 1.1837, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 3.1318016052246094, |
|
"learning_rate": 0.0001972251256281407, |
|
"loss": 1.2196, |
|
"step": 34600 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 3.2971861362457275, |
|
"learning_rate": 0.00019692361809045225, |
|
"loss": 1.2778, |
|
"step": 34700 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 3.452091693878174, |
|
"learning_rate": 0.0001966221105527638, |
|
"loss": 1.2385, |
|
"step": 34800 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 1.7514299154281616, |
|
"learning_rate": 0.00019632060301507536, |
|
"loss": 1.2769, |
|
"step": 34900 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 2.3494088649749756, |
|
"learning_rate": 0.00019601909547738692, |
|
"loss": 1.2689, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"eval_loss": 1.2675199508666992, |
|
"eval_runtime": 37.8879, |
|
"eval_samples_per_second": 26.394, |
|
"eval_steps_per_second": 3.299, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 1.5741009712219238, |
|
"learning_rate": 0.00019571758793969848, |
|
"loss": 1.2352, |
|
"step": 35100 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 2.652435302734375, |
|
"learning_rate": 0.00019541608040201004, |
|
"loss": 1.2824, |
|
"step": 35200 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 2.9557676315307617, |
|
"learning_rate": 0.00019511457286432157, |
|
"loss": 1.2453, |
|
"step": 35300 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 2.8758041858673096, |
|
"learning_rate": 0.00019481306532663313, |
|
"loss": 1.2507, |
|
"step": 35400 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 2.5828402042388916, |
|
"learning_rate": 0.0001945145728643216, |
|
"loss": 1.2201, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 2.887206554412842, |
|
"learning_rate": 0.00019421306532663312, |
|
"loss": 1.2754, |
|
"step": 35600 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 2.5521140098571777, |
|
"learning_rate": 0.0001939115577889447, |
|
"loss": 1.234, |
|
"step": 35700 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 1.9570846557617188, |
|
"learning_rate": 0.00019361005025125627, |
|
"loss": 1.2708, |
|
"step": 35800 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 2.89273738861084, |
|
"learning_rate": 0.00019330854271356782, |
|
"loss": 1.2343, |
|
"step": 35900 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 3.624706506729126, |
|
"learning_rate": 0.00019300703517587936, |
|
"loss": 1.2576, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"eval_loss": 1.2644726037979126, |
|
"eval_runtime": 37.8527, |
|
"eval_samples_per_second": 26.418, |
|
"eval_steps_per_second": 3.302, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 2.5976133346557617, |
|
"learning_rate": 0.00019270552763819094, |
|
"loss": 1.2812, |
|
"step": 36100 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 2.899306297302246, |
|
"learning_rate": 0.0001924040201005025, |
|
"loss": 1.2541, |
|
"step": 36200 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 3.964782476425171, |
|
"learning_rate": 0.00019210251256281404, |
|
"loss": 1.2639, |
|
"step": 36300 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 2.4634933471679688, |
|
"learning_rate": 0.0001918010050251256, |
|
"loss": 1.2089, |
|
"step": 36400 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 2.6023619174957275, |
|
"learning_rate": 0.00019149949748743718, |
|
"loss": 1.2612, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 3.0462849140167236, |
|
"learning_rate": 0.00019119798994974874, |
|
"loss": 1.2204, |
|
"step": 36600 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 2.1344144344329834, |
|
"learning_rate": 0.00019089648241206027, |
|
"loss": 1.2142, |
|
"step": 36700 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 1.5994189977645874, |
|
"learning_rate": 0.00019059497487437183, |
|
"loss": 1.2586, |
|
"step": 36800 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 1.357469916343689, |
|
"learning_rate": 0.00019029346733668342, |
|
"loss": 1.2705, |
|
"step": 36900 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 2.4201526641845703, |
|
"learning_rate": 0.00018999195979899495, |
|
"loss": 1.2409, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"eval_loss": 1.2103183269500732, |
|
"eval_runtime": 37.8707, |
|
"eval_samples_per_second": 26.406, |
|
"eval_steps_per_second": 3.301, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 3.1790504455566406, |
|
"learning_rate": 0.0001896904522613065, |
|
"loss": 1.2639, |
|
"step": 37100 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 2.565474033355713, |
|
"learning_rate": 0.00018938894472361807, |
|
"loss": 1.2853, |
|
"step": 37200 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 2.6977927684783936, |
|
"learning_rate": 0.00018908743718592966, |
|
"loss": 1.2178, |
|
"step": 37300 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 2.588975191116333, |
|
"learning_rate": 0.0001887859296482412, |
|
"loss": 1.2492, |
|
"step": 37400 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 2.23592209815979, |
|
"learning_rate": 0.00018848442211055275, |
|
"loss": 1.2273, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 2.0961692333221436, |
|
"learning_rate": 0.0001881859296482412, |
|
"loss": 1.2375, |
|
"step": 37600 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 2.4870264530181885, |
|
"learning_rate": 0.00018788442211055273, |
|
"loss": 1.2564, |
|
"step": 37700 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 1.9144058227539062, |
|
"learning_rate": 0.0001875829145728643, |
|
"loss": 1.2403, |
|
"step": 37800 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 2.209117889404297, |
|
"learning_rate": 0.00018728140703517588, |
|
"loss": 1.2168, |
|
"step": 37900 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 2.7400968074798584, |
|
"learning_rate": 0.00018697989949748744, |
|
"loss": 1.1786, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"eval_loss": 1.2550157308578491, |
|
"eval_runtime": 37.907, |
|
"eval_samples_per_second": 26.38, |
|
"eval_steps_per_second": 3.298, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 2.392390251159668, |
|
"learning_rate": 0.00018667839195979897, |
|
"loss": 1.2294, |
|
"step": 38100 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 3.434168577194214, |
|
"learning_rate": 0.00018637688442211053, |
|
"loss": 1.2491, |
|
"step": 38200 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 2.082618236541748, |
|
"learning_rate": 0.0001860753768844221, |
|
"loss": 1.2602, |
|
"step": 38300 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 1.6049084663391113, |
|
"learning_rate": 0.00018577386934673365, |
|
"loss": 1.2067, |
|
"step": 38400 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 2.1953368186950684, |
|
"learning_rate": 0.0001854723618090452, |
|
"loss": 1.2292, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 2.6085190773010254, |
|
"learning_rate": 0.00018517085427135677, |
|
"loss": 1.2269, |
|
"step": 38600 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 2.9110639095306396, |
|
"learning_rate": 0.0001848693467336683, |
|
"loss": 1.1898, |
|
"step": 38700 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 1.514410138130188, |
|
"learning_rate": 0.0001845678391959799, |
|
"loss": 1.199, |
|
"step": 38800 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 4.6756134033203125, |
|
"learning_rate": 0.00018426633165829145, |
|
"loss": 1.183, |
|
"step": 38900 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 2.704317808151245, |
|
"learning_rate": 0.000183964824120603, |
|
"loss": 1.1999, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"eval_loss": 1.2309662103652954, |
|
"eval_runtime": 37.8598, |
|
"eval_samples_per_second": 26.413, |
|
"eval_steps_per_second": 3.302, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 2.5975565910339355, |
|
"learning_rate": 0.00018366331658291454, |
|
"loss": 1.2576, |
|
"step": 39100 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 3.3112730979919434, |
|
"learning_rate": 0.00018336180904522613, |
|
"loss": 1.2128, |
|
"step": 39200 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 2.5991640090942383, |
|
"learning_rate": 0.00018306030150753769, |
|
"loss": 1.2294, |
|
"step": 39300 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 4.411704063415527, |
|
"learning_rate": 0.00018275879396984922, |
|
"loss": 1.1977, |
|
"step": 39400 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 1.509308099746704, |
|
"learning_rate": 0.00018245728643216078, |
|
"loss": 1.2712, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 2.136350631713867, |
|
"learning_rate": 0.00018215577889447236, |
|
"loss": 1.2359, |
|
"step": 39600 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 2.1651546955108643, |
|
"learning_rate": 0.0001818542713567839, |
|
"loss": 1.2448, |
|
"step": 39700 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 2.9962761402130127, |
|
"learning_rate": 0.00018155577889447235, |
|
"loss": 1.218, |
|
"step": 39800 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 2.8525376319885254, |
|
"learning_rate": 0.0001812542713567839, |
|
"loss": 1.2564, |
|
"step": 39900 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 2.120208740234375, |
|
"learning_rate": 0.00018095276381909547, |
|
"loss": 1.2287, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 1.2058476209640503, |
|
"eval_runtime": 38.0203, |
|
"eval_samples_per_second": 26.302, |
|
"eval_steps_per_second": 3.288, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 3.9785573482513428, |
|
"learning_rate": 0.000180651256281407, |
|
"loss": 1.2161, |
|
"step": 40100 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 2.7897050380706787, |
|
"learning_rate": 0.0001803497487437186, |
|
"loss": 1.2525, |
|
"step": 40200 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 2.042492389678955, |
|
"learning_rate": 0.00018004824120603015, |
|
"loss": 1.2087, |
|
"step": 40300 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 1.8287073373794556, |
|
"learning_rate": 0.00017974673366834168, |
|
"loss": 1.2404, |
|
"step": 40400 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 1.6399390697479248, |
|
"learning_rate": 0.00017944522613065324, |
|
"loss": 1.174, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 3.9909472465515137, |
|
"learning_rate": 0.00017914371859296482, |
|
"loss": 1.1869, |
|
"step": 40600 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 2.9356400966644287, |
|
"learning_rate": 0.00017884221105527638, |
|
"loss": 1.2271, |
|
"step": 40700 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 2.205498218536377, |
|
"learning_rate": 0.00017854070351758792, |
|
"loss": 1.2505, |
|
"step": 40800 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 2.2801437377929688, |
|
"learning_rate": 0.00017823919597989948, |
|
"loss": 1.2232, |
|
"step": 40900 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 4.001745223999023, |
|
"learning_rate": 0.00017793768844221104, |
|
"loss": 1.257, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"eval_loss": 1.1965339183807373, |
|
"eval_runtime": 37.9045, |
|
"eval_samples_per_second": 26.382, |
|
"eval_steps_per_second": 3.298, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 3.484135150909424, |
|
"learning_rate": 0.0001776361809045226, |
|
"loss": 1.2232, |
|
"step": 41100 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 2.7462897300720215, |
|
"learning_rate": 0.00017733467336683415, |
|
"loss": 1.22, |
|
"step": 41200 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 2.9418435096740723, |
|
"learning_rate": 0.00017703316582914571, |
|
"loss": 1.2141, |
|
"step": 41300 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 2.188680410385132, |
|
"learning_rate": 0.00017673165829145725, |
|
"loss": 1.1909, |
|
"step": 41400 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 3.728938579559326, |
|
"learning_rate": 0.00017643015075376883, |
|
"loss": 1.2146, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 2.8790736198425293, |
|
"learning_rate": 0.0001761286432160804, |
|
"loss": 1.2305, |
|
"step": 41600 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 3.6593847274780273, |
|
"learning_rate": 0.00017582713567839195, |
|
"loss": 1.1753, |
|
"step": 41700 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 2.408237934112549, |
|
"learning_rate": 0.00017552562814070348, |
|
"loss": 1.2229, |
|
"step": 41800 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 2.574580669403076, |
|
"learning_rate": 0.00017522412060301507, |
|
"loss": 1.2173, |
|
"step": 41900 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 2.2249817848205566, |
|
"learning_rate": 0.00017492261306532663, |
|
"loss": 1.2112, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"eval_loss": 1.2255558967590332, |
|
"eval_runtime": 37.9009, |
|
"eval_samples_per_second": 26.385, |
|
"eval_steps_per_second": 3.298, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 2.2712411880493164, |
|
"learning_rate": 0.00017462110552763816, |
|
"loss": 1.1862, |
|
"step": 42100 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 1.646330714225769, |
|
"learning_rate": 0.00017431959798994972, |
|
"loss": 1.1812, |
|
"step": 42200 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 2.9691689014434814, |
|
"learning_rate": 0.0001740180904522613, |
|
"loss": 1.2055, |
|
"step": 42300 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 5.179681777954102, |
|
"learning_rate": 0.00017371658291457287, |
|
"loss": 1.1625, |
|
"step": 42400 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 2.634462833404541, |
|
"learning_rate": 0.0001734150753768844, |
|
"loss": 1.2257, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 8.693337440490723, |
|
"learning_rate": 0.00017311356783919596, |
|
"loss": 1.2447, |
|
"step": 42600 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 3.228513240814209, |
|
"learning_rate": 0.00017281206030150755, |
|
"loss": 1.1993, |
|
"step": 42700 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 7.938237190246582, |
|
"learning_rate": 0.00017251055276381908, |
|
"loss": 1.2084, |
|
"step": 42800 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 3.0843794345855713, |
|
"learning_rate": 0.00017220904522613064, |
|
"loss": 1.2017, |
|
"step": 42900 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 2.86205792427063, |
|
"learning_rate": 0.0001719075376884422, |
|
"loss": 1.1706, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"eval_loss": 1.2179350852966309, |
|
"eval_runtime": 37.9173, |
|
"eval_samples_per_second": 26.373, |
|
"eval_steps_per_second": 3.297, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 2.137380361557007, |
|
"learning_rate": 0.00017160904522613062, |
|
"loss": 1.2066, |
|
"step": 43100 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 2.250091075897217, |
|
"learning_rate": 0.00017130753768844218, |
|
"loss": 1.211, |
|
"step": 43200 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 2.008875608444214, |
|
"learning_rate": 0.00017100603015075377, |
|
"loss": 1.2116, |
|
"step": 43300 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 2.6691529750823975, |
|
"learning_rate": 0.00017070452261306533, |
|
"loss": 1.1844, |
|
"step": 43400 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 1.8802026510238647, |
|
"learning_rate": 0.00017040301507537686, |
|
"loss": 1.1849, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 2.4100139141082764, |
|
"learning_rate": 0.00017010150753768842, |
|
"loss": 1.1887, |
|
"step": 43600 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 3.3384740352630615, |
|
"learning_rate": 0.00016979999999999998, |
|
"loss": 1.2338, |
|
"step": 43700 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 2.349433183670044, |
|
"learning_rate": 0.00016949849246231154, |
|
"loss": 1.1633, |
|
"step": 43800 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 3.019296884536743, |
|
"learning_rate": 0.0001691969849246231, |
|
"loss": 1.2456, |
|
"step": 43900 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 2.497424364089966, |
|
"learning_rate": 0.00016889547738693466, |
|
"loss": 1.1671, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"eval_loss": 1.2000114917755127, |
|
"eval_runtime": 40.4714, |
|
"eval_samples_per_second": 24.709, |
|
"eval_steps_per_second": 3.089, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 1.6698800325393677, |
|
"learning_rate": 0.0001685939698492462, |
|
"loss": 1.2105, |
|
"step": 44100 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 2.3846988677978516, |
|
"learning_rate": 0.00016829246231155778, |
|
"loss": 1.2229, |
|
"step": 44200 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 5.891537189483643, |
|
"learning_rate": 0.00016799095477386934, |
|
"loss": 1.1848, |
|
"step": 44300 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 1.4433008432388306, |
|
"learning_rate": 0.0001676894472361809, |
|
"loss": 1.1905, |
|
"step": 44400 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 2.5641889572143555, |
|
"learning_rate": 0.00016738793969849243, |
|
"loss": 1.2219, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 3.052948474884033, |
|
"learning_rate": 0.00016708643216080402, |
|
"loss": 1.1887, |
|
"step": 44600 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 2.8185369968414307, |
|
"learning_rate": 0.00016678492462311557, |
|
"loss": 1.2107, |
|
"step": 44700 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 2.9409399032592773, |
|
"learning_rate": 0.0001664834170854271, |
|
"loss": 1.2222, |
|
"step": 44800 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 2.728256940841675, |
|
"learning_rate": 0.00016618190954773867, |
|
"loss": 1.1767, |
|
"step": 44900 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 2.4744584560394287, |
|
"learning_rate": 0.00016588040201005025, |
|
"loss": 1.1663, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"eval_loss": 1.2085031270980835, |
|
"eval_runtime": 41.0345, |
|
"eval_samples_per_second": 24.37, |
|
"eval_steps_per_second": 3.046, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 3.215564250946045, |
|
"learning_rate": 0.00016558190954773868, |
|
"loss": 1.173, |
|
"step": 45100 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 1.7013347148895264, |
|
"learning_rate": 0.00016528040201005024, |
|
"loss": 1.1637, |
|
"step": 45200 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 3.1096675395965576, |
|
"learning_rate": 0.0001649788944723618, |
|
"loss": 1.1702, |
|
"step": 45300 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 2.5975756645202637, |
|
"learning_rate": 0.00016467738693467336, |
|
"loss": 1.1763, |
|
"step": 45400 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 2.7020699977874756, |
|
"learning_rate": 0.0001643758793969849, |
|
"loss": 1.1761, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 1.7007007598876953, |
|
"learning_rate": 0.00016407437185929648, |
|
"loss": 1.2064, |
|
"step": 45600 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 3.6038424968719482, |
|
"learning_rate": 0.00016377286432160804, |
|
"loss": 1.1716, |
|
"step": 45700 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 2.3656082153320312, |
|
"learning_rate": 0.0001634713567839196, |
|
"loss": 1.1954, |
|
"step": 45800 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 2.390509605407715, |
|
"learning_rate": 0.00016316984924623113, |
|
"loss": 1.1664, |
|
"step": 45900 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 1.8767670392990112, |
|
"learning_rate": 0.00016286834170854271, |
|
"loss": 1.1784, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"eval_loss": 1.1809154748916626, |
|
"eval_runtime": 43.7304, |
|
"eval_samples_per_second": 22.867, |
|
"eval_steps_per_second": 2.858, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 3.4367122650146484, |
|
"learning_rate": 0.00016256683417085427, |
|
"loss": 1.2055, |
|
"step": 46100 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 1.672525405883789, |
|
"learning_rate": 0.0001622653266331658, |
|
"loss": 1.1954, |
|
"step": 46200 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 3.2755866050720215, |
|
"learning_rate": 0.00016196381909547737, |
|
"loss": 1.1801, |
|
"step": 46300 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 2.347280979156494, |
|
"learning_rate": 0.00016166231155778892, |
|
"loss": 1.1651, |
|
"step": 46400 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 1.9565701484680176, |
|
"learning_rate": 0.0001613608040201005, |
|
"loss": 1.2142, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 2.317847728729248, |
|
"learning_rate": 0.00016105929648241204, |
|
"loss": 1.188, |
|
"step": 46600 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 1.812322974205017, |
|
"learning_rate": 0.0001607577889447236, |
|
"loss": 1.1425, |
|
"step": 46700 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 2.5393502712249756, |
|
"learning_rate": 0.00016045628140703514, |
|
"loss": 1.1854, |
|
"step": 46800 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 6.562712669372559, |
|
"learning_rate": 0.00016015477386934672, |
|
"loss": 1.1517, |
|
"step": 46900 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 2.2086706161499023, |
|
"learning_rate": 0.00015985326633165828, |
|
"loss": 1.1634, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"eval_loss": 1.1972031593322754, |
|
"eval_runtime": 43.2883, |
|
"eval_samples_per_second": 23.101, |
|
"eval_steps_per_second": 2.888, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 2.061951160430908, |
|
"learning_rate": 0.00015955175879396984, |
|
"loss": 1.2409, |
|
"step": 47100 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 2.0312881469726562, |
|
"learning_rate": 0.00015925025125628137, |
|
"loss": 1.1731, |
|
"step": 47200 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 4.90245246887207, |
|
"learning_rate": 0.00015894874371859296, |
|
"loss": 1.1849, |
|
"step": 47300 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 2.4970901012420654, |
|
"learning_rate": 0.00015864723618090452, |
|
"loss": 1.1684, |
|
"step": 47400 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 2.4406049251556396, |
|
"learning_rate": 0.00015834572864321605, |
|
"loss": 1.1855, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 2.8650543689727783, |
|
"learning_rate": 0.0001580442211055276, |
|
"loss": 1.1586, |
|
"step": 47600 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 2.4787731170654297, |
|
"learning_rate": 0.0001577427135678392, |
|
"loss": 1.1913, |
|
"step": 47700 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 2.5188841819763184, |
|
"learning_rate": 0.00015744120603015076, |
|
"loss": 1.1938, |
|
"step": 47800 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 3.8095650672912598, |
|
"learning_rate": 0.0001571396984924623, |
|
"loss": 1.1858, |
|
"step": 47900 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 2.147993564605713, |
|
"learning_rate": 0.00015683819095477385, |
|
"loss": 1.1703, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"eval_loss": 1.1952226161956787, |
|
"eval_runtime": 42.4811, |
|
"eval_samples_per_second": 23.54, |
|
"eval_steps_per_second": 2.942, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 3.050976514816284, |
|
"learning_rate": 0.00015653668341708544, |
|
"loss": 1.1868, |
|
"step": 48100 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 2.6880428791046143, |
|
"learning_rate": 0.00015623517587939697, |
|
"loss": 1.1486, |
|
"step": 48200 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 2.169895648956299, |
|
"learning_rate": 0.00015593366834170853, |
|
"loss": 1.1646, |
|
"step": 48300 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 9.948437690734863, |
|
"learning_rate": 0.0001556321608040201, |
|
"loss": 1.1625, |
|
"step": 48400 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 2.1219215393066406, |
|
"learning_rate": 0.00015533065326633162, |
|
"loss": 1.1854, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 3.2466542720794678, |
|
"learning_rate": 0.0001550291457286432, |
|
"loss": 1.1556, |
|
"step": 48600 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 1.8362162113189697, |
|
"learning_rate": 0.00015472763819095477, |
|
"loss": 1.177, |
|
"step": 48700 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 3.579221725463867, |
|
"learning_rate": 0.00015442613065326632, |
|
"loss": 1.1671, |
|
"step": 48800 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 2.256967782974243, |
|
"learning_rate": 0.00015412462311557786, |
|
"loss": 1.1807, |
|
"step": 48900 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 2.107179641723633, |
|
"learning_rate": 0.00015382311557788944, |
|
"loss": 1.186, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"eval_loss": 1.1811304092407227, |
|
"eval_runtime": 43.1582, |
|
"eval_samples_per_second": 23.171, |
|
"eval_steps_per_second": 2.896, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 2.615290880203247, |
|
"learning_rate": 0.000153521608040201, |
|
"loss": 1.1828, |
|
"step": 49100 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 1.600845217704773, |
|
"learning_rate": 0.00015322010050251254, |
|
"loss": 1.1438, |
|
"step": 49200 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 2.272726058959961, |
|
"learning_rate": 0.0001529185929648241, |
|
"loss": 1.1802, |
|
"step": 49300 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 1.9845112562179565, |
|
"learning_rate": 0.00015261708542713568, |
|
"loss": 1.1828, |
|
"step": 49400 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 1.4725877046585083, |
|
"learning_rate": 0.00015231859296482408, |
|
"loss": 1.1938, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 2.4453134536743164, |
|
"learning_rate": 0.00015201708542713567, |
|
"loss": 1.1928, |
|
"step": 49600 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 2.9869000911712646, |
|
"learning_rate": 0.00015171557788944723, |
|
"loss": 1.1982, |
|
"step": 49700 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 2.633794069290161, |
|
"learning_rate": 0.00015141407035175879, |
|
"loss": 1.1287, |
|
"step": 49800 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 1.8146005868911743, |
|
"learning_rate": 0.00015111256281407032, |
|
"loss": 1.1747, |
|
"step": 49900 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 6.4758405685424805, |
|
"learning_rate": 0.0001508110552763819, |
|
"loss": 1.1548, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"eval_loss": 1.1896699666976929, |
|
"eval_runtime": 43.2315, |
|
"eval_samples_per_second": 23.131, |
|
"eval_steps_per_second": 2.891, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 1.5688796043395996, |
|
"learning_rate": 0.00015050954773869346, |
|
"loss": 1.168, |
|
"step": 50100 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 1.4024161100387573, |
|
"learning_rate": 0.000150208040201005, |
|
"loss": 1.1796, |
|
"step": 50200 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 2.066570997238159, |
|
"learning_rate": 0.00014990653266331658, |
|
"loss": 1.1419, |
|
"step": 50300 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 3.7978389263153076, |
|
"learning_rate": 0.00014960502512562812, |
|
"loss": 1.1497, |
|
"step": 50400 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 2.2129733562469482, |
|
"learning_rate": 0.0001493035175879397, |
|
"loss": 1.1371, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 3.0140724182128906, |
|
"learning_rate": 0.00014900201005025123, |
|
"loss": 1.1778, |
|
"step": 50600 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 2.457521915435791, |
|
"learning_rate": 0.00014870050251256282, |
|
"loss": 1.1266, |
|
"step": 50700 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 2.1066813468933105, |
|
"learning_rate": 0.00014839899497487435, |
|
"loss": 1.1635, |
|
"step": 50800 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 2.801196336746216, |
|
"learning_rate": 0.0001480974874371859, |
|
"loss": 1.1842, |
|
"step": 50900 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 4.693379878997803, |
|
"learning_rate": 0.00014779597989949747, |
|
"loss": 1.1449, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"eval_loss": 1.1495003700256348, |
|
"eval_runtime": 37.9097, |
|
"eval_samples_per_second": 26.378, |
|
"eval_steps_per_second": 3.297, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 1.917925477027893, |
|
"learning_rate": 0.00014749447236180903, |
|
"loss": 1.1303, |
|
"step": 51100 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 2.6460864543914795, |
|
"learning_rate": 0.0001471929648241206, |
|
"loss": 1.1638, |
|
"step": 51200 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 2.5040736198425293, |
|
"learning_rate": 0.00014689145728643215, |
|
"loss": 1.1382, |
|
"step": 51300 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 2.7533071041107178, |
|
"learning_rate": 0.0001465899497487437, |
|
"loss": 1.1803, |
|
"step": 51400 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 2.220345973968506, |
|
"learning_rate": 0.00014629145728643214, |
|
"loss": 1.1506, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 1.3668216466903687, |
|
"learning_rate": 0.0001459899497487437, |
|
"loss": 1.1538, |
|
"step": 51600 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 2.26232647895813, |
|
"learning_rate": 0.00014568844221105525, |
|
"loss": 1.2085, |
|
"step": 51700 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 5.508904933929443, |
|
"learning_rate": 0.00014538693467336681, |
|
"loss": 1.1528, |
|
"step": 51800 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 2.9169905185699463, |
|
"learning_rate": 0.00014508542713567837, |
|
"loss": 1.1632, |
|
"step": 51900 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 2.5156240463256836, |
|
"learning_rate": 0.00014478391959798993, |
|
"loss": 1.1677, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"eval_loss": 1.174816370010376, |
|
"eval_runtime": 42.0784, |
|
"eval_samples_per_second": 23.765, |
|
"eval_steps_per_second": 2.971, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 1.622004747390747, |
|
"learning_rate": 0.0001444824120603015, |
|
"loss": 1.1174, |
|
"step": 52100 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 2.5255143642425537, |
|
"learning_rate": 0.00014418090452261305, |
|
"loss": 1.1415, |
|
"step": 52200 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 1.7780824899673462, |
|
"learning_rate": 0.0001438793969849246, |
|
"loss": 1.1871, |
|
"step": 52300 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 2.320028305053711, |
|
"learning_rate": 0.00014357788944723617, |
|
"loss": 1.1841, |
|
"step": 52400 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 2.6219685077667236, |
|
"learning_rate": 0.00014327638190954773, |
|
"loss": 1.1349, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 3.0288233757019043, |
|
"learning_rate": 0.0001429748743718593, |
|
"loss": 1.1753, |
|
"step": 52600 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 2.3062517642974854, |
|
"learning_rate": 0.00014267336683417085, |
|
"loss": 1.1836, |
|
"step": 52700 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 1.8819166421890259, |
|
"learning_rate": 0.0001423718592964824, |
|
"loss": 1.1491, |
|
"step": 52800 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 1.7771334648132324, |
|
"learning_rate": 0.00014207035175879397, |
|
"loss": 1.1311, |
|
"step": 52900 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 1.9495539665222168, |
|
"learning_rate": 0.00014176884422110553, |
|
"loss": 1.1757, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"eval_loss": 1.161841869354248, |
|
"eval_runtime": 41.6597, |
|
"eval_samples_per_second": 24.004, |
|
"eval_steps_per_second": 3.0, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 2.317021131515503, |
|
"learning_rate": 0.00014146733668341706, |
|
"loss": 1.145, |
|
"step": 53100 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 1.4079538583755493, |
|
"learning_rate": 0.00014116582914572865, |
|
"loss": 1.0893, |
|
"step": 53200 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 6.593141555786133, |
|
"learning_rate": 0.00014086432160804018, |
|
"loss": 1.1357, |
|
"step": 53300 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 2.657529830932617, |
|
"learning_rate": 0.00014056281407035177, |
|
"loss": 1.1651, |
|
"step": 53400 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 3.312056541442871, |
|
"learning_rate": 0.0001402613065326633, |
|
"loss": 1.165, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 2.1961281299591064, |
|
"learning_rate": 0.00013995979899497486, |
|
"loss": 1.1584, |
|
"step": 53600 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 1.933409571647644, |
|
"learning_rate": 0.00013965829145728642, |
|
"loss": 1.1382, |
|
"step": 53700 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 2.6763832569122314, |
|
"learning_rate": 0.00013935678391959798, |
|
"loss": 1.1238, |
|
"step": 53800 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 3.3957033157348633, |
|
"learning_rate": 0.00013905527638190954, |
|
"loss": 1.154, |
|
"step": 53900 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 3.526700019836426, |
|
"learning_rate": 0.0001387537688442211, |
|
"loss": 1.1325, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"eval_loss": 1.141178011894226, |
|
"eval_runtime": 37.9667, |
|
"eval_samples_per_second": 26.339, |
|
"eval_steps_per_second": 3.292, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 3.3937137126922607, |
|
"learning_rate": 0.00013845226130653265, |
|
"loss": 1.141, |
|
"step": 54100 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 1.9187488555908203, |
|
"learning_rate": 0.00013815075376884421, |
|
"loss": 1.1253, |
|
"step": 54200 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 2.2351136207580566, |
|
"learning_rate": 0.00013784924623115577, |
|
"loss": 1.2008, |
|
"step": 54300 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 3.97955584526062, |
|
"learning_rate": 0.0001375477386934673, |
|
"loss": 1.1609, |
|
"step": 54400 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 3.5734050273895264, |
|
"learning_rate": 0.0001372462311557789, |
|
"loss": 1.1584, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 2.3804807662963867, |
|
"learning_rate": 0.00013694472361809042, |
|
"loss": 1.1343, |
|
"step": 54600 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 2.0606038570404053, |
|
"learning_rate": 0.000136643216080402, |
|
"loss": 1.1555, |
|
"step": 54700 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 4.046571731567383, |
|
"learning_rate": 0.00013634170854271354, |
|
"loss": 1.1543, |
|
"step": 54800 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 2.470393180847168, |
|
"learning_rate": 0.00013604020100502513, |
|
"loss": 1.1651, |
|
"step": 54900 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 1.4677540063858032, |
|
"learning_rate": 0.00013573869346733666, |
|
"loss": 1.1366, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"eval_loss": 1.1223907470703125, |
|
"eval_runtime": 43.6458, |
|
"eval_samples_per_second": 22.912, |
|
"eval_steps_per_second": 2.864, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 2.5567593574523926, |
|
"learning_rate": 0.00013543718592964822, |
|
"loss": 1.1348, |
|
"step": 55100 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 4.812506675720215, |
|
"learning_rate": 0.00013513567839195978, |
|
"loss": 1.1675, |
|
"step": 55200 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 2.5467748641967773, |
|
"learning_rate": 0.00013483417085427134, |
|
"loss": 1.1238, |
|
"step": 55300 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 4.469081878662109, |
|
"learning_rate": 0.0001345326633165829, |
|
"loss": 1.102, |
|
"step": 55400 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 3.878526449203491, |
|
"learning_rate": 0.00013423115577889446, |
|
"loss": 1.131, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 2.0142953395843506, |
|
"learning_rate": 0.00013392964824120602, |
|
"loss": 1.1349, |
|
"step": 55600 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 2.600478410720825, |
|
"learning_rate": 0.00013362814070351758, |
|
"loss": 1.1363, |
|
"step": 55700 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 2.58322811126709, |
|
"learning_rate": 0.00013332663316582914, |
|
"loss": 1.1426, |
|
"step": 55800 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 2.2471609115600586, |
|
"learning_rate": 0.0001330251256281407, |
|
"loss": 1.1446, |
|
"step": 55900 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 1.8442782163619995, |
|
"learning_rate": 0.00013272361809045226, |
|
"loss": 1.1315, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"eval_loss": 1.1661006212234497, |
|
"eval_runtime": 47.0159, |
|
"eval_samples_per_second": 21.269, |
|
"eval_steps_per_second": 2.659, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 2.2928128242492676, |
|
"learning_rate": 0.0001324221105527638, |
|
"loss": 1.122, |
|
"step": 56100 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 2.192915201187134, |
|
"learning_rate": 0.00013212361809045224, |
|
"loss": 1.1251, |
|
"step": 56200 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 2.334547519683838, |
|
"learning_rate": 0.00013182211055276383, |
|
"loss": 1.1408, |
|
"step": 56300 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 1.832930088043213, |
|
"learning_rate": 0.00013152060301507536, |
|
"loss": 1.1146, |
|
"step": 56400 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 4.524071216583252, |
|
"learning_rate": 0.00013121909547738692, |
|
"loss": 1.1661, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 1.4990063905715942, |
|
"learning_rate": 0.00013091758793969848, |
|
"loss": 1.1247, |
|
"step": 56600 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 3.572678804397583, |
|
"learning_rate": 0.00013061608040201004, |
|
"loss": 1.1251, |
|
"step": 56700 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 2.0090138912200928, |
|
"learning_rate": 0.0001303145728643216, |
|
"loss": 1.1267, |
|
"step": 56800 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 2.0328962802886963, |
|
"learning_rate": 0.00013001306532663316, |
|
"loss": 1.1343, |
|
"step": 56900 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 1.5744613409042358, |
|
"learning_rate": 0.00012971155778894472, |
|
"loss": 1.1208, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"eval_loss": 1.1388169527053833, |
|
"eval_runtime": 65.7696, |
|
"eval_samples_per_second": 15.205, |
|
"eval_steps_per_second": 1.901, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 1.2835485935211182, |
|
"learning_rate": 0.00012941005025125628, |
|
"loss": 1.1561, |
|
"step": 57100 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 3.413334846496582, |
|
"learning_rate": 0.00012910854271356784, |
|
"loss": 1.126, |
|
"step": 57200 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 2.6612489223480225, |
|
"learning_rate": 0.00012880703517587937, |
|
"loss": 1.1705, |
|
"step": 57300 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 2.0389411449432373, |
|
"learning_rate": 0.00012850552763819096, |
|
"loss": 1.1322, |
|
"step": 57400 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 2.203789710998535, |
|
"learning_rate": 0.0001282040201005025, |
|
"loss": 1.1437, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 5.272101879119873, |
|
"learning_rate": 0.00012790251256281407, |
|
"loss": 1.1333, |
|
"step": 57600 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 3.0776541233062744, |
|
"learning_rate": 0.0001276010050251256, |
|
"loss": 1.1235, |
|
"step": 57700 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 3.8333828449249268, |
|
"learning_rate": 0.0001272994974874372, |
|
"loss": 1.1141, |
|
"step": 57800 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 3.3916189670562744, |
|
"learning_rate": 0.00012699798994974873, |
|
"loss": 1.1084, |
|
"step": 57900 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 1.6035398244857788, |
|
"learning_rate": 0.00012669648241206029, |
|
"loss": 1.1057, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"eval_loss": 1.1182321310043335, |
|
"eval_runtime": 60.567, |
|
"eval_samples_per_second": 16.511, |
|
"eval_steps_per_second": 2.064, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 2.41086745262146, |
|
"learning_rate": 0.00012639497487437184, |
|
"loss": 1.1424, |
|
"step": 58100 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 1.8278477191925049, |
|
"learning_rate": 0.0001260934673366834, |
|
"loss": 1.1126, |
|
"step": 58200 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 2.7294256687164307, |
|
"learning_rate": 0.00012579195979899496, |
|
"loss": 1.1207, |
|
"step": 58300 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 2.813084602355957, |
|
"learning_rate": 0.00012549045226130652, |
|
"loss": 1.1498, |
|
"step": 58400 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 2.6869473457336426, |
|
"learning_rate": 0.00012519195979899495, |
|
"loss": 1.1198, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 1.8101871013641357, |
|
"learning_rate": 0.00012489045226130654, |
|
"loss": 1.1725, |
|
"step": 58600 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 4.7469305992126465, |
|
"learning_rate": 0.00012458894472361807, |
|
"loss": 1.1382, |
|
"step": 58700 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 1.8046541213989258, |
|
"learning_rate": 0.00012428743718592965, |
|
"loss": 1.082, |
|
"step": 58800 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 2.176015615463257, |
|
"learning_rate": 0.0001239859296482412, |
|
"loss": 1.1304, |
|
"step": 58900 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 1.8910236358642578, |
|
"learning_rate": 0.00012368442211055277, |
|
"loss": 1.1638, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"eval_loss": 1.1192156076431274, |
|
"eval_runtime": 41.8257, |
|
"eval_samples_per_second": 23.909, |
|
"eval_steps_per_second": 2.989, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 2.288358211517334, |
|
"learning_rate": 0.0001233829145728643, |
|
"loss": 1.1203, |
|
"step": 59100 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 1.9389914274215698, |
|
"learning_rate": 0.00012308140703517586, |
|
"loss": 1.0892, |
|
"step": 59200 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 2.1551334857940674, |
|
"learning_rate": 0.00012277989949748742, |
|
"loss": 1.1046, |
|
"step": 59300 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 1.5200018882751465, |
|
"learning_rate": 0.00012247839195979898, |
|
"loss": 1.1373, |
|
"step": 59400 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 2.45053768157959, |
|
"learning_rate": 0.00012217688442211054, |
|
"loss": 1.1403, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 2.767160177230835, |
|
"learning_rate": 0.00012187537688442209, |
|
"loss": 1.0693, |
|
"step": 59600 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 2.3581674098968506, |
|
"learning_rate": 0.00012157386934673366, |
|
"loss": 1.125, |
|
"step": 59700 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 1.4579651355743408, |
|
"learning_rate": 0.00012127236180904521, |
|
"loss": 1.127, |
|
"step": 59800 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 4.08085298538208, |
|
"learning_rate": 0.00012097085427135678, |
|
"loss": 1.1539, |
|
"step": 59900 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 1.5620448589324951, |
|
"learning_rate": 0.00012066934673366833, |
|
"loss": 1.1372, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 1.130272626876831, |
|
"eval_runtime": 37.9665, |
|
"eval_samples_per_second": 26.339, |
|
"eval_steps_per_second": 3.292, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 3.270860433578491, |
|
"learning_rate": 0.00012036783919597989, |
|
"loss": 1.0761, |
|
"step": 60100 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 2.5301287174224854, |
|
"learning_rate": 0.00012006633165829145, |
|
"loss": 1.0881, |
|
"step": 60200 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 2.5292015075683594, |
|
"learning_rate": 0.000119764824120603, |
|
"loss": 1.046, |
|
"step": 60300 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 2.8234751224517822, |
|
"learning_rate": 0.00011946331658291456, |
|
"loss": 1.0802, |
|
"step": 60400 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 2.536975860595703, |
|
"learning_rate": 0.00011916180904522612, |
|
"loss": 1.0993, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 3.510464906692505, |
|
"learning_rate": 0.00011886030150753767, |
|
"loss": 1.1108, |
|
"step": 60600 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 1.9273101091384888, |
|
"learning_rate": 0.00011855879396984924, |
|
"loss": 1.1081, |
|
"step": 60700 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 2.1979687213897705, |
|
"learning_rate": 0.00011825728643216079, |
|
"loss": 1.1059, |
|
"step": 60800 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 2.097529172897339, |
|
"learning_rate": 0.00011795577889447236, |
|
"loss": 1.1098, |
|
"step": 60900 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 2.970689296722412, |
|
"learning_rate": 0.00011765427135678391, |
|
"loss": 1.0915, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"eval_loss": 1.0778993368148804, |
|
"eval_runtime": 37.9552, |
|
"eval_samples_per_second": 26.347, |
|
"eval_steps_per_second": 3.293, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 2.3489325046539307, |
|
"learning_rate": 0.00011735577889447236, |
|
"loss": 1.1174, |
|
"step": 61100 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 2.9280216693878174, |
|
"learning_rate": 0.00011705427135678391, |
|
"loss": 1.0977, |
|
"step": 61200 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 2.231684446334839, |
|
"learning_rate": 0.00011675276381909548, |
|
"loss": 1.1004, |
|
"step": 61300 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"grad_norm": 1.8373113870620728, |
|
"learning_rate": 0.00011645125628140703, |
|
"loss": 1.109, |
|
"step": 61400 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 3.446971893310547, |
|
"learning_rate": 0.00011614974874371859, |
|
"loss": 1.092, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 2.2681097984313965, |
|
"learning_rate": 0.00011584824120603014, |
|
"loss": 1.0901, |
|
"step": 61600 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 2.173755407333374, |
|
"learning_rate": 0.0001155467336683417, |
|
"loss": 1.0638, |
|
"step": 61700 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 3.3374030590057373, |
|
"learning_rate": 0.00011524522613065325, |
|
"loss": 1.1036, |
|
"step": 61800 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 2.082169771194458, |
|
"learning_rate": 0.00011494371859296481, |
|
"loss": 1.0737, |
|
"step": 61900 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 2.741830587387085, |
|
"learning_rate": 0.00011464221105527637, |
|
"loss": 1.0705, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"eval_loss": 1.079288125038147, |
|
"eval_runtime": 37.9824, |
|
"eval_samples_per_second": 26.328, |
|
"eval_steps_per_second": 3.291, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 2.128262996673584, |
|
"learning_rate": 0.00011434070351758793, |
|
"loss": 1.0964, |
|
"step": 62100 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"grad_norm": 2.100025177001953, |
|
"learning_rate": 0.00011403919597989949, |
|
"loss": 1.0951, |
|
"step": 62200 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 7.355963706970215, |
|
"learning_rate": 0.00011373768844221103, |
|
"loss": 1.128, |
|
"step": 62300 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 2.6374123096466064, |
|
"learning_rate": 0.0001134361809045226, |
|
"loss": 1.0928, |
|
"step": 62400 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 2.6389834880828857, |
|
"learning_rate": 0.00011313467336683415, |
|
"loss": 1.1067, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"grad_norm": 3.367866277694702, |
|
"learning_rate": 0.00011283316582914573, |
|
"loss": 1.0719, |
|
"step": 62600 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"grad_norm": 2.0250422954559326, |
|
"learning_rate": 0.00011253165829145727, |
|
"loss": 1.0967, |
|
"step": 62700 |
|
}, |
|
{ |
|
"epoch": 3.14, |
|
"grad_norm": 3.8763527870178223, |
|
"learning_rate": 0.00011223015075376884, |
|
"loss": 1.0819, |
|
"step": 62800 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 2.7926995754241943, |
|
"learning_rate": 0.00011192864321608039, |
|
"loss": 1.1123, |
|
"step": 62900 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 2.5031745433807373, |
|
"learning_rate": 0.00011162713567839195, |
|
"loss": 1.0725, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"eval_loss": 1.117138147354126, |
|
"eval_runtime": 37.9757, |
|
"eval_samples_per_second": 26.333, |
|
"eval_steps_per_second": 3.292, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 2.086465835571289, |
|
"learning_rate": 0.00011132562814070351, |
|
"loss": 1.0588, |
|
"step": 63100 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 3.295759439468384, |
|
"learning_rate": 0.00011102412060301507, |
|
"loss": 1.1175, |
|
"step": 63200 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 2.666032075881958, |
|
"learning_rate": 0.00011072261306532661, |
|
"loss": 1.0963, |
|
"step": 63300 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 1.8267697095870972, |
|
"learning_rate": 0.00011042110552763819, |
|
"loss": 1.0691, |
|
"step": 63400 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 2.682745933532715, |
|
"learning_rate": 0.00011011959798994973, |
|
"loss": 1.0671, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 2.914111375808716, |
|
"learning_rate": 0.00010982110552763819, |
|
"loss": 1.0809, |
|
"step": 63600 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 2.7258005142211914, |
|
"learning_rate": 0.00010951959798994973, |
|
"loss": 1.0527, |
|
"step": 63700 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 2.646939992904663, |
|
"learning_rate": 0.0001092180904522613, |
|
"loss": 1.0523, |
|
"step": 63800 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 2.107849359512329, |
|
"learning_rate": 0.00010891658291457285, |
|
"loss": 1.0629, |
|
"step": 63900 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 1.9583218097686768, |
|
"learning_rate": 0.00010861507537688442, |
|
"loss": 1.065, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"eval_loss": 1.1121866703033447, |
|
"eval_runtime": 37.9368, |
|
"eval_samples_per_second": 26.36, |
|
"eval_steps_per_second": 3.295, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"grad_norm": 2.384493589401245, |
|
"learning_rate": 0.00010831356783919597, |
|
"loss": 1.0664, |
|
"step": 64100 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"grad_norm": 2.060441732406616, |
|
"learning_rate": 0.00010801206030150753, |
|
"loss": 1.0762, |
|
"step": 64200 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"grad_norm": 6.751837253570557, |
|
"learning_rate": 0.00010771055276381909, |
|
"loss": 1.0553, |
|
"step": 64300 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"grad_norm": 2.9765820503234863, |
|
"learning_rate": 0.00010740904522613064, |
|
"loss": 1.0636, |
|
"step": 64400 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"grad_norm": 2.2694509029388428, |
|
"learning_rate": 0.00010710753768844221, |
|
"loss": 1.1031, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"grad_norm": 3.272937536239624, |
|
"learning_rate": 0.00010680603015075375, |
|
"loss": 1.1053, |
|
"step": 64600 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"grad_norm": 3.242722988128662, |
|
"learning_rate": 0.00010650452261306531, |
|
"loss": 1.1013, |
|
"step": 64700 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 2.7234878540039062, |
|
"learning_rate": 0.00010620301507537687, |
|
"loss": 1.0428, |
|
"step": 64800 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 2.30928373336792, |
|
"learning_rate": 0.00010590150753768843, |
|
"loss": 1.067, |
|
"step": 64900 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 4.809457302093506, |
|
"learning_rate": 0.00010559999999999998, |
|
"loss": 1.053, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"eval_loss": 1.1082242727279663, |
|
"eval_runtime": 37.9286, |
|
"eval_samples_per_second": 26.365, |
|
"eval_steps_per_second": 3.296, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 2.282684087753296, |
|
"learning_rate": 0.00010529849246231155, |
|
"loss": 1.0547, |
|
"step": 65100 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 3.756114959716797, |
|
"learning_rate": 0.0001049969849246231, |
|
"loss": 1.0435, |
|
"step": 65200 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"grad_norm": 3.709932565689087, |
|
"learning_rate": 0.00010469547738693467, |
|
"loss": 1.0678, |
|
"step": 65300 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"grad_norm": 1.6080820560455322, |
|
"learning_rate": 0.00010439396984924622, |
|
"loss": 1.101, |
|
"step": 65400 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"grad_norm": 2.2617008686065674, |
|
"learning_rate": 0.00010409246231155779, |
|
"loss": 1.0729, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 3.1394824981689453, |
|
"learning_rate": 0.00010379095477386933, |
|
"loss": 1.0861, |
|
"step": 65600 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"grad_norm": 2.8208096027374268, |
|
"learning_rate": 0.0001034894472361809, |
|
"loss": 1.0535, |
|
"step": 65700 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"grad_norm": 2.7133829593658447, |
|
"learning_rate": 0.00010318793969849245, |
|
"loss": 1.0498, |
|
"step": 65800 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"grad_norm": 2.2674591541290283, |
|
"learning_rate": 0.00010288643216080401, |
|
"loss": 1.0861, |
|
"step": 65900 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"grad_norm": 2.238206386566162, |
|
"learning_rate": 0.00010258492462311557, |
|
"loss": 1.0557, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"eval_loss": 1.0877478122711182, |
|
"eval_runtime": 37.9734, |
|
"eval_samples_per_second": 26.334, |
|
"eval_steps_per_second": 3.292, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 1.8776639699935913, |
|
"learning_rate": 0.00010228643216080401, |
|
"loss": 1.0898, |
|
"step": 66100 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 2.540071725845337, |
|
"learning_rate": 0.00010198492462311557, |
|
"loss": 1.0437, |
|
"step": 66200 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 3.616443157196045, |
|
"learning_rate": 0.00010168341708542713, |
|
"loss": 1.0698, |
|
"step": 66300 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 2.866360902786255, |
|
"learning_rate": 0.00010138190954773868, |
|
"loss": 1.0666, |
|
"step": 66400 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 3.1752941608428955, |
|
"learning_rate": 0.00010108040201005025, |
|
"loss": 1.0723, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 4.475529193878174, |
|
"learning_rate": 0.0001007788944723618, |
|
"loss": 1.105, |
|
"step": 66600 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 2.9230782985687256, |
|
"learning_rate": 0.00010047738693467337, |
|
"loss": 1.0674, |
|
"step": 66700 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"grad_norm": 4.472579479217529, |
|
"learning_rate": 0.00010017587939698491, |
|
"loss": 1.0798, |
|
"step": 66800 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"grad_norm": 2.9080252647399902, |
|
"learning_rate": 9.987437185929649e-05, |
|
"loss": 1.0789, |
|
"step": 66900 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 2.728170394897461, |
|
"learning_rate": 9.957286432160803e-05, |
|
"loss": 1.0771, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"eval_loss": 1.0558359622955322, |
|
"eval_runtime": 37.9887, |
|
"eval_samples_per_second": 26.324, |
|
"eval_steps_per_second": 3.29, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 2.227384328842163, |
|
"learning_rate": 9.927135678391958e-05, |
|
"loss": 1.0336, |
|
"step": 67100 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 2.5888235569000244, |
|
"learning_rate": 9.896984924623115e-05, |
|
"loss": 1.0525, |
|
"step": 67200 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 1.9375131130218506, |
|
"learning_rate": 9.86683417085427e-05, |
|
"loss": 1.1218, |
|
"step": 67300 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 1.8543367385864258, |
|
"learning_rate": 9.836683417085426e-05, |
|
"loss": 1.0761, |
|
"step": 67400 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 3.050717353820801, |
|
"learning_rate": 9.806532663316582e-05, |
|
"loss": 1.07, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 3.321708917617798, |
|
"learning_rate": 9.776381909547738e-05, |
|
"loss": 1.0606, |
|
"step": 67600 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 2.958376407623291, |
|
"learning_rate": 9.746231155778894e-05, |
|
"loss": 1.0608, |
|
"step": 67700 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"grad_norm": 2.215822219848633, |
|
"learning_rate": 9.71608040201005e-05, |
|
"loss": 1.0605, |
|
"step": 67800 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 2.430649518966675, |
|
"learning_rate": 9.685929648241204e-05, |
|
"loss": 1.0783, |
|
"step": 67900 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 2.4160895347595215, |
|
"learning_rate": 9.655778894472361e-05, |
|
"loss": 1.0783, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"eval_loss": 1.1083147525787354, |
|
"eval_runtime": 37.9578, |
|
"eval_samples_per_second": 26.345, |
|
"eval_steps_per_second": 3.293, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"grad_norm": 3.5485310554504395, |
|
"learning_rate": 9.625628140703516e-05, |
|
"loss": 1.0299, |
|
"step": 68100 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"grad_norm": 2.0450522899627686, |
|
"learning_rate": 9.595477386934673e-05, |
|
"loss": 1.0662, |
|
"step": 68200 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"grad_norm": 2.339768171310425, |
|
"learning_rate": 9.565326633165828e-05, |
|
"loss": 1.0781, |
|
"step": 68300 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"grad_norm": 2.055027484893799, |
|
"learning_rate": 9.535477386934673e-05, |
|
"loss": 1.0586, |
|
"step": 68400 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"grad_norm": 3.186723232269287, |
|
"learning_rate": 9.505326633165828e-05, |
|
"loss": 1.071, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 3.43, |
|
"grad_norm": 2.934070587158203, |
|
"learning_rate": 9.475175879396985e-05, |
|
"loss": 1.0474, |
|
"step": 68600 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 4.080368995666504, |
|
"learning_rate": 9.44502512562814e-05, |
|
"loss": 1.0376, |
|
"step": 68700 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 9.1796236038208, |
|
"learning_rate": 9.415175879396985e-05, |
|
"loss": 1.0362, |
|
"step": 68800 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 2.9005532264709473, |
|
"learning_rate": 9.38502512562814e-05, |
|
"loss": 1.0581, |
|
"step": 68900 |
|
}, |
|
{ |
|
"epoch": 3.45, |
|
"grad_norm": 2.2525532245635986, |
|
"learning_rate": 9.354874371859296e-05, |
|
"loss": 1.0664, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 3.45, |
|
"eval_loss": 1.0977917909622192, |
|
"eval_runtime": 37.9301, |
|
"eval_samples_per_second": 26.364, |
|
"eval_steps_per_second": 3.296, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 4.754021644592285, |
|
"learning_rate": 9.324723618090452e-05, |
|
"loss": 1.0512, |
|
"step": 69100 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 2.1440653800964355, |
|
"learning_rate": 9.294572864321607e-05, |
|
"loss": 1.0653, |
|
"step": 69200 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 2.278679609298706, |
|
"learning_rate": 9.264422110552762e-05, |
|
"loss": 1.0466, |
|
"step": 69300 |
|
}, |
|
{ |
|
"epoch": 3.47, |
|
"grad_norm": 2.176259994506836, |
|
"learning_rate": 9.23427135678392e-05, |
|
"loss": 1.0664, |
|
"step": 69400 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 2.2514779567718506, |
|
"learning_rate": 9.204120603015074e-05, |
|
"loss": 1.0597, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 3.136343002319336, |
|
"learning_rate": 9.173969849246231e-05, |
|
"loss": 1.0742, |
|
"step": 69600 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 1.6031814813613892, |
|
"learning_rate": 9.143819095477386e-05, |
|
"loss": 1.0435, |
|
"step": 69700 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"grad_norm": 5.727216720581055, |
|
"learning_rate": 9.113668341708543e-05, |
|
"loss": 1.0837, |
|
"step": 69800 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 2.909613609313965, |
|
"learning_rate": 9.083517587939698e-05, |
|
"loss": 1.0292, |
|
"step": 69900 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 2.8508193492889404, |
|
"learning_rate": 9.053366834170854e-05, |
|
"loss": 1.0643, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"eval_loss": 1.0314569473266602, |
|
"eval_runtime": 45.3565, |
|
"eval_samples_per_second": 22.048, |
|
"eval_steps_per_second": 2.756, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 1.3868812322616577, |
|
"learning_rate": 9.02321608040201e-05, |
|
"loss": 1.0719, |
|
"step": 70100 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"grad_norm": 2.059966564178467, |
|
"learning_rate": 8.993065326633164e-05, |
|
"loss": 1.0496, |
|
"step": 70200 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 2.371212959289551, |
|
"learning_rate": 8.962914572864322e-05, |
|
"loss": 1.0416, |
|
"step": 70300 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 5.051455497741699, |
|
"learning_rate": 8.932763819095476e-05, |
|
"loss": 1.0817, |
|
"step": 70400 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 2.4436607360839844, |
|
"learning_rate": 8.902613065326632e-05, |
|
"loss": 1.0434, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"grad_norm": 2.097843885421753, |
|
"learning_rate": 8.872462311557788e-05, |
|
"loss": 1.06, |
|
"step": 70600 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 3.9826953411102295, |
|
"learning_rate": 8.842311557788944e-05, |
|
"loss": 1.0921, |
|
"step": 70700 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 3.572988748550415, |
|
"learning_rate": 8.812160804020099e-05, |
|
"loss": 1.0503, |
|
"step": 70800 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 3.2607603073120117, |
|
"learning_rate": 8.782010050251256e-05, |
|
"loss": 1.0308, |
|
"step": 70900 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 2.152568817138672, |
|
"learning_rate": 8.75185929648241e-05, |
|
"loss": 1.0508, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"eval_loss": 1.035280704498291, |
|
"eval_runtime": 44.3432, |
|
"eval_samples_per_second": 22.551, |
|
"eval_steps_per_second": 2.819, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 1.5636742115020752, |
|
"learning_rate": 8.721708542713568e-05, |
|
"loss": 1.0177, |
|
"step": 71100 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 1.9526029825210571, |
|
"learning_rate": 8.691557788944722e-05, |
|
"loss": 1.0516, |
|
"step": 71200 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 2.2071800231933594, |
|
"learning_rate": 8.66140703517588e-05, |
|
"loss": 1.034, |
|
"step": 71300 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"grad_norm": 2.6768360137939453, |
|
"learning_rate": 8.631256281407034e-05, |
|
"loss": 1.0642, |
|
"step": 71400 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 1.6602065563201904, |
|
"learning_rate": 8.60110552763819e-05, |
|
"loss": 1.0389, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 2.439145565032959, |
|
"learning_rate": 8.570954773869346e-05, |
|
"loss": 1.0536, |
|
"step": 71600 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 6.254899978637695, |
|
"learning_rate": 8.54110552763819e-05, |
|
"loss": 1.0141, |
|
"step": 71700 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"grad_norm": 1.8221715688705444, |
|
"learning_rate": 8.510954773869346e-05, |
|
"loss": 1.044, |
|
"step": 71800 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"grad_norm": 4.5664849281311035, |
|
"learning_rate": 8.480804020100502e-05, |
|
"loss": 1.0665, |
|
"step": 71900 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 2.4576423168182373, |
|
"learning_rate": 8.450653266331658e-05, |
|
"loss": 1.0615, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"eval_loss": 1.031246542930603, |
|
"eval_runtime": 42.8264, |
|
"eval_samples_per_second": 23.35, |
|
"eval_steps_per_second": 2.919, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 2.763627290725708, |
|
"learning_rate": 8.420502512562814e-05, |
|
"loss": 1.0333, |
|
"step": 72100 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"grad_norm": 1.6231377124786377, |
|
"learning_rate": 8.390351758793968e-05, |
|
"loss": 1.0572, |
|
"step": 72200 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 1.9768860340118408, |
|
"learning_rate": 8.360201005025126e-05, |
|
"loss": 1.0423, |
|
"step": 72300 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 2.292513132095337, |
|
"learning_rate": 8.33005025125628e-05, |
|
"loss": 1.0655, |
|
"step": 72400 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 2.1181390285491943, |
|
"learning_rate": 8.299899497487438e-05, |
|
"loss": 1.0216, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"grad_norm": 2.3944106101989746, |
|
"learning_rate": 8.269748743718592e-05, |
|
"loss": 1.0585, |
|
"step": 72600 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"grad_norm": 1.5745407342910767, |
|
"learning_rate": 8.23959798994975e-05, |
|
"loss": 1.0629, |
|
"step": 72700 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 2.130709648132324, |
|
"learning_rate": 8.209447236180904e-05, |
|
"loss": 1.0027, |
|
"step": 72800 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"grad_norm": 3.202035427093506, |
|
"learning_rate": 8.179296482412059e-05, |
|
"loss": 1.0385, |
|
"step": 72900 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"grad_norm": 2.009536027908325, |
|
"learning_rate": 8.149145728643216e-05, |
|
"loss": 1.0471, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"eval_loss": 1.047244668006897, |
|
"eval_runtime": 38.0986, |
|
"eval_samples_per_second": 26.248, |
|
"eval_steps_per_second": 3.281, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 5.239896774291992, |
|
"learning_rate": 8.11899497487437e-05, |
|
"loss": 1.0527, |
|
"step": 73100 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 3.438692808151245, |
|
"learning_rate": 8.088844221105527e-05, |
|
"loss": 1.0198, |
|
"step": 73200 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"grad_norm": 2.0132901668548584, |
|
"learning_rate": 8.058693467336682e-05, |
|
"loss": 0.989, |
|
"step": 73300 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"grad_norm": 2.9494431018829346, |
|
"learning_rate": 8.028542713567838e-05, |
|
"loss": 1.0329, |
|
"step": 73400 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"grad_norm": 2.8393380641937256, |
|
"learning_rate": 7.998391959798994e-05, |
|
"loss": 1.043, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 3.039391279220581, |
|
"learning_rate": 7.96824120603015e-05, |
|
"loss": 1.0035, |
|
"step": 73600 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 3.696676731109619, |
|
"learning_rate": 7.938090452261305e-05, |
|
"loss": 1.0472, |
|
"step": 73700 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 2.8557331562042236, |
|
"learning_rate": 7.907939698492462e-05, |
|
"loss": 1.0665, |
|
"step": 73800 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 3.7987170219421387, |
|
"learning_rate": 7.877788944723617e-05, |
|
"loss": 1.0233, |
|
"step": 73900 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"grad_norm": 1.9759894609451294, |
|
"learning_rate": 7.847638190954774e-05, |
|
"loss": 1.0303, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"eval_loss": 1.0124469995498657, |
|
"eval_runtime": 38.8346, |
|
"eval_samples_per_second": 25.75, |
|
"eval_steps_per_second": 3.219, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 1.9311368465423584, |
|
"learning_rate": 7.817487437185929e-05, |
|
"loss": 1.0479, |
|
"step": 74100 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 4.948327541351318, |
|
"learning_rate": 7.787336683417086e-05, |
|
"loss": 1.0197, |
|
"step": 74200 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 2.6867167949676514, |
|
"learning_rate": 7.75718592964824e-05, |
|
"loss": 1.0209, |
|
"step": 74300 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"grad_norm": 1.8292616605758667, |
|
"learning_rate": 7.727035175879396e-05, |
|
"loss": 1.0257, |
|
"step": 74400 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 3.2925384044647217, |
|
"learning_rate": 7.696884422110552e-05, |
|
"loss": 1.0635, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 2.2040624618530273, |
|
"learning_rate": 7.666733668341708e-05, |
|
"loss": 1.0285, |
|
"step": 74600 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 2.1025142669677734, |
|
"learning_rate": 7.636582914572863e-05, |
|
"loss": 1.05, |
|
"step": 74700 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"grad_norm": 2.409148693084717, |
|
"learning_rate": 7.60643216080402e-05, |
|
"loss": 1.0638, |
|
"step": 74800 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 3.284660577774048, |
|
"learning_rate": 7.576281407035175e-05, |
|
"loss": 1.0203, |
|
"step": 74900 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 2.3454208374023438, |
|
"learning_rate": 7.546130653266332e-05, |
|
"loss": 1.0425, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"eval_loss": 1.0414044857025146, |
|
"eval_runtime": 38.2892, |
|
"eval_samples_per_second": 26.117, |
|
"eval_steps_per_second": 3.265, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 2.6853275299072266, |
|
"learning_rate": 7.515979899497487e-05, |
|
"loss": 0.9762, |
|
"step": 75100 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 1.439287543296814, |
|
"learning_rate": 7.485829145728643e-05, |
|
"loss": 0.9955, |
|
"step": 75200 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"grad_norm": 2.0795187950134277, |
|
"learning_rate": 7.455678391959799e-05, |
|
"loss": 1.0148, |
|
"step": 75300 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"grad_norm": 2.318300247192383, |
|
"learning_rate": 7.425527638190955e-05, |
|
"loss": 1.0368, |
|
"step": 75400 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"grad_norm": 2.979464054107666, |
|
"learning_rate": 7.39537688442211e-05, |
|
"loss": 1.0233, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"grad_norm": 2.384615421295166, |
|
"learning_rate": 7.365226130653266e-05, |
|
"loss": 1.0183, |
|
"step": 75600 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"grad_norm": 2.2947332859039307, |
|
"learning_rate": 7.335075376884421e-05, |
|
"loss": 1.046, |
|
"step": 75700 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"grad_norm": 2.707266330718994, |
|
"learning_rate": 7.304924623115577e-05, |
|
"loss": 1.0145, |
|
"step": 75800 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"grad_norm": 1.8125189542770386, |
|
"learning_rate": 7.275075376884422e-05, |
|
"loss": 1.0508, |
|
"step": 75900 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 1.833924412727356, |
|
"learning_rate": 7.244924623115577e-05, |
|
"loss": 1.051, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"eval_loss": 1.0207512378692627, |
|
"eval_runtime": 38.1696, |
|
"eval_samples_per_second": 26.199, |
|
"eval_steps_per_second": 3.275, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 2.3891940116882324, |
|
"learning_rate": 7.214773869346733e-05, |
|
"loss": 1.0006, |
|
"step": 76100 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 2.6063296794891357, |
|
"learning_rate": 7.184623115577889e-05, |
|
"loss": 1.0011, |
|
"step": 76200 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 1.7001017332077026, |
|
"learning_rate": 7.154472361809045e-05, |
|
"loss": 1.0172, |
|
"step": 76300 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 2.0134339332580566, |
|
"learning_rate": 7.124321608040201e-05, |
|
"loss": 1.0367, |
|
"step": 76400 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 2.199366807937622, |
|
"learning_rate": 7.094170854271357e-05, |
|
"loss": 1.044, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 2.8991353511810303, |
|
"learning_rate": 7.064020100502511e-05, |
|
"loss": 1.0121, |
|
"step": 76600 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 5.798487663269043, |
|
"learning_rate": 7.033869346733667e-05, |
|
"loss": 0.9734, |
|
"step": 76700 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 2.8960068225860596, |
|
"learning_rate": 7.003718592964823e-05, |
|
"loss": 1.004, |
|
"step": 76800 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 2.980179786682129, |
|
"learning_rate": 6.973567839195979e-05, |
|
"loss": 1.0118, |
|
"step": 76900 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 6.4917988777160645, |
|
"learning_rate": 6.943417085427135e-05, |
|
"loss": 0.9682, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"eval_loss": 1.0282562971115112, |
|
"eval_runtime": 38.0717, |
|
"eval_samples_per_second": 26.266, |
|
"eval_steps_per_second": 3.283, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 2.9224038124084473, |
|
"learning_rate": 6.913266331658291e-05, |
|
"loss": 1.0385, |
|
"step": 77100 |
|
}, |
|
{ |
|
"epoch": 3.86, |
|
"grad_norm": 4.447437763214111, |
|
"learning_rate": 6.883115577889447e-05, |
|
"loss": 1.0388, |
|
"step": 77200 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 2.2013559341430664, |
|
"learning_rate": 6.852964824120603e-05, |
|
"loss": 1.034, |
|
"step": 77300 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 1.3720605373382568, |
|
"learning_rate": 6.822814070351757e-05, |
|
"loss": 1.0512, |
|
"step": 77400 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 2.4448797702789307, |
|
"learning_rate": 6.792663316582913e-05, |
|
"loss": 1.0012, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 4.061469554901123, |
|
"learning_rate": 6.762512562814069e-05, |
|
"loss": 1.0144, |
|
"step": 77600 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 1.62380850315094, |
|
"learning_rate": 6.732361809045225e-05, |
|
"loss": 1.0369, |
|
"step": 77700 |
|
}, |
|
{ |
|
"epoch": 3.89, |
|
"grad_norm": 1.3728336095809937, |
|
"learning_rate": 6.702211055276381e-05, |
|
"loss": 1.0133, |
|
"step": 77800 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 7.0939435958862305, |
|
"learning_rate": 6.672060301507537e-05, |
|
"loss": 0.9797, |
|
"step": 77900 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 2.0842604637145996, |
|
"learning_rate": 6.642211055276381e-05, |
|
"loss": 1.0035, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"eval_loss": 1.0243637561798096, |
|
"eval_runtime": 38.1566, |
|
"eval_samples_per_second": 26.208, |
|
"eval_steps_per_second": 3.276, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"grad_norm": 3.6360020637512207, |
|
"learning_rate": 6.612060301507537e-05, |
|
"loss": 0.9969, |
|
"step": 78100 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"grad_norm": 2.5551681518554688, |
|
"learning_rate": 6.581909547738693e-05, |
|
"loss": 1.0203, |
|
"step": 78200 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 6.86871862411499, |
|
"learning_rate": 6.551758793969849e-05, |
|
"loss": 1.0472, |
|
"step": 78300 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 2.3950083255767822, |
|
"learning_rate": 6.521608040201005e-05, |
|
"loss": 1.0167, |
|
"step": 78400 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 1.422188401222229, |
|
"learning_rate": 6.491457286432161e-05, |
|
"loss": 0.9968, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"grad_norm": 2.186511993408203, |
|
"learning_rate": 6.461306532663317e-05, |
|
"loss": 1.0113, |
|
"step": 78600 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"grad_norm": 1.764722228050232, |
|
"learning_rate": 6.431155778894471e-05, |
|
"loss": 0.983, |
|
"step": 78700 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"grad_norm": 4.928635597229004, |
|
"learning_rate": 6.401005025125627e-05, |
|
"loss": 1.0164, |
|
"step": 78800 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"grad_norm": 2.1061389446258545, |
|
"learning_rate": 6.370854271356783e-05, |
|
"loss": 1.0171, |
|
"step": 78900 |
|
}, |
|
{ |
|
"epoch": 3.95, |
|
"grad_norm": 4.193387985229492, |
|
"learning_rate": 6.340703517587939e-05, |
|
"loss": 1.0072, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 3.95, |
|
"eval_loss": 1.00971519947052, |
|
"eval_runtime": 38.1263, |
|
"eval_samples_per_second": 26.229, |
|
"eval_steps_per_second": 3.279, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 2.4844706058502197, |
|
"learning_rate": 6.310552763819095e-05, |
|
"loss": 1.0064, |
|
"step": 79100 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 5.7934746742248535, |
|
"learning_rate": 6.280402010050251e-05, |
|
"loss": 0.9509, |
|
"step": 79200 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 3.7046196460723877, |
|
"learning_rate": 6.250251256281406e-05, |
|
"loss": 1.0139, |
|
"step": 79300 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"grad_norm": 1.9528000354766846, |
|
"learning_rate": 6.220100502512562e-05, |
|
"loss": 1.0214, |
|
"step": 79400 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"grad_norm": 3.4000682830810547, |
|
"learning_rate": 6.189949748743718e-05, |
|
"loss": 1.006, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"grad_norm": 3.152561664581299, |
|
"learning_rate": 6.159798994974874e-05, |
|
"loss": 1.0288, |
|
"step": 79600 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"grad_norm": 3.774915933609009, |
|
"learning_rate": 6.12964824120603e-05, |
|
"loss": 1.022, |
|
"step": 79700 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"grad_norm": 2.291813373565674, |
|
"learning_rate": 6.0994974874371854e-05, |
|
"loss": 0.9845, |
|
"step": 79800 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 3.019514560699463, |
|
"learning_rate": 6.0693467336683413e-05, |
|
"loss": 1.0246, |
|
"step": 79900 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 2.4409408569335938, |
|
"learning_rate": 6.0391959798994966e-05, |
|
"loss": 0.9951, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 0.9992234110832214, |
|
"eval_runtime": 39.3867, |
|
"eval_samples_per_second": 25.389, |
|
"eval_steps_per_second": 3.174, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 1.4257367849349976, |
|
"learning_rate": 6.0090452261306526e-05, |
|
"loss": 0.9763, |
|
"step": 80100 |
|
}, |
|
{ |
|
"epoch": 4.01, |
|
"grad_norm": 4.97927713394165, |
|
"learning_rate": 5.9788944723618085e-05, |
|
"loss": 0.9417, |
|
"step": 80200 |
|
}, |
|
{ |
|
"epoch": 4.01, |
|
"grad_norm": 2.8552098274230957, |
|
"learning_rate": 5.9487437185929644e-05, |
|
"loss": 0.9591, |
|
"step": 80300 |
|
}, |
|
{ |
|
"epoch": 4.02, |
|
"grad_norm": Infinity, |
|
"learning_rate": 5.9188944723618084e-05, |
|
"loss": 0.9783, |
|
"step": 80400 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"grad_norm": 3.83720064163208, |
|
"learning_rate": 5.8887437185929643e-05, |
|
"loss": 0.9607, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"grad_norm": 2.607973337173462, |
|
"learning_rate": 5.85859296482412e-05, |
|
"loss": 0.9556, |
|
"step": 80600 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"grad_norm": 3.51914381980896, |
|
"learning_rate": 5.8284422110552756e-05, |
|
"loss": 0.9371, |
|
"step": 80700 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"grad_norm": 2.0518856048583984, |
|
"learning_rate": 5.7982914572864315e-05, |
|
"loss": 1.0154, |
|
"step": 80800 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"grad_norm": 3.5824625492095947, |
|
"learning_rate": 5.7681407035175874e-05, |
|
"loss": 0.9894, |
|
"step": 80900 |
|
}, |
|
{ |
|
"epoch": 4.05, |
|
"grad_norm": 7.991865634918213, |
|
"learning_rate": 5.7379899497487434e-05, |
|
"loss": 0.9719, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 4.05, |
|
"eval_loss": 1.0105689764022827, |
|
"eval_runtime": 38.1347, |
|
"eval_samples_per_second": 26.223, |
|
"eval_steps_per_second": 3.278, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 4.05, |
|
"grad_norm": 1.6757104396820068, |
|
"learning_rate": 5.707839195979899e-05, |
|
"loss": 0.9526, |
|
"step": 81100 |
|
}, |
|
{ |
|
"epoch": 4.06, |
|
"grad_norm": 3.1675045490264893, |
|
"learning_rate": 5.677688442211055e-05, |
|
"loss": 0.9798, |
|
"step": 81200 |
|
}, |
|
{ |
|
"epoch": 4.07, |
|
"grad_norm": 2.8390209674835205, |
|
"learning_rate": 5.6475376884422105e-05, |
|
"loss": 0.9455, |
|
"step": 81300 |
|
}, |
|
{ |
|
"epoch": 4.07, |
|
"grad_norm": 2.2900238037109375, |
|
"learning_rate": 5.6173869346733665e-05, |
|
"loss": 1.0016, |
|
"step": 81400 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"grad_norm": 2.4220378398895264, |
|
"learning_rate": 5.5872361809045224e-05, |
|
"loss": 0.9681, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"grad_norm": 2.7175300121307373, |
|
"learning_rate": 5.5570854271356784e-05, |
|
"loss": 0.9822, |
|
"step": 81600 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"grad_norm": 3.7499475479125977, |
|
"learning_rate": 5.526934673366834e-05, |
|
"loss": 0.9501, |
|
"step": 81700 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"grad_norm": 2.1566553115844727, |
|
"learning_rate": 5.4967839195979896e-05, |
|
"loss": 0.9601, |
|
"step": 81800 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"grad_norm": 2.080754280090332, |
|
"learning_rate": 5.466633165829145e-05, |
|
"loss": 0.954, |
|
"step": 81900 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"grad_norm": 3.1466102600097656, |
|
"learning_rate": 5.436482412060301e-05, |
|
"loss": 0.9896, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"eval_loss": 1.0087724924087524, |
|
"eval_runtime": 37.9931, |
|
"eval_samples_per_second": 26.321, |
|
"eval_steps_per_second": 3.29, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 4.11, |
|
"grad_norm": 4.262351989746094, |
|
"learning_rate": 5.406331658291457e-05, |
|
"loss": 0.9454, |
|
"step": 82100 |
|
}, |
|
{ |
|
"epoch": 4.11, |
|
"grad_norm": 1.9488756656646729, |
|
"learning_rate": 5.376180904522612e-05, |
|
"loss": 0.9494, |
|
"step": 82200 |
|
}, |
|
{ |
|
"epoch": 4.12, |
|
"grad_norm": 1.6786818504333496, |
|
"learning_rate": 5.346030150753768e-05, |
|
"loss": 0.9241, |
|
"step": 82300 |
|
}, |
|
{ |
|
"epoch": 4.12, |
|
"grad_norm": 2.143955945968628, |
|
"learning_rate": 5.315879396984924e-05, |
|
"loss": 0.9958, |
|
"step": 82400 |
|
}, |
|
{ |
|
"epoch": 4.12, |
|
"grad_norm": 3.6211471557617188, |
|
"learning_rate": 5.286030150753768e-05, |
|
"loss": 0.9641, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 4.13, |
|
"grad_norm": 4.066643238067627, |
|
"learning_rate": 5.255879396984924e-05, |
|
"loss": 0.9698, |
|
"step": 82600 |
|
}, |
|
{ |
|
"epoch": 4.13, |
|
"grad_norm": 2.151590585708618, |
|
"learning_rate": 5.22572864321608e-05, |
|
"loss": 0.9388, |
|
"step": 82700 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"grad_norm": 4.644803524017334, |
|
"learning_rate": 5.195577889447236e-05, |
|
"loss": 0.9141, |
|
"step": 82800 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"grad_norm": 2.652754068374634, |
|
"learning_rate": 5.1654271356783916e-05, |
|
"loss": 0.9592, |
|
"step": 82900 |
|
}, |
|
{ |
|
"epoch": 4.15, |
|
"grad_norm": 4.528812885284424, |
|
"learning_rate": 5.135276381909547e-05, |
|
"loss": 0.9778, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 4.15, |
|
"eval_loss": 0.9974797368049622, |
|
"eval_runtime": 38.0893, |
|
"eval_samples_per_second": 26.254, |
|
"eval_steps_per_second": 3.282, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"grad_norm": 2.625786542892456, |
|
"learning_rate": 5.105125628140703e-05, |
|
"loss": 0.9594, |
|
"step": 83100 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"grad_norm": 3.7137229442596436, |
|
"learning_rate": 5.074974874371859e-05, |
|
"loss": 0.9462, |
|
"step": 83200 |
|
}, |
|
{ |
|
"epoch": 4.17, |
|
"grad_norm": 6.682472229003906, |
|
"learning_rate": 5.044824120603015e-05, |
|
"loss": 0.9301, |
|
"step": 83300 |
|
}, |
|
{ |
|
"epoch": 4.17, |
|
"grad_norm": 2.7188687324523926, |
|
"learning_rate": 5.014673366834171e-05, |
|
"loss": 0.9801, |
|
"step": 83400 |
|
}, |
|
{ |
|
"epoch": 4.17, |
|
"grad_norm": 2.7037341594696045, |
|
"learning_rate": 4.984522613065326e-05, |
|
"loss": 0.9475, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 4.18, |
|
"grad_norm": 2.815229654312134, |
|
"learning_rate": 4.954371859296482e-05, |
|
"loss": 0.9012, |
|
"step": 83600 |
|
}, |
|
{ |
|
"epoch": 4.18, |
|
"grad_norm": 2.7187130451202393, |
|
"learning_rate": 4.924221105527638e-05, |
|
"loss": 0.9199, |
|
"step": 83700 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"grad_norm": 1.6610496044158936, |
|
"learning_rate": 4.894070351758794e-05, |
|
"loss": 0.9321, |
|
"step": 83800 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"grad_norm": 2.1496291160583496, |
|
"learning_rate": 4.86391959798995e-05, |
|
"loss": 0.9003, |
|
"step": 83900 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"grad_norm": 2.9933974742889404, |
|
"learning_rate": 4.833768844221105e-05, |
|
"loss": 0.9467, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"eval_loss": 0.9802306890487671, |
|
"eval_runtime": 38.0487, |
|
"eval_samples_per_second": 26.282, |
|
"eval_steps_per_second": 3.285, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 4.21, |
|
"grad_norm": 4.368553161621094, |
|
"learning_rate": 4.803618090452261e-05, |
|
"loss": 0.921, |
|
"step": 84100 |
|
}, |
|
{ |
|
"epoch": 4.21, |
|
"grad_norm": 4.087899684906006, |
|
"learning_rate": 4.773467336683417e-05, |
|
"loss": 0.9413, |
|
"step": 84200 |
|
}, |
|
{ |
|
"epoch": 4.21, |
|
"grad_norm": 1.8541690111160278, |
|
"learning_rate": 4.743316582914573e-05, |
|
"loss": 0.9657, |
|
"step": 84300 |
|
}, |
|
{ |
|
"epoch": 4.22, |
|
"grad_norm": 2.6514675617218018, |
|
"learning_rate": 4.713165829145729e-05, |
|
"loss": 0.9645, |
|
"step": 84400 |
|
}, |
|
{ |
|
"epoch": 4.22, |
|
"grad_norm": 3.2329466342926025, |
|
"learning_rate": 4.683015075376885e-05, |
|
"loss": 0.9465, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 4.23, |
|
"grad_norm": 2.358675241470337, |
|
"learning_rate": 4.652864321608039e-05, |
|
"loss": 0.9644, |
|
"step": 84600 |
|
}, |
|
{ |
|
"epoch": 4.24, |
|
"grad_norm": 3.6738836765289307, |
|
"learning_rate": 4.6230150753768846e-05, |
|
"loss": 0.9357, |
|
"step": 84700 |
|
}, |
|
{ |
|
"epoch": 4.24, |
|
"grad_norm": 2.8447327613830566, |
|
"learning_rate": 4.59286432160804e-05, |
|
"loss": 0.9308, |
|
"step": 84800 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 1.6326079368591309, |
|
"learning_rate": 4.562713567839195e-05, |
|
"loss": 0.9068, |
|
"step": 84900 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 2.3545360565185547, |
|
"learning_rate": 4.532562814070351e-05, |
|
"loss": 0.9436, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"eval_loss": 0.9844674468040466, |
|
"eval_runtime": 38.274, |
|
"eval_samples_per_second": 26.127, |
|
"eval_steps_per_second": 3.266, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 3.2402210235595703, |
|
"learning_rate": 4.502412060301507e-05, |
|
"loss": 0.9313, |
|
"step": 85100 |
|
}, |
|
{ |
|
"epoch": 4.26, |
|
"grad_norm": 3.3900952339172363, |
|
"learning_rate": 4.472261306532662e-05, |
|
"loss": 0.9385, |
|
"step": 85200 |
|
}, |
|
{ |
|
"epoch": 4.26, |
|
"grad_norm": 3.8531854152679443, |
|
"learning_rate": 4.442110552763818e-05, |
|
"loss": 0.9292, |
|
"step": 85300 |
|
}, |
|
{ |
|
"epoch": 4.27, |
|
"grad_norm": 2.3123373985290527, |
|
"learning_rate": 4.411959798994974e-05, |
|
"loss": 0.9544, |
|
"step": 85400 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"grad_norm": 2.5710906982421875, |
|
"learning_rate": 4.38180904522613e-05, |
|
"loss": 0.9591, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"grad_norm": 3.4481329917907715, |
|
"learning_rate": 4.351658291457286e-05, |
|
"loss": 0.9281, |
|
"step": 85600 |
|
}, |
|
{ |
|
"epoch": 4.29, |
|
"grad_norm": 1.7887803316116333, |
|
"learning_rate": 4.321507537688442e-05, |
|
"loss": 0.9371, |
|
"step": 85700 |
|
}, |
|
{ |
|
"epoch": 4.29, |
|
"grad_norm": 6.177557945251465, |
|
"learning_rate": 4.291356783919597e-05, |
|
"loss": 0.9154, |
|
"step": 85800 |
|
}, |
|
{ |
|
"epoch": 4.29, |
|
"grad_norm": 3.0554301738739014, |
|
"learning_rate": 4.261206030150753e-05, |
|
"loss": 0.9483, |
|
"step": 85900 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"grad_norm": 2.0133023262023926, |
|
"learning_rate": 4.231055276381909e-05, |
|
"loss": 0.9557, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"eval_loss": 0.9593837261199951, |
|
"eval_runtime": 38.1446, |
|
"eval_samples_per_second": 26.216, |
|
"eval_steps_per_second": 3.277, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"grad_norm": 2.1396610736846924, |
|
"learning_rate": 4.200904522613065e-05, |
|
"loss": 0.9643, |
|
"step": 86100 |
|
}, |
|
{ |
|
"epoch": 4.31, |
|
"grad_norm": 2.709627628326416, |
|
"learning_rate": 4.170753768844221e-05, |
|
"loss": 0.9365, |
|
"step": 86200 |
|
}, |
|
{ |
|
"epoch": 4.32, |
|
"grad_norm": 4.406678199768066, |
|
"learning_rate": 4.1406030150753764e-05, |
|
"loss": 0.9553, |
|
"step": 86300 |
|
}, |
|
{ |
|
"epoch": 4.32, |
|
"grad_norm": 4.822593688964844, |
|
"learning_rate": 4.110452261306532e-05, |
|
"loss": 0.9213, |
|
"step": 86400 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"grad_norm": 4.148794651031494, |
|
"learning_rate": 4.080301507537688e-05, |
|
"loss": 0.9808, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"grad_norm": 3.7028510570526123, |
|
"learning_rate": 4.050150753768844e-05, |
|
"loss": 0.9331, |
|
"step": 86600 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"grad_norm": 2.314500093460083, |
|
"learning_rate": 4.02e-05, |
|
"loss": 0.9551, |
|
"step": 86700 |
|
}, |
|
{ |
|
"epoch": 4.34, |
|
"grad_norm": 3.741234302520752, |
|
"learning_rate": 3.9898492462311554e-05, |
|
"loss": 0.9053, |
|
"step": 86800 |
|
}, |
|
{ |
|
"epoch": 4.34, |
|
"grad_norm": 3.7346441745758057, |
|
"learning_rate": 3.9596984924623113e-05, |
|
"loss": 0.9517, |
|
"step": 86900 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"grad_norm": 1.324827790260315, |
|
"learning_rate": 3.929849246231156e-05, |
|
"loss": 0.9764, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"eval_loss": 1.0139998197555542, |
|
"eval_runtime": 38.1639, |
|
"eval_samples_per_second": 26.203, |
|
"eval_steps_per_second": 3.275, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"grad_norm": 5.19126033782959, |
|
"learning_rate": 3.899698492462311e-05, |
|
"loss": 0.9366, |
|
"step": 87100 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"grad_norm": 2.899726629257202, |
|
"learning_rate": 3.869547738693467e-05, |
|
"loss": 0.9555, |
|
"step": 87200 |
|
}, |
|
{ |
|
"epoch": 4.37, |
|
"grad_norm": 1.9099615812301636, |
|
"learning_rate": 3.839396984924623e-05, |
|
"loss": 0.9033, |
|
"step": 87300 |
|
}, |
|
{ |
|
"epoch": 4.37, |
|
"grad_norm": 1.5814082622528076, |
|
"learning_rate": 3.809246231155779e-05, |
|
"loss": 0.9978, |
|
"step": 87400 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"grad_norm": 3.4520106315612793, |
|
"learning_rate": 3.779095477386935e-05, |
|
"loss": 0.9343, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"grad_norm": 3.0876681804656982, |
|
"learning_rate": 3.74894472361809e-05, |
|
"loss": 0.9094, |
|
"step": 87600 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"grad_norm": 3.5139119625091553, |
|
"learning_rate": 3.718793969849246e-05, |
|
"loss": 0.8677, |
|
"step": 87700 |
|
}, |
|
{ |
|
"epoch": 4.39, |
|
"grad_norm": 2.003330945968628, |
|
"learning_rate": 3.6886432160804015e-05, |
|
"loss": 0.9351, |
|
"step": 87800 |
|
}, |
|
{ |
|
"epoch": 4.39, |
|
"grad_norm": 2.259235382080078, |
|
"learning_rate": 3.6584924623115574e-05, |
|
"loss": 0.9388, |
|
"step": 87900 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 2.2141153812408447, |
|
"learning_rate": 3.6283417085427134e-05, |
|
"loss": 0.9169, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"eval_loss": 0.9528889060020447, |
|
"eval_runtime": 38.0305, |
|
"eval_samples_per_second": 26.295, |
|
"eval_steps_per_second": 3.287, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 4.41, |
|
"grad_norm": 4.264975547790527, |
|
"learning_rate": 3.5981909547738693e-05, |
|
"loss": 0.9309, |
|
"step": 88100 |
|
}, |
|
{ |
|
"epoch": 4.41, |
|
"grad_norm": 4.431647777557373, |
|
"learning_rate": 3.5680402010050246e-05, |
|
"loss": 0.9035, |
|
"step": 88200 |
|
}, |
|
{ |
|
"epoch": 4.42, |
|
"grad_norm": 2.326883316040039, |
|
"learning_rate": 3.5378894472361806e-05, |
|
"loss": 0.904, |
|
"step": 88300 |
|
}, |
|
{ |
|
"epoch": 4.42, |
|
"grad_norm": 2.6951944828033447, |
|
"learning_rate": 3.5077386934673365e-05, |
|
"loss": 0.9195, |
|
"step": 88400 |
|
}, |
|
{ |
|
"epoch": 4.42, |
|
"grad_norm": 1.8017208576202393, |
|
"learning_rate": 3.477587939698492e-05, |
|
"loss": 0.9398, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 4.43, |
|
"grad_norm": 3.8392789363861084, |
|
"learning_rate": 3.447437185929648e-05, |
|
"loss": 0.9591, |
|
"step": 88600 |
|
}, |
|
{ |
|
"epoch": 4.43, |
|
"grad_norm": 2.541273593902588, |
|
"learning_rate": 3.4172864321608037e-05, |
|
"loss": 0.9054, |
|
"step": 88700 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 2.7736191749572754, |
|
"learning_rate": 3.3874371859296476e-05, |
|
"loss": 0.9473, |
|
"step": 88800 |
|
}, |
|
{ |
|
"epoch": 4.45, |
|
"grad_norm": 2.660540819168091, |
|
"learning_rate": 3.3572864321608036e-05, |
|
"loss": 0.9582, |
|
"step": 88900 |
|
}, |
|
{ |
|
"epoch": 4.45, |
|
"grad_norm": 3.161513328552246, |
|
"learning_rate": 3.3271356783919595e-05, |
|
"loss": 0.8943, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 4.45, |
|
"eval_loss": 0.9552559852600098, |
|
"eval_runtime": 38.1158, |
|
"eval_samples_per_second": 26.236, |
|
"eval_steps_per_second": 3.279, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 4.881318092346191, |
|
"learning_rate": 3.2969849246231154e-05, |
|
"loss": 0.9053, |
|
"step": 89100 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 1.7572602033615112, |
|
"learning_rate": 3.2668341708542714e-05, |
|
"loss": 0.9364, |
|
"step": 89200 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 3.067507743835449, |
|
"learning_rate": 3.2366834170854267e-05, |
|
"loss": 0.9355, |
|
"step": 89300 |
|
}, |
|
{ |
|
"epoch": 4.47, |
|
"grad_norm": 3.1982858180999756, |
|
"learning_rate": 3.2065326633165826e-05, |
|
"loss": 0.9333, |
|
"step": 89400 |
|
}, |
|
{ |
|
"epoch": 4.47, |
|
"grad_norm": 3.596789598464966, |
|
"learning_rate": 3.1763819095477385e-05, |
|
"loss": 0.8978, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"grad_norm": 5.035818576812744, |
|
"learning_rate": 3.1462311557788945e-05, |
|
"loss": 0.9337, |
|
"step": 89600 |
|
}, |
|
{ |
|
"epoch": 4.49, |
|
"grad_norm": 3.149653673171997, |
|
"learning_rate": 3.11608040201005e-05, |
|
"loss": 0.9515, |
|
"step": 89700 |
|
}, |
|
{ |
|
"epoch": 4.49, |
|
"grad_norm": 3.4601404666900635, |
|
"learning_rate": 3.085929648241206e-05, |
|
"loss": 0.9021, |
|
"step": 89800 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 2.6317124366760254, |
|
"learning_rate": 3.0557788944723616e-05, |
|
"loss": 0.9559, |
|
"step": 89900 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 2.667861223220825, |
|
"learning_rate": 3.0256281407035173e-05, |
|
"loss": 0.9341, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"eval_loss": 0.9440233111381531, |
|
"eval_runtime": 38.0809, |
|
"eval_samples_per_second": 26.26, |
|
"eval_steps_per_second": 3.282, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 3.903172016143799, |
|
"learning_rate": 2.9954773869346732e-05, |
|
"loss": 0.8857, |
|
"step": 90100 |
|
}, |
|
{ |
|
"epoch": 4.51, |
|
"grad_norm": 3.9286229610443115, |
|
"learning_rate": 2.9653266331658288e-05, |
|
"loss": 0.9119, |
|
"step": 90200 |
|
}, |
|
{ |
|
"epoch": 4.51, |
|
"grad_norm": 2.812256336212158, |
|
"learning_rate": 2.9351758793969847e-05, |
|
"loss": 0.9026, |
|
"step": 90300 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"grad_norm": 2.2835099697113037, |
|
"learning_rate": 2.9050251256281404e-05, |
|
"loss": 0.885, |
|
"step": 90400 |
|
}, |
|
{ |
|
"epoch": 4.53, |
|
"grad_norm": 3.383111000061035, |
|
"learning_rate": 2.8748743718592963e-05, |
|
"loss": 0.8838, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 4.53, |
|
"grad_norm": 2.7682292461395264, |
|
"learning_rate": 2.8447236180904522e-05, |
|
"loss": 0.9139, |
|
"step": 90600 |
|
}, |
|
{ |
|
"epoch": 4.54, |
|
"grad_norm": 6.3915019035339355, |
|
"learning_rate": 2.814572864321608e-05, |
|
"loss": 0.9188, |
|
"step": 90700 |
|
}, |
|
{ |
|
"epoch": 4.54, |
|
"grad_norm": 5.53504753112793, |
|
"learning_rate": 2.7844221105527635e-05, |
|
"loss": 0.9118, |
|
"step": 90800 |
|
}, |
|
{ |
|
"epoch": 4.54, |
|
"grad_norm": 2.5919177532196045, |
|
"learning_rate": 2.754271356783919e-05, |
|
"loss": 0.8844, |
|
"step": 90900 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"grad_norm": 1.9481797218322754, |
|
"learning_rate": 2.724120603015075e-05, |
|
"loss": 0.9192, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"eval_loss": 0.9217103123664856, |
|
"eval_runtime": 38.1169, |
|
"eval_samples_per_second": 26.235, |
|
"eval_steps_per_second": 3.279, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"grad_norm": 2.1429965496063232, |
|
"learning_rate": 2.693969849246231e-05, |
|
"loss": 0.8889, |
|
"step": 91100 |
|
}, |
|
{ |
|
"epoch": 4.56, |
|
"grad_norm": 3.4818546772003174, |
|
"learning_rate": 2.6638190954773866e-05, |
|
"loss": 0.8932, |
|
"step": 91200 |
|
}, |
|
{ |
|
"epoch": 4.56, |
|
"grad_norm": 2.3813984394073486, |
|
"learning_rate": 2.6336683417085425e-05, |
|
"loss": 0.9154, |
|
"step": 91300 |
|
}, |
|
{ |
|
"epoch": 4.57, |
|
"grad_norm": 2.4688570499420166, |
|
"learning_rate": 2.6035175879396984e-05, |
|
"loss": 0.9344, |
|
"step": 91400 |
|
}, |
|
{ |
|
"epoch": 4.58, |
|
"grad_norm": 4.330790996551514, |
|
"learning_rate": 2.573366834170854e-05, |
|
"loss": 0.9137, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 4.58, |
|
"grad_norm": 2.8123939037323, |
|
"learning_rate": 2.54321608040201e-05, |
|
"loss": 0.9041, |
|
"step": 91600 |
|
}, |
|
{ |
|
"epoch": 4.58, |
|
"grad_norm": 2.1815638542175293, |
|
"learning_rate": 2.5130653266331656e-05, |
|
"loss": 0.8606, |
|
"step": 91700 |
|
}, |
|
{ |
|
"epoch": 4.59, |
|
"grad_norm": 3.3489341735839844, |
|
"learning_rate": 2.4829145728643216e-05, |
|
"loss": 0.934, |
|
"step": 91800 |
|
}, |
|
{ |
|
"epoch": 4.59, |
|
"grad_norm": 2.9650094509124756, |
|
"learning_rate": 2.4527638190954775e-05, |
|
"loss": 0.8893, |
|
"step": 91900 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 3.541456460952759, |
|
"learning_rate": 2.4226130653266328e-05, |
|
"loss": 0.9239, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"eval_loss": 0.9656698107719421, |
|
"eval_runtime": 38.5991, |
|
"eval_samples_per_second": 25.907, |
|
"eval_steps_per_second": 3.238, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 4.61, |
|
"grad_norm": 3.1648945808410645, |
|
"learning_rate": 2.3924623115577887e-05, |
|
"loss": 0.8777, |
|
"step": 92100 |
|
}, |
|
{ |
|
"epoch": 4.61, |
|
"grad_norm": 8.632335662841797, |
|
"learning_rate": 2.3623115577889443e-05, |
|
"loss": 0.9047, |
|
"step": 92200 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"grad_norm": 2.9412002563476562, |
|
"learning_rate": 2.3321608040201003e-05, |
|
"loss": 0.8964, |
|
"step": 92300 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"grad_norm": 2.7501888275146484, |
|
"learning_rate": 2.3020100502512562e-05, |
|
"loss": 0.9303, |
|
"step": 92400 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"grad_norm": 3.36631178855896, |
|
"learning_rate": 2.2718592964824118e-05, |
|
"loss": 0.8987, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 4.63, |
|
"grad_norm": 2.6061251163482666, |
|
"learning_rate": 2.2417085427135678e-05, |
|
"loss": 0.8981, |
|
"step": 92600 |
|
}, |
|
{ |
|
"epoch": 4.63, |
|
"grad_norm": 3.9636521339416504, |
|
"learning_rate": 2.2115577889447234e-05, |
|
"loss": 0.893, |
|
"step": 92700 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"grad_norm": 3.2085049152374268, |
|
"learning_rate": 2.1814070351758793e-05, |
|
"loss": 0.9298, |
|
"step": 92800 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"grad_norm": 2.590059995651245, |
|
"learning_rate": 2.1512562814070353e-05, |
|
"loss": 0.9118, |
|
"step": 92900 |
|
}, |
|
{ |
|
"epoch": 4.65, |
|
"grad_norm": 4.868690013885498, |
|
"learning_rate": 2.121105527638191e-05, |
|
"loss": 0.8873, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 4.65, |
|
"eval_loss": 0.918121337890625, |
|
"eval_runtime": 38.3542, |
|
"eval_samples_per_second": 26.073, |
|
"eval_steps_per_second": 3.259, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 4.66, |
|
"grad_norm": 4.0143303871154785, |
|
"learning_rate": 2.0909547738693465e-05, |
|
"loss": 0.871, |
|
"step": 93100 |
|
}, |
|
{ |
|
"epoch": 4.66, |
|
"grad_norm": 4.423349857330322, |
|
"learning_rate": 2.060804020100502e-05, |
|
"loss": 0.9232, |
|
"step": 93200 |
|
}, |
|
{ |
|
"epoch": 4.67, |
|
"grad_norm": 3.6609606742858887, |
|
"learning_rate": 2.030653266331658e-05, |
|
"loss": 0.8782, |
|
"step": 93300 |
|
}, |
|
{ |
|
"epoch": 4.67, |
|
"grad_norm": 3.252089738845825, |
|
"learning_rate": 2.0008040201005026e-05, |
|
"loss": 0.9232, |
|
"step": 93400 |
|
}, |
|
{ |
|
"epoch": 4.67, |
|
"grad_norm": 2.8783979415893555, |
|
"learning_rate": 1.970653266331658e-05, |
|
"loss": 0.8539, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"grad_norm": 5.381927967071533, |
|
"learning_rate": 1.940502512562814e-05, |
|
"loss": 0.9263, |
|
"step": 93600 |
|
}, |
|
{ |
|
"epoch": 4.69, |
|
"grad_norm": 3.1031525135040283, |
|
"learning_rate": 1.9103517587939695e-05, |
|
"loss": 0.9095, |
|
"step": 93700 |
|
}, |
|
{ |
|
"epoch": 4.69, |
|
"grad_norm": 2.668039321899414, |
|
"learning_rate": 1.8802010050251254e-05, |
|
"loss": 0.892, |
|
"step": 93800 |
|
}, |
|
{ |
|
"epoch": 4.7, |
|
"grad_norm": 2.6661875247955322, |
|
"learning_rate": 1.8500502512562814e-05, |
|
"loss": 0.8944, |
|
"step": 93900 |
|
}, |
|
{ |
|
"epoch": 4.7, |
|
"grad_norm": 3.5291526317596436, |
|
"learning_rate": 1.819899497487437e-05, |
|
"loss": 0.9074, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 4.7, |
|
"eval_loss": 0.9208371639251709, |
|
"eval_runtime": 38.4003, |
|
"eval_samples_per_second": 26.041, |
|
"eval_steps_per_second": 3.255, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 4.71, |
|
"grad_norm": 4.160482883453369, |
|
"learning_rate": 1.789748743718593e-05, |
|
"loss": 0.9045, |
|
"step": 94100 |
|
}, |
|
{ |
|
"epoch": 4.71, |
|
"grad_norm": 3.8051962852478027, |
|
"learning_rate": 1.7595979899497485e-05, |
|
"loss": 0.899, |
|
"step": 94200 |
|
}, |
|
{ |
|
"epoch": 4.71, |
|
"grad_norm": 3.431490898132324, |
|
"learning_rate": 1.7294472361809045e-05, |
|
"loss": 0.8577, |
|
"step": 94300 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"grad_norm": 2.356250524520874, |
|
"learning_rate": 1.69929648241206e-05, |
|
"loss": 0.9204, |
|
"step": 94400 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"grad_norm": 5.237595081329346, |
|
"learning_rate": 1.669145728643216e-05, |
|
"loss": 0.8973, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"grad_norm": 5.023568153381348, |
|
"learning_rate": 1.6389949748743716e-05, |
|
"loss": 0.9064, |
|
"step": 94600 |
|
}, |
|
{ |
|
"epoch": 4.74, |
|
"grad_norm": 6.610247611999512, |
|
"learning_rate": 1.6088442211055276e-05, |
|
"loss": 0.858, |
|
"step": 94700 |
|
}, |
|
{ |
|
"epoch": 4.74, |
|
"grad_norm": 2.1937615871429443, |
|
"learning_rate": 1.5786934673366835e-05, |
|
"loss": 0.872, |
|
"step": 94800 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 4.40328311920166, |
|
"learning_rate": 1.548542713567839e-05, |
|
"loss": 0.88, |
|
"step": 94900 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 3.0487658977508545, |
|
"learning_rate": 1.5183919597989947e-05, |
|
"loss": 0.8779, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"eval_loss": 0.9459323883056641, |
|
"eval_runtime": 38.1338, |
|
"eval_samples_per_second": 26.223, |
|
"eval_steps_per_second": 3.278, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 3.8922808170318604, |
|
"learning_rate": 1.4882412060301507e-05, |
|
"loss": 0.9075, |
|
"step": 95100 |
|
}, |
|
{ |
|
"epoch": 4.76, |
|
"grad_norm": 3.232625722885132, |
|
"learning_rate": 1.4580904522613064e-05, |
|
"loss": 0.869, |
|
"step": 95200 |
|
}, |
|
{ |
|
"epoch": 4.76, |
|
"grad_norm": 8.73833179473877, |
|
"learning_rate": 1.4279396984924622e-05, |
|
"loss": 0.8741, |
|
"step": 95300 |
|
}, |
|
{ |
|
"epoch": 4.77, |
|
"grad_norm": 4.5711846351623535, |
|
"learning_rate": 1.397788944723618e-05, |
|
"loss": 0.8976, |
|
"step": 95400 |
|
}, |
|
{ |
|
"epoch": 4.78, |
|
"grad_norm": 4.647241115570068, |
|
"learning_rate": 1.3676381909547736e-05, |
|
"loss": 0.8392, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 4.78, |
|
"grad_norm": 4.90078067779541, |
|
"learning_rate": 1.337788944723618e-05, |
|
"loss": 0.8739, |
|
"step": 95600 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"grad_norm": 3.1595067977905273, |
|
"learning_rate": 1.3076381909547738e-05, |
|
"loss": 0.8398, |
|
"step": 95700 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"grad_norm": 2.488835096359253, |
|
"learning_rate": 1.2774874371859296e-05, |
|
"loss": 0.868, |
|
"step": 95800 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"grad_norm": 4.495543003082275, |
|
"learning_rate": 1.2473366834170852e-05, |
|
"loss": 0.8872, |
|
"step": 95900 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"grad_norm": 3.673161268234253, |
|
"learning_rate": 1.217185929648241e-05, |
|
"loss": 0.8824, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"eval_loss": 0.910308301448822, |
|
"eval_runtime": 38.0891, |
|
"eval_samples_per_second": 26.254, |
|
"eval_steps_per_second": 3.282, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"grad_norm": 5.159984111785889, |
|
"learning_rate": 1.187035175879397e-05, |
|
"loss": 0.8672, |
|
"step": 96100 |
|
}, |
|
{ |
|
"epoch": 4.81, |
|
"grad_norm": 2.706937551498413, |
|
"learning_rate": 1.1568844221105527e-05, |
|
"loss": 0.8914, |
|
"step": 96200 |
|
}, |
|
{ |
|
"epoch": 4.81, |
|
"grad_norm": 3.727692127227783, |
|
"learning_rate": 1.1267336683417085e-05, |
|
"loss": 0.8485, |
|
"step": 96300 |
|
}, |
|
{ |
|
"epoch": 4.82, |
|
"grad_norm": 2.665670156478882, |
|
"learning_rate": 1.0965829145728641e-05, |
|
"loss": 0.8695, |
|
"step": 96400 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"grad_norm": 5.077518463134766, |
|
"learning_rate": 1.0664321608040199e-05, |
|
"loss": 0.8767, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"grad_norm": 3.4337048530578613, |
|
"learning_rate": 1.0362814070351758e-05, |
|
"loss": 0.8673, |
|
"step": 96600 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"grad_norm": 3.231494665145874, |
|
"learning_rate": 1.0061306532663316e-05, |
|
"loss": 0.8767, |
|
"step": 96700 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"grad_norm": 4.2955002784729, |
|
"learning_rate": 9.759798994974874e-06, |
|
"loss": 0.8645, |
|
"step": 96800 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"grad_norm": 6.2070698738098145, |
|
"learning_rate": 9.458291457286431e-06, |
|
"loss": 0.8683, |
|
"step": 96900 |
|
}, |
|
{ |
|
"epoch": 4.85, |
|
"grad_norm": 3.6267805099487305, |
|
"learning_rate": 9.159798994974874e-06, |
|
"loss": 0.907, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 4.85, |
|
"eval_loss": 0.9255304932594299, |
|
"eval_runtime": 38.1396, |
|
"eval_samples_per_second": 26.219, |
|
"eval_steps_per_second": 3.277, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 4.86, |
|
"grad_norm": 4.985959529876709, |
|
"learning_rate": 8.858291457286432e-06, |
|
"loss": 0.8615, |
|
"step": 97100 |
|
}, |
|
{ |
|
"epoch": 4.86, |
|
"grad_norm": 4.538032531738281, |
|
"learning_rate": 8.556783919597988e-06, |
|
"loss": 0.8519, |
|
"step": 97200 |
|
}, |
|
{ |
|
"epoch": 4.87, |
|
"grad_norm": 6.562105178833008, |
|
"learning_rate": 8.255276381909548e-06, |
|
"loss": 0.8888, |
|
"step": 97300 |
|
}, |
|
{ |
|
"epoch": 4.87, |
|
"grad_norm": 2.922360897064209, |
|
"learning_rate": 7.953768844221105e-06, |
|
"loss": 0.8784, |
|
"step": 97400 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"grad_norm": 3.8349783420562744, |
|
"learning_rate": 7.652261306532663e-06, |
|
"loss": 0.8962, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"grad_norm": 2.096787929534912, |
|
"learning_rate": 7.350753768844221e-06, |
|
"loss": 0.9088, |
|
"step": 97600 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"grad_norm": 2.512312650680542, |
|
"learning_rate": 7.0492462311557786e-06, |
|
"loss": 0.8816, |
|
"step": 97700 |
|
}, |
|
{ |
|
"epoch": 4.89, |
|
"grad_norm": 4.749015808105469, |
|
"learning_rate": 6.7477386934673355e-06, |
|
"loss": 0.8791, |
|
"step": 97800 |
|
}, |
|
{ |
|
"epoch": 4.89, |
|
"grad_norm": 3.5753800868988037, |
|
"learning_rate": 6.446231155778894e-06, |
|
"loss": 0.8414, |
|
"step": 97900 |
|
}, |
|
{ |
|
"epoch": 4.9, |
|
"grad_norm": 2.849839210510254, |
|
"learning_rate": 6.144723618090452e-06, |
|
"loss": 0.873, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 4.9, |
|
"eval_loss": 0.8922821283340454, |
|
"eval_runtime": 38.1228, |
|
"eval_samples_per_second": 26.231, |
|
"eval_steps_per_second": 3.279, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 4.91, |
|
"grad_norm": 4.473388195037842, |
|
"learning_rate": 5.8432160804020096e-06, |
|
"loss": 0.8428, |
|
"step": 98100 |
|
}, |
|
{ |
|
"epoch": 4.91, |
|
"grad_norm": 2.7943496704101562, |
|
"learning_rate": 5.541708542713567e-06, |
|
"loss": 0.8519, |
|
"step": 98200 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 2.476835012435913, |
|
"learning_rate": 5.240201005025126e-06, |
|
"loss": 0.8841, |
|
"step": 98300 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 4.992676258087158, |
|
"learning_rate": 4.938693467336683e-06, |
|
"loss": 0.8409, |
|
"step": 98400 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 2.4756906032562256, |
|
"learning_rate": 4.637185929648241e-06, |
|
"loss": 0.8527, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 4.93, |
|
"grad_norm": 2.157059669494629, |
|
"learning_rate": 4.335678391959798e-06, |
|
"loss": 0.8605, |
|
"step": 98600 |
|
}, |
|
{ |
|
"epoch": 4.94, |
|
"grad_norm": 2.8840818405151367, |
|
"learning_rate": 4.034170854271356e-06, |
|
"loss": 0.87, |
|
"step": 98700 |
|
}, |
|
{ |
|
"epoch": 4.94, |
|
"grad_norm": 4.124537944793701, |
|
"learning_rate": 3.7326633165829143e-06, |
|
"loss": 0.8318, |
|
"step": 98800 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"grad_norm": 4.684917449951172, |
|
"learning_rate": 3.431155778894472e-06, |
|
"loss": 0.8479, |
|
"step": 98900 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"grad_norm": 2.413602590560913, |
|
"learning_rate": 3.12964824120603e-06, |
|
"loss": 0.8452, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"eval_loss": 0.8957632780075073, |
|
"eval_runtime": 38.1658, |
|
"eval_samples_per_second": 26.201, |
|
"eval_steps_per_second": 3.275, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 4.96, |
|
"grad_norm": 3.240213394165039, |
|
"learning_rate": 2.828140703517588e-06, |
|
"loss": 0.8303, |
|
"step": 99100 |
|
}, |
|
{ |
|
"epoch": 4.96, |
|
"grad_norm": 4.0827555656433105, |
|
"learning_rate": 2.5266331658291453e-06, |
|
"loss": 0.8872, |
|
"step": 99200 |
|
}, |
|
{ |
|
"epoch": 4.96, |
|
"grad_norm": 2.948489189147949, |
|
"learning_rate": 2.2251256281407035e-06, |
|
"loss": 0.8707, |
|
"step": 99300 |
|
}, |
|
{ |
|
"epoch": 4.97, |
|
"grad_norm": 6.414693832397461, |
|
"learning_rate": 1.9236180904522612e-06, |
|
"loss": 0.837, |
|
"step": 99400 |
|
}, |
|
{ |
|
"epoch": 4.97, |
|
"grad_norm": 5.013907432556152, |
|
"learning_rate": 1.622110552763819e-06, |
|
"loss": 0.8443, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 4.98, |
|
"grad_norm": 2.487205743789673, |
|
"learning_rate": 1.3206030150753765e-06, |
|
"loss": 0.8425, |
|
"step": 99600 |
|
}, |
|
{ |
|
"epoch": 4.99, |
|
"grad_norm": 5.77063512802124, |
|
"learning_rate": 1.0190954773869345e-06, |
|
"loss": 0.8509, |
|
"step": 99700 |
|
}, |
|
{ |
|
"epoch": 4.99, |
|
"grad_norm": 3.125368356704712, |
|
"learning_rate": 7.175879396984924e-07, |
|
"loss": 0.8874, |
|
"step": 99800 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 8.932684898376465, |
|
"learning_rate": 4.160804020100502e-07, |
|
"loss": 0.858, |
|
"step": 99900 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 5.0273756980896, |
|
"learning_rate": 1.1457286432160803e-07, |
|
"loss": 0.8394, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 0.9212185144424438, |
|
"eval_runtime": 38.102, |
|
"eval_samples_per_second": 26.245, |
|
"eval_steps_per_second": 3.281, |
|
"step": 100000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 100000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 1000, |
|
"total_flos": 1.2076594495488e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
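
The state above has the standard shape of a Hugging Face Transformers trainer_state.json: "log_history" interleaves training entries (loss, grad_norm, learning_rate, logged every logging_steps=100) with evaluation entries (eval_loss and eval throughput, every eval_steps=1000), keyed apart by which fields are present. A minimal sketch of separating the two curves for inspection or plotting; the file path is an assumption, the key names are taken from the log itself:

import json

# Path is an assumption; point this at wherever the trainer state was saved.
with open("trainer_state.json") as f:
    state = json.load(f)

# Note: this run logged "grad_norm": Infinity at step 80400; Python's json
# module accepts that non-standard literal by default and returns float("inf").
history = state["log_history"]
train = [(e["step"], e["loss"]) for e in history if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in history if "eval_loss" in e]

print(f"{len(train)} train points, {len(evals)} eval points")
print("final eval_loss:", evals[-1][1])  # 0.9212... at step 100000

With logging_steps=100 and max_steps=100000 this yields 1000 training points and 100 evaluation points, ending at the eval_loss of 0.9212185144424438 recorded at step 100000 above.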
|
|