{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.92,
  "eval_steps": 500,
  "global_step": 310,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.032, "grad_norm": 0.3297976851463318, "learning_rate": 0.0002990322580645161, "loss": 1.0389, "step": 1},
    {"epoch": 0.064, "grad_norm": 0.4069916307926178, "learning_rate": 0.0002980645161290322, "loss": 1.3377, "step": 2},
    {"epoch": 0.096, "grad_norm": 0.42084500193595886, "learning_rate": 0.00029709677419354836, "loss": 0.9366, "step": 3},
    {"epoch": 0.128, "grad_norm": 0.4641948938369751, "learning_rate": 0.0002961290322580645, "loss": 1.0086, "step": 4},
    {"epoch": 0.16, "grad_norm": 0.3840750455856323, "learning_rate": 0.00029516129032258065, "loss": 0.8333, "step": 5},
    {"epoch": 0.192, "grad_norm": 0.4263865053653717, "learning_rate": 0.00029419354838709674, "loss": 0.854, "step": 6},
    {"epoch": 0.224, "grad_norm": 0.48615148663520813, "learning_rate": 0.0002932258064516129, "loss": 0.9548, "step": 7},
    {"epoch": 0.256, "grad_norm": 0.44419369101524353, "learning_rate": 0.00029225806451612903, "loss": 0.8482, "step": 8},
    {"epoch": 0.288, "grad_norm": 0.5317733883857727, "learning_rate": 0.0002912903225806451, "loss": 0.9426, "step": 9},
    {"epoch": 0.32, "grad_norm": 0.47260937094688416, "learning_rate": 0.00029032258064516127, "loss": 0.9816, "step": 10},
    {"epoch": 0.352, "grad_norm": 0.39063283801078796, "learning_rate": 0.00028935483870967736, "loss": 0.84, "step": 11},
    {"epoch": 0.384, "grad_norm": 0.39234670996665955, "learning_rate": 0.0002883870967741935, "loss": 0.7476, "step": 12},
    {"epoch": 0.416, "grad_norm": 0.40661805868148804, "learning_rate": 0.00028741935483870965, "loss": 0.9282, "step": 13},
    {"epoch": 0.448, "grad_norm": 0.42970865964889526, "learning_rate": 0.0002864516129032258, "loss": 0.7858, "step": 14},
    {"epoch": 0.48, "grad_norm": 0.3780193626880646, "learning_rate": 0.00028548387096774194, "loss": 0.7968, "step": 15},
    {"epoch": 0.512, "grad_norm": 0.37006014585494995, "learning_rate": 0.00028451612903225803, "loss": 0.6801, "step": 16},
    {"epoch": 0.544, "grad_norm": 0.3660840392112732, "learning_rate": 0.0002835483870967742, "loss": 0.5914, "step": 17},
    {"epoch": 0.576, "grad_norm": 0.3270975351333618, "learning_rate": 0.00028258064516129027, "loss": 0.6449, "step": 18},
    {"epoch": 0.608, "grad_norm": 0.3859024941921234, "learning_rate": 0.0002816129032258064, "loss": 0.8144, "step": 19},
    {"epoch": 0.64, "grad_norm": 0.37092071771621704, "learning_rate": 0.00028064516129032256, "loss": 0.7667, "step": 20},
    {"epoch": 0.672, "grad_norm": 0.37667015194892883, "learning_rate": 0.0002796774193548387, "loss": 0.7751, "step": 21},
    {"epoch": 0.704, "grad_norm": 0.3832458555698395, "learning_rate": 0.0002787096774193548, "loss": 0.755, "step": 22},
    {"epoch": 0.736, "grad_norm": 0.327288419008255, "learning_rate": 0.00027774193548387095, "loss": 0.7178, "step": 23},
    {"epoch": 0.768, "grad_norm": 0.34552687406539917, "learning_rate": 0.0002767741935483871, "loss": 0.7057, "step": 24},
    {"epoch": 0.8, "grad_norm": 0.3611259460449219, "learning_rate": 0.0002758064516129032, "loss": 0.8159, "step": 25},
    {"epoch": 0.832, "grad_norm": 0.3345054090023041, "learning_rate": 0.00027483870967741933, "loss": 0.7208, "step": 26},
    {"epoch": 0.864, "grad_norm": 0.3697254955768585, "learning_rate": 0.0002738709677419355, "loss": 0.8964, "step": 27},
    {"epoch": 0.896, "grad_norm": 0.3905017375946045, "learning_rate": 0.00027290322580645157, "loss": 0.7794, "step": 28},
    {"epoch": 0.928, "grad_norm": 0.3715725243091583, "learning_rate": 0.0002719354838709677, "loss": 0.6966, "step": 29},
    {"epoch": 0.96, "grad_norm": 0.3650343120098114, "learning_rate": 0.00027096774193548386, "loss": 0.5761, "step": 30},
    {"epoch": 0.992, "grad_norm": 0.33932459354400635, "learning_rate": 0.00027, "loss": 0.556, "step": 31},
    {"epoch": 1.024, "grad_norm": 0.6371742486953735, "learning_rate": 0.0002690322580645161, "loss": 0.847, "step": 32},
    {"epoch": 1.056, "grad_norm": 0.37499895691871643, "learning_rate": 0.00026806451612903224, "loss": 0.8419, "step": 33},
    {"epoch": 1.088, "grad_norm": 0.33221954107284546, "learning_rate": 0.0002670967741935484, "loss": 0.6011, "step": 34},
    {"epoch": 1.12, "grad_norm": 0.344096839427948, "learning_rate": 0.0002661290322580645, "loss": 0.6501, "step": 35},
    {"epoch": 1.152, "grad_norm": 0.38429391384124756, "learning_rate": 0.0002651612903225806, "loss": 0.8091, "step": 36},
    {"epoch": 1.184, "grad_norm": 0.38014867901802063, "learning_rate": 0.00026419354838709677, "loss": 0.7668, "step": 37},
    {"epoch": 1.216, "grad_norm": 0.3352573812007904, "learning_rate": 0.00026322580645161286, "loss": 0.5444, "step": 38},
    {"epoch": 1.248, "grad_norm": 0.33811062574386597, "learning_rate": 0.000262258064516129, "loss": 0.512, "step": 39},
    {"epoch": 1.28, "grad_norm": 0.3998416066169739, "learning_rate": 0.00026129032258064515, "loss": 0.6315, "step": 40},
    {"epoch": 1.312, "grad_norm": 0.3983341157436371, "learning_rate": 0.0002603225806451613, "loss": 0.5882, "step": 41},
    {"epoch": 1.3439999999999999, "grad_norm": 0.4585898816585541, "learning_rate": 0.0002593548387096774, "loss": 0.761, "step": 42},
    {"epoch": 1.376, "grad_norm": 0.4080730080604553, "learning_rate": 0.00025838709677419354, "loss": 0.6716, "step": 43},
    {"epoch": 1.408, "grad_norm": 0.4068273901939392, "learning_rate": 0.0002574193548387096, "loss": 0.6376, "step": 44},
    {"epoch": 1.44, "grad_norm": 0.4406949579715729, "learning_rate": 0.00025645161290322577, "loss": 0.4594, "step": 45},
    {"epoch": 1.472, "grad_norm": 0.34500986337661743, "learning_rate": 0.0002554838709677419, "loss": 0.3672, "step": 46},
    {"epoch": 1.504, "grad_norm": 0.4760681390762329, "learning_rate": 0.00025451612903225806, "loss": 0.6331, "step": 47},
    {"epoch": 1.536, "grad_norm": 0.39281558990478516, "learning_rate": 0.0002535483870967742, "loss": 0.5845, "step": 48},
    {"epoch": 1.568, "grad_norm": 0.4265002906322479, "learning_rate": 0.0002525806451612903, "loss": 0.4461, "step": 49},
    {"epoch": 1.6, "grad_norm": 0.40967294573783875, "learning_rate": 0.00025161290322580645, "loss": 0.7011, "step": 50},
    {"epoch": 1.6320000000000001, "grad_norm": 0.4288088381290436, "learning_rate": 0.00025064516129032254, "loss": 0.6928, "step": 51},
    {"epoch": 1.6640000000000001, "grad_norm": 0.4356289803981781, "learning_rate": 0.0002496774193548387, "loss": 0.7972, "step": 52},
    {"epoch": 1.696, "grad_norm": 0.3827487826347351, "learning_rate": 0.0002487096774193548, "loss": 0.2991, "step": 53},
    {"epoch": 1.728, "grad_norm": 0.40093398094177246, "learning_rate": 0.0002477419354838709, "loss": 0.416, "step": 54},
    {"epoch": 1.76, "grad_norm": 0.41548973321914673, "learning_rate": 0.00024677419354838707, "loss": 0.5501, "step": 55},
    {"epoch": 1.792, "grad_norm": 0.4093388617038727, "learning_rate": 0.0002458064516129032, "loss": 0.5557, "step": 56},
    {"epoch": 1.8239999999999998, "grad_norm": 0.3934040665626526, "learning_rate": 0.00024483870967741936, "loss": 0.602, "step": 57},
    {"epoch": 1.8559999999999999, "grad_norm": 0.42221033573150635, "learning_rate": 0.00024387096774193545, "loss": 0.6421, "step": 58},
    {"epoch": 1.888, "grad_norm": 0.4351339340209961, "learning_rate": 0.0002429032258064516, "loss": 0.5615, "step": 59},
    {"epoch": 1.92, "grad_norm": 0.4319838881492615, "learning_rate": 0.00024193548387096771, "loss": 0.6804, "step": 60},
    {"epoch": 1.952, "grad_norm": 0.40016525983810425, "learning_rate": 0.00024096774193548386, "loss": 0.5432, "step": 61},
    {"epoch": 1.984, "grad_norm": 0.3905942440032959, "learning_rate": 0.00023999999999999998, "loss": 0.4187, "step": 62},
    {"epoch": 2.016, "grad_norm": 0.8056382536888123, "learning_rate": 0.0002390322580645161, "loss": 1.0174, "step": 63},
    {"epoch": 2.048, "grad_norm": 0.3835236430168152, "learning_rate": 0.00023806451612903224, "loss": 0.5992, "step": 64},
    {"epoch": 2.08, "grad_norm": 0.41092216968536377, "learning_rate": 0.00023709677419354836, "loss": 0.4746, "step": 65},
    {"epoch": 2.112, "grad_norm": 0.39536622166633606, "learning_rate": 0.0002361290322580645, "loss": 0.3946, "step": 66},
    {"epoch": 2.144, "grad_norm": 0.3927665948867798, "learning_rate": 0.0002351612903225806, "loss": 0.5187, "step": 67},
    {"epoch": 2.176, "grad_norm": 0.39792704582214355, "learning_rate": 0.00023419354838709674, "loss": 0.4568, "step": 68},
    {"epoch": 2.208, "grad_norm": 0.5023652911186218, "learning_rate": 0.0002332258064516129, "loss": 0.6166, "step": 69},
    {"epoch": 2.24, "grad_norm": 0.425017774105072, "learning_rate": 0.000232258064516129, "loss": 0.42, "step": 70},
    {"epoch": 2.2720000000000002, "grad_norm": 0.46458110213279724, "learning_rate": 0.00023129032258064516, "loss": 0.4613, "step": 71},
    {"epoch": 2.304, "grad_norm": 0.49037960171699524, "learning_rate": 0.00023032258064516125, "loss": 0.5509, "step": 72},
    {"epoch": 2.336, "grad_norm": 0.5233697891235352, "learning_rate": 0.0002293548387096774, "loss": 0.6396, "step": 73},
    {"epoch": 2.368, "grad_norm": 0.4720582962036133, "learning_rate": 0.0002283870967741935, "loss": 0.5076, "step": 74},
    {"epoch": 2.4, "grad_norm": 0.4900650382041931, "learning_rate": 0.00022741935483870966, "loss": 0.4794, "step": 75},
    {"epoch": 2.432, "grad_norm": 0.6321704983711243, "learning_rate": 0.0002264516129032258, "loss": 0.6677, "step": 76},
    {"epoch": 2.464, "grad_norm": 0.5305324792861938, "learning_rate": 0.00022548387096774192, "loss": 0.5102, "step": 77},
    {"epoch": 2.496, "grad_norm": 0.5799248218536377, "learning_rate": 0.00022451612903225804, "loss": 0.5274, "step": 78},
    {"epoch": 2.528, "grad_norm": 0.4990101456642151, "learning_rate": 0.00022354838709677416, "loss": 0.5407, "step": 79},
    {"epoch": 2.56, "grad_norm": 0.4779827296733856, "learning_rate": 0.0002225806451612903, "loss": 0.5166, "step": 80},
    {"epoch": 2.592, "grad_norm": 0.5140111446380615, "learning_rate": 0.00022161290322580645, "loss": 0.3288, "step": 81},
    {"epoch": 2.624, "grad_norm": 0.5674853920936584, "learning_rate": 0.00022064516129032257, "loss": 0.666, "step": 82},
    {"epoch": 2.656, "grad_norm": 0.5277597308158875, "learning_rate": 0.00021967741935483871, "loss": 0.5335, "step": 83},
    {"epoch": 2.6879999999999997, "grad_norm": 0.6029439568519592, "learning_rate": 0.0002187096774193548, "loss": 0.693, "step": 84},
    {"epoch": 2.7199999999999998, "grad_norm": 0.5039327144622803, "learning_rate": 0.00021774193548387095, "loss": 0.5728, "step": 85},
    {"epoch": 2.752, "grad_norm": 0.5564692616462708, "learning_rate": 0.00021677419354838707, "loss": 0.4734, "step": 86},
    {"epoch": 2.784, "grad_norm": 0.5278319120407104, "learning_rate": 0.00021580645161290322, "loss": 0.5834, "step": 87},
    {"epoch": 2.816, "grad_norm": 0.5445135831832886, "learning_rate": 0.00021483870967741936, "loss": 0.4642, "step": 88},
    {"epoch": 2.848, "grad_norm": 0.5394749045372009, "learning_rate": 0.00021387096774193545, "loss": 0.4779, "step": 89},
    {"epoch": 2.88, "grad_norm": 0.5756134390830994, "learning_rate": 0.0002129032258064516, "loss": 0.5607, "step": 90},
    {"epoch": 2.912, "grad_norm": 0.48361241817474365, "learning_rate": 0.00021193548387096772, "loss": 0.4278, "step": 91},
    {"epoch": 2.944, "grad_norm": 0.5017121434211731, "learning_rate": 0.00021096774193548386, "loss": 0.4834, "step": 92},
    {"epoch": 2.976, "grad_norm": 0.4741989076137543, "learning_rate": 0.00020999999999999998, "loss": 0.468, "step": 93},
    {"epoch": 3.008, "grad_norm": 1.003368854522705, "learning_rate": 0.0002090322580645161, "loss": 0.8614, "step": 94},
    {"epoch": 3.04, "grad_norm": 0.4782228469848633, "learning_rate": 0.00020806451612903225, "loss": 0.4111, "step": 95},
    {"epoch": 3.072, "grad_norm": 0.4558674395084381, "learning_rate": 0.00020709677419354836, "loss": 0.3463, "step": 96},
    {"epoch": 3.104, "grad_norm": 0.4409371316432953, "learning_rate": 0.0002061290322580645, "loss": 0.2571, "step": 97},
    {"epoch": 3.136, "grad_norm": 0.5415034890174866, "learning_rate": 0.00020516129032258063, "loss": 0.5707, "step": 98},
    {"epoch": 3.168, "grad_norm": 0.6157724857330322, "learning_rate": 0.00020419354838709677, "loss": 0.5692, "step": 99},
    {"epoch": 3.2, "grad_norm": 0.4855688810348511, "learning_rate": 0.00020322580645161287, "loss": 0.3311, "step": 100},
    {"epoch": 3.232, "grad_norm": 0.569878101348877, "learning_rate": 0.000202258064516129, "loss": 0.4707, "step": 101},
    {"epoch": 3.2640000000000002, "grad_norm": 0.645232081413269, "learning_rate": 0.00020129032258064516, "loss": 0.5504, "step": 102},
    {"epoch": 3.296, "grad_norm": 0.5775763392448425, "learning_rate": 0.00020032258064516128, "loss": 0.3651, "step": 103},
    {"epoch": 3.328, "grad_norm": 0.5808250904083252, "learning_rate": 0.00019935483870967742, "loss": 0.5068, "step": 104},
    {"epoch": 3.36, "grad_norm": 0.689313530921936, "learning_rate": 0.0001983870967741935, "loss": 0.4936, "step": 105},
    {"epoch": 3.392, "grad_norm": 0.6571519374847412, "learning_rate": 0.00019741935483870966, "loss": 0.3671, "step": 106},
    {"epoch": 3.424, "grad_norm": 0.6340517401695251, "learning_rate": 0.00019645161290322578, "loss": 0.4783, "step": 107},
    {"epoch": 3.456, "grad_norm": 0.7031407952308655, "learning_rate": 0.00019548387096774192, "loss": 0.427, "step": 108},
    {"epoch": 3.488, "grad_norm": 0.728496789932251, "learning_rate": 0.00019451612903225807, "loss": 0.5497, "step": 109},
    {"epoch": 3.52, "grad_norm": 0.6106727719306946, "learning_rate": 0.00019354838709677416, "loss": 0.392, "step": 110},
    {"epoch": 3.552, "grad_norm": 0.5296047329902649, "learning_rate": 0.0001925806451612903, "loss": 0.3412, "step": 111},
    {"epoch": 3.584, "grad_norm": 0.6282025575637817, "learning_rate": 0.00019161290322580643, "loss": 0.4081, "step": 112},
    {"epoch": 3.616, "grad_norm": 0.6166461110115051, "learning_rate": 0.00019064516129032257, "loss": 0.4771, "step": 113},
    {"epoch": 3.648, "grad_norm": 0.5448863506317139, "learning_rate": 0.0001896774193548387, "loss": 0.404, "step": 114},
    {"epoch": 3.68, "grad_norm": 0.6598389148712158, "learning_rate": 0.0001887096774193548, "loss": 0.3915, "step": 115},
    {"epoch": 3.7119999999999997, "grad_norm": 0.5567564368247986, "learning_rate": 0.00018774193548387095, "loss": 0.3862, "step": 116},
    {"epoch": 3.7439999999999998, "grad_norm": 0.6524521708488464, "learning_rate": 0.00018677419354838707, "loss": 0.5315, "step": 117},
    {"epoch": 3.776, "grad_norm": 0.7040128707885742, "learning_rate": 0.00018580645161290322, "loss": 0.5387, "step": 118},
    {"epoch": 3.808, "grad_norm": 0.690262496471405, "learning_rate": 0.00018483870967741934, "loss": 0.4877, "step": 119},
    {"epoch": 3.84, "grad_norm": 0.6928034424781799, "learning_rate": 0.00018387096774193548, "loss": 0.4895, "step": 120},
    {"epoch": 3.872, "grad_norm": 0.7148469686508179, "learning_rate": 0.00018290322580645157, "loss": 0.4814, "step": 121},
    {"epoch": 3.904, "grad_norm": 0.6096572875976562, "learning_rate": 0.00018193548387096772, "loss": 0.3403, "step": 122},
    {"epoch": 3.936, "grad_norm": 0.7132399678230286, "learning_rate": 0.00018096774193548387, "loss": 0.4258, "step": 123},
    {"epoch": 3.968, "grad_norm": 0.7302684187889099, "learning_rate": 0.00017999999999999998, "loss": 0.7215, "step": 124},
    {"epoch": 4.0, "grad_norm": 1.5244004726409912, "learning_rate": 0.00017903225806451613, "loss": 0.8544, "step": 125},
    {"epoch": 4.032, "grad_norm": 0.6032777428627014, "learning_rate": 0.00017806451612903222, "loss": 0.4183, "step": 126},
    {"epoch": 4.064, "grad_norm": 0.6349691152572632, "learning_rate": 0.00017709677419354837, "loss": 0.5871, "step": 127},
    {"epoch": 4.096, "grad_norm": 0.5730060935020447, "learning_rate": 0.00017612903225806449, "loss": 0.3786, "step": 128},
    {"epoch": 4.128, "grad_norm": 0.6988044381141663, "learning_rate": 0.00017516129032258063, "loss": 0.3216, "step": 129},
    {"epoch": 4.16, "grad_norm": 0.7379153370857239, "learning_rate": 0.00017419354838709678, "loss": 0.4026, "step": 130},
    {"epoch": 4.192, "grad_norm": 0.7058238983154297, "learning_rate": 0.00017322580645161287, "loss": 0.4328, "step": 131},
    {"epoch": 4.224, "grad_norm": 0.80663001537323, "learning_rate": 0.00017225806451612901, "loss": 0.3849, "step": 132},
    {"epoch": 4.256, "grad_norm": 0.899818480014801, "learning_rate": 0.00017129032258064513, "loss": 0.4191, "step": 133},
    {"epoch": 4.288, "grad_norm": 0.8538224697113037, "learning_rate": 0.00017032258064516128, "loss": 0.3587, "step": 134},
    {"epoch": 4.32, "grad_norm": 0.8948169350624084, "learning_rate": 0.00016935483870967742, "loss": 0.3957, "step": 135},
    {"epoch": 4.352, "grad_norm": 0.7195591926574707, "learning_rate": 0.00016838709677419354, "loss": 0.3361, "step": 136},
    {"epoch": 4.384, "grad_norm": 0.7769681215286255, "learning_rate": 0.00016741935483870966, "loss": 0.3519, "step": 137},
    {"epoch": 4.416, "grad_norm": 0.9509867429733276, "learning_rate": 0.00016645161290322578, "loss": 0.4216, "step": 138},
    {"epoch": 4.448, "grad_norm": 0.7923309206962585, "learning_rate": 0.00016548387096774193, "loss": 0.3999, "step": 139},
    {"epoch": 4.48, "grad_norm": 0.8961685299873352, "learning_rate": 0.00016451612903225804, "loss": 0.5385, "step": 140},
    {"epoch": 4.5120000000000005, "grad_norm": 0.7496562004089355, "learning_rate": 0.0001635483870967742, "loss": 0.341, "step": 141},
    {"epoch": 4.5440000000000005, "grad_norm": 0.8512839674949646, "learning_rate": 0.00016258064516129034, "loss": 0.3847, "step": 142},
    {"epoch": 4.576, "grad_norm": 0.7487362027168274, "learning_rate": 0.00016161290322580643, "loss": 0.3694, "step": 143},
    {"epoch": 4.608, "grad_norm": 0.7957774996757507, "learning_rate": 0.00016064516129032257, "loss": 0.3379, "step": 144},
    {"epoch": 4.64, "grad_norm": 0.7299221754074097, "learning_rate": 0.0001596774193548387, "loss": 0.2989, "step": 145},
    {"epoch": 4.672, "grad_norm": 0.7909884452819824, "learning_rate": 0.00015870967741935484, "loss": 0.3675, "step": 146},
    {"epoch": 4.704, "grad_norm": 0.7321597933769226, "learning_rate": 0.00015774193548387093, "loss": 0.3243, "step": 147},
    {"epoch": 4.736, "grad_norm": 0.7196181416511536, "learning_rate": 0.00015677419354838708, "loss": 0.2709, "step": 148},
    {"epoch": 4.768, "grad_norm": 0.7918142676353455, "learning_rate": 0.00015580645161290322, "loss": 0.3934, "step": 149},
    {"epoch": 4.8, "grad_norm": 0.8657622337341309, "learning_rate": 0.00015483870967741934, "loss": 0.3583, "step": 150},
    {"epoch": 4.832, "grad_norm": 0.8207722306251526, "learning_rate": 0.00015387096774193549, "loss": 0.412, "step": 151},
    {"epoch": 4.864, "grad_norm": 0.7206109166145325, "learning_rate": 0.00015290322580645158, "loss": 0.3594, "step": 152},
    {"epoch": 4.896, "grad_norm": 0.8529183864593506, "learning_rate": 0.00015193548387096772, "loss": 0.512, "step": 153},
    {"epoch": 4.928, "grad_norm": 0.6895930171012878, "learning_rate": 0.00015096774193548384, "loss": 0.333, "step": 154},
    {"epoch": 4.96, "grad_norm": 0.7422910332679749, "learning_rate": 0.00015, "loss": 0.2872, "step": 155},
    {"epoch": 4.992, "grad_norm": 0.7366386651992798, "learning_rate": 0.0001490322580645161, "loss": 0.3415, "step": 156},
    {"epoch": 5.024, "grad_norm": 2.1416280269622803, "learning_rate": 0.00014806451612903225, "loss": 0.9961, "step": 157},
    {"epoch": 5.056, "grad_norm": 0.7944900393486023, "learning_rate": 0.00014709677419354837, "loss": 0.3372, "step": 158},
    {"epoch": 5.088, "grad_norm": 0.7071006298065186, "learning_rate": 0.00014612903225806452, "loss": 0.2732, "step": 159},
    {"epoch": 5.12, "grad_norm": 0.7874396443367004, "learning_rate": 0.00014516129032258063, "loss": 0.2861, "step": 160},
    {"epoch": 5.152, "grad_norm": 0.8244249224662781, "learning_rate": 0.00014419354838709675, "loss": 0.3428, "step": 161},
    {"epoch": 5.184, "grad_norm": 0.81637042760849, "learning_rate": 0.0001432258064516129, "loss": 0.3037, "step": 162},
    {"epoch": 5.216, "grad_norm": 0.9916559457778931, "learning_rate": 0.00014225806451612902, "loss": 0.3337, "step": 163},
    {"epoch": 5.248, "grad_norm": 0.9077599048614502, "learning_rate": 0.00014129032258064514, "loss": 0.287, "step": 164},
    {"epoch": 5.28, "grad_norm": 0.9824132919311523, "learning_rate": 0.00014032258064516128, "loss": 0.3852, "step": 165},
    {"epoch": 5.312, "grad_norm": 1.0016467571258545, "learning_rate": 0.0001393548387096774, "loss": 0.3234, "step": 166},
    {"epoch": 5.344, "grad_norm": 0.8697543144226074, "learning_rate": 0.00013838709677419355, "loss": 0.2848, "step": 167},
    {"epoch": 5.376, "grad_norm": 0.8214029669761658, "learning_rate": 0.00013741935483870966, "loss": 0.3377, "step": 168},
    {"epoch": 5.408, "grad_norm": 0.9105691313743591, "learning_rate": 0.00013645161290322578, "loss": 0.2944, "step": 169},
    {"epoch": 5.44, "grad_norm": 0.9642040133476257, "learning_rate": 0.00013548387096774193, "loss": 0.3624, "step": 170},
    {"epoch": 5.4719999999999995, "grad_norm": 0.9218887686729431, "learning_rate": 0.00013451612903225805, "loss": 0.3938, "step": 171},
    {"epoch": 5.504, "grad_norm": 0.8704710006713867, "learning_rate": 0.0001335483870967742, "loss": 0.3629, "step": 172},
    {"epoch": 5.536, "grad_norm": 0.8207693099975586, "learning_rate": 0.0001325806451612903, "loss": 0.3169, "step": 173},
    {"epoch": 5.568, "grad_norm": 0.9315701127052307, "learning_rate": 0.00013161290322580643, "loss": 0.429, "step": 174},
    {"epoch": 5.6, "grad_norm": 0.860234260559082, "learning_rate": 0.00013064516129032258, "loss": 0.3842, "step": 175},
    {"epoch": 5.632, "grad_norm": 0.8927604556083679, "learning_rate": 0.0001296774193548387, "loss": 0.3405, "step": 176},
    {"epoch": 5.664, "grad_norm": 0.8084587454795837, "learning_rate": 0.0001287096774193548, "loss": 0.306, "step": 177},
    {"epoch": 5.696, "grad_norm": 0.9102941155433655, "learning_rate": 0.00012774193548387096, "loss": 0.3285, "step": 178},
    {"epoch": 5.728, "grad_norm": 0.763113796710968, "learning_rate": 0.0001267741935483871, "loss": 0.2729, "step": 179},
    {"epoch": 5.76, "grad_norm": 0.8704251646995544, "learning_rate": 0.00012580645161290322, "loss": 0.3164, "step": 180},
    {"epoch": 5.792, "grad_norm": 0.9634932279586792, "learning_rate": 0.00012483870967741934, "loss": 0.2939, "step": 181},
    {"epoch": 5.824, "grad_norm": 1.1567790508270264, "learning_rate": 0.00012387096774193546, "loss": 0.3076, "step": 182},
    {"epoch": 5.856, "grad_norm": 0.9096764922142029, "learning_rate": 0.0001229032258064516, "loss": 0.3289, "step": 183},
    {"epoch": 5.888, "grad_norm": 0.9840425848960876, "learning_rate": 0.00012193548387096773, "loss": 0.2772, "step": 184},
    {"epoch": 5.92, "grad_norm": 0.725844144821167, "learning_rate": 0.00012096774193548386, "loss": 0.2151, "step": 185},
    {"epoch": 5.952, "grad_norm": 0.8343638181686401, "learning_rate": 0.00011999999999999999, "loss": 0.3825, "step": 186},
    {"epoch": 5.984, "grad_norm": 0.8040199279785156, "learning_rate": 0.00011903225806451612, "loss": 0.2571, "step": 187},
    {"epoch": 6.016, "grad_norm": 1.6932090520858765, "learning_rate": 0.00011806451612903225, "loss": 0.5538, "step": 188},
    {"epoch": 6.048, "grad_norm": 0.744048535823822, "learning_rate": 0.00011709677419354837, "loss": 0.2335, "step": 189},
    {"epoch": 6.08, "grad_norm": 0.6974924206733704, "learning_rate": 0.0001161290322580645, "loss": 0.2891, "step": 190},
    {"epoch": 6.112, "grad_norm": 0.7202953696250916, "learning_rate": 0.00011516129032258062, "loss": 0.2017, "step": 191},
    {"epoch": 6.144, "grad_norm": 0.8437547087669373, "learning_rate": 0.00011419354838709676, "loss": 0.2175, "step": 192},
    {"epoch": 6.176, "grad_norm": 1.0741796493530273, "learning_rate": 0.0001132258064516129, "loss": 0.3913, "step": 193},
    {"epoch": 6.208, "grad_norm": 1.031754493713379, "learning_rate": 0.00011225806451612902, "loss": 0.298, "step": 194},
    {"epoch": 6.24, "grad_norm": 0.9575178027153015, "learning_rate": 0.00011129032258064515, "loss": 0.3201, "step": 195},
    {"epoch": 6.272, "grad_norm": 0.9503082633018494, "learning_rate": 0.00011032258064516128, "loss": 0.2005, "step": 196},
    {"epoch": 6.304, "grad_norm": 1.2572892904281616, "learning_rate": 0.0001093548387096774, "loss": 0.3045, "step": 197},
    {"epoch": 6.336, "grad_norm": 1.5667368173599243, "learning_rate": 0.00010838709677419353, "loss": 0.4053, "step": 198},
    {"epoch": 6.368, "grad_norm": 0.9439151883125305, "learning_rate": 0.00010741935483870968, "loss": 0.2721, "step": 199},
    {"epoch": 6.4, "grad_norm": 1.0985567569732666, "learning_rate": 0.0001064516129032258, "loss": 0.2543, "step": 200},
    {"epoch": 6.432, "grad_norm": 0.789880633354187, "learning_rate": 0.00010548387096774193, "loss": 0.2148, "step": 201},
    {"epoch": 6.464, "grad_norm": 0.9937541484832764, "learning_rate": 0.00010451612903225805, "loss": 0.2343, "step": 202},
    {"epoch": 6.496, "grad_norm": 0.9496509432792664, "learning_rate": 0.00010354838709677418, "loss": 0.2576, "step": 203},
    {"epoch": 6.5280000000000005, "grad_norm": 0.9214590191841125, "learning_rate": 0.00010258064516129031, "loss": 0.3067, "step": 204},
    {"epoch": 6.5600000000000005, "grad_norm": 0.8984239101409912, "learning_rate": 0.00010161290322580643, "loss": 0.2471, "step": 205},
    {"epoch": 6.592, "grad_norm": 0.8055192232131958, "learning_rate": 0.00010064516129032258, "loss": 0.2234, "step": 206},
    {"epoch": 6.624, "grad_norm": 0.769008219242096, "learning_rate": 9.967741935483871e-05, "loss": 0.1963, "step": 207},
    {"epoch": 6.656, "grad_norm": 0.7947174310684204, "learning_rate": 9.870967741935483e-05, "loss": 0.2165, "step": 208},
    {"epoch": 6.688, "grad_norm": 1.0192420482635498, "learning_rate": 9.774193548387096e-05, "loss": 0.2581, "step": 209},
    {"epoch": 6.72, "grad_norm": 1.0067439079284668, "learning_rate": 9.677419354838708e-05, "loss": 0.2394, "step": 210},
    {"epoch": 6.752, "grad_norm": 1.0539058446884155, "learning_rate": 9.580645161290321e-05, "loss": 0.2526, "step": 211},
    {"epoch": 6.784, "grad_norm": 1.130011796951294, "learning_rate": 9.483870967741934e-05, "loss": 0.3339, "step": 212},
    {"epoch": 6.816, "grad_norm": 0.9603860378265381, "learning_rate": 9.387096774193548e-05, "loss": 0.2808, "step": 213},
    {"epoch": 6.848, "grad_norm": 1.0667173862457275, "learning_rate": 9.290322580645161e-05, "loss": 0.3025, "step": 214},
    {"epoch": 6.88, "grad_norm": 0.9093402624130249, "learning_rate": 9.193548387096774e-05, "loss": 0.2698, "step": 215},
    {"epoch": 6.912, "grad_norm": 0.8621392846107483, "learning_rate": 9.096774193548386e-05, "loss": 0.2259, "step": 216},
    {"epoch": 6.944, "grad_norm": 1.035175085067749, "learning_rate": 8.999999999999999e-05, "loss": 0.3156, "step": 217},
    {"epoch": 6.976, "grad_norm": 1.0241689682006836, "learning_rate": 8.903225806451611e-05, "loss": 0.2723, "step": 218},
    {"epoch": 7.008, "grad_norm": 1.735946536064148, "learning_rate": 8.806451612903224e-05, "loss": 0.411, "step": 219},
    {"epoch": 7.04, "grad_norm": 0.8678178191184998, "learning_rate": 8.709677419354839e-05, "loss": 0.2415, "step": 220},
    {"epoch": 7.072, "grad_norm": 0.7134645581245422, "learning_rate": 8.612903225806451e-05, "loss": 0.1509, "step": 221},
    {"epoch": 7.104, "grad_norm": 0.8543497920036316, "learning_rate": 8.516129032258064e-05, "loss": 0.2459, "step": 222},
    {"epoch": 7.136, "grad_norm": 0.9644029140472412, "learning_rate": 8.419354838709677e-05, "loss": 0.2828, "step": 223},
    {"epoch": 7.168, "grad_norm": 0.8568740487098694, "learning_rate": 8.322580645161289e-05, "loss": 0.1936, "step": 224},
    {"epoch": 7.2, "grad_norm": 1.005867600440979, "learning_rate": 8.225806451612902e-05, "loss": 0.2678, "step": 225},
    {"epoch": 7.232, "grad_norm": 0.9942033290863037, "learning_rate": 8.129032258064517e-05, "loss": 0.2111, "step": 226},
    {"epoch": 7.264, "grad_norm": 0.9886007905006409, "learning_rate": 8.032258064516129e-05, "loss": 0.2375, "step": 227},
    {"epoch": 7.296, "grad_norm": 1.0586844682693481, "learning_rate": 7.935483870967742e-05, "loss": 0.2385, "step": 228},
    {"epoch": 7.328, "grad_norm": 1.026432991027832, "learning_rate": 7.838709677419354e-05, "loss": 0.2139, "step": 229},
    {"epoch": 7.36, "grad_norm": 1.0039665699005127, "learning_rate": 7.741935483870967e-05, "loss": 0.2211, "step": 230},
    {"epoch": 7.392, "grad_norm": 1.1125057935714722, "learning_rate": 7.645161290322579e-05, "loss": 0.2725, "step": 231},
    {"epoch": 7.424, "grad_norm": 0.9078079462051392, "learning_rate": 7.548387096774192e-05, "loss": 0.1965, "step": 232},
    {"epoch": 7.456, "grad_norm": 0.8247030377388, "learning_rate": 7.451612903225805e-05, "loss": 0.1502, "step": 233},
    {"epoch": 7.4879999999999995, "grad_norm": 1.1396474838256836, "learning_rate": 7.354838709677418e-05, "loss": 0.37, "step": 234},
    {"epoch": 7.52, "grad_norm": 0.753663182258606, "learning_rate": 7.258064516129032e-05, "loss": 0.1627, "step": 235},
    {"epoch": 7.552, "grad_norm": 0.7927701473236084, "learning_rate": 7.161290322580645e-05, "loss": 0.1684, "step": 236},
    {"epoch": 7.584, "grad_norm": 0.9258756637573242, "learning_rate": 7.064516129032257e-05, "loss": 0.213, "step": 237},
    {"epoch": 7.616, "grad_norm": 0.8111560940742493, "learning_rate": 6.96774193548387e-05, "loss": 0.1998, "step": 238},
    {"epoch": 7.648, "grad_norm": 0.8484370708465576, "learning_rate": 6.870967741935483e-05, "loss": 0.1307, "step": 239},
    {"epoch": 7.68, "grad_norm": 0.9123087525367737, "learning_rate": 6.774193548387096e-05, "loss": 0.2529, "step": 240},
    {"epoch": 7.712, "grad_norm": 1.0526336431503296, "learning_rate": 6.67741935483871e-05, "loss": 0.2468, "step": 241},
    {"epoch": 7.744, "grad_norm": 1.0104210376739502, "learning_rate": 6.580645161290322e-05, "loss": 0.23, "step": 242},
    {"epoch": 7.776, "grad_norm": 0.8749745488166809, "learning_rate": 6.483870967741935e-05, "loss": 0.1973, "step": 243},
    {"epoch": 7.808, "grad_norm": 0.9921355247497559, "learning_rate": 6.387096774193548e-05, "loss": 0.2144, "step": 244},
    {"epoch": 7.84, "grad_norm": 0.8243810534477234, "learning_rate": 6.290322580645161e-05, "loss": 0.1531, "step": 245},
    {"epoch": 7.872, "grad_norm": 1.0764353275299072, "learning_rate": 6.193548387096773e-05, "loss": 0.2763, "step": 246},
    {"epoch": 7.904, "grad_norm": 1.1754212379455566, "learning_rate": 6.096774193548386e-05, "loss": 0.2249, "step": 247},
    {"epoch": 7.936, "grad_norm": 0.8588422536849976, "learning_rate": 5.9999999999999995e-05, "loss": 0.1782, "step": 248},
    {"epoch": 7.968, "grad_norm": 1.045143961906433, "learning_rate": 5.903225806451613e-05, "loss": 0.2789, "step": 249},
    {"epoch": 8.0, "grad_norm": 1.9824038743972778, "learning_rate": 5.806451612903225e-05, "loss": 0.3057, "step": 250},
    {"epoch": 8.032, "grad_norm": 0.9252362847328186, "learning_rate": 5.709677419354838e-05, "loss": 0.2221, "step": 251},
    {"epoch": 8.064, "grad_norm": 0.8381021022796631, "learning_rate": 5.612903225806451e-05, "loss": 0.2639, "step": 252},
    {"epoch": 8.096, "grad_norm": 0.9777012467384338, "learning_rate": 5.516129032258064e-05, "loss": 0.1533, "step": 253},
    {"epoch": 8.128, "grad_norm": 0.8053516745567322, "learning_rate": 5.419354838709677e-05, "loss": 0.1883, "step": 254},
    {"epoch": 8.16, "grad_norm": 0.8703336119651794, "learning_rate": 5.32258064516129e-05, "loss": 0.2079, "step": 255},
    {"epoch": 8.192, "grad_norm": 0.8113718032836914, "learning_rate": 5.2258064516129025e-05, "loss": 0.1609, "step": 256},
    {"epoch": 8.224, "grad_norm": 1.0667418241500854, "learning_rate": 5.129032258064516e-05, "loss": 0.2544, "step": 257},
    {"epoch": 8.256, "grad_norm": 0.7853135466575623, "learning_rate": 5.032258064516129e-05, "loss": 0.1391, "step": 258},
    {"epoch": 8.288, "grad_norm": 0.9970865845680237, "learning_rate": 4.9354838709677415e-05, "loss": 0.2305, "step": 259},
    {"epoch": 8.32, "grad_norm": 12.063047409057617, "learning_rate": 4.838709677419354e-05, "loss": 0.189, "step": 260},
    {"epoch": 8.352, "grad_norm": 1.2325772047042847, "learning_rate": 4.741935483870967e-05, "loss": 0.2308, "step": 261},
    {"epoch": 8.384, "grad_norm": 1.1118851900100708, "learning_rate": 4.6451612903225805e-05, "loss": 0.2009, "step": 262},
    {"epoch": 8.416, "grad_norm": 1.0783390998840332, "learning_rate": 4.548387096774193e-05, "loss": 0.2276, "step": 263},
    {"epoch": 8.448, "grad_norm": 1.2127933502197266, "learning_rate": 4.4516129032258055e-05, "loss": 0.2046, "step": 264},
    {"epoch": 8.48, "grad_norm": 1.1135843992233276, "learning_rate": 4.3548387096774194e-05, "loss": 0.1791, "step": 265},
    {"epoch": 8.512, "grad_norm": 0.8666661381721497, "learning_rate": 4.258064516129032e-05, "loss": 0.1287, "step": 266},
    {"epoch": 8.544, "grad_norm": 0.8430101275444031, "learning_rate": 4.1612903225806445e-05, "loss": 0.1475, "step": 267},
    {"epoch": 8.576, "grad_norm": 0.7744110822677612, "learning_rate": 4.0645161290322584e-05, "loss": 0.1458, "step": 268},
    {"epoch": 8.608, "grad_norm": 1.4067776203155518, "learning_rate": 3.967741935483871e-05, "loss": 0.2189, "step": 269},
    {"epoch": 8.64, "grad_norm": 0.8347670435905457, "learning_rate": 3.8709677419354835e-05, "loss": 0.1602, "step": 270},
    {"epoch": 8.672, "grad_norm": 0.7643276453018188, "learning_rate": 3.774193548387096e-05, "loss": 0.1363, "step": 271},
    {"epoch": 8.704, "grad_norm": 0.898059606552124, "learning_rate": 3.677419354838709e-05, "loss": 0.156, "step": 272},
    {"epoch": 8.736, "grad_norm": 0.8416333198547363, "learning_rate": 3.5806451612903225e-05, "loss": 0.1754, "step": 273},
    {"epoch": 8.768, "grad_norm": 0.8691906929016113, "learning_rate": 3.483870967741935e-05, "loss": 0.1808, "step": 274},
    {"epoch": 8.8, "grad_norm": 1.062111496925354, "learning_rate": 3.387096774193548e-05, "loss": 0.2559, "step": 275},
    {"epoch": 8.832, "grad_norm": 0.881698727607727, "learning_rate": 3.290322580645161e-05, "loss": 0.1732, "step": 276},
    {"epoch": 8.864, "grad_norm": 0.8446074724197388, "learning_rate": 3.193548387096774e-05, "loss": 0.1833, "step": 277},
    {"epoch": 8.896, "grad_norm": 0.9393475651741028, "learning_rate": 3.0967741935483865e-05, "loss": 0.2165, "step": 278},
    {"epoch": 8.928, "grad_norm": 0.8838346004486084, "learning_rate": 2.9999999999999997e-05, "loss": 0.146, "step": 279},
    {"epoch": 8.96, "grad_norm": 0.8380343914031982, "learning_rate": 2.9032258064516126e-05, "loss": 0.1721, "step": 280},
    {"epoch": 8.992, "grad_norm": 0.8561931252479553, "learning_rate": 2.8064516129032255e-05, "loss": 0.1519, "step": 281},
    {"epoch": 9.024, "grad_norm": 1.6088253259658813, "learning_rate": 2.7096774193548384e-05, "loss": 0.2658, "step": 282},
    {"epoch": 9.056, "grad_norm": 0.8154093027114868, "learning_rate": 2.6129032258064513e-05, "loss": 0.1693, "step": 283},
    {"epoch": 9.088, "grad_norm": 0.7722072005271912, "learning_rate": 2.5161290322580645e-05, "loss": 0.1853, "step": 284},
    {"epoch": 9.12, "grad_norm": 0.8294870257377625, "learning_rate": 2.419354838709677e-05, "loss": 0.1736, "step": 285},
    {"epoch": 9.152, "grad_norm": 0.7481442093849182, "learning_rate": 2.3225806451612902e-05, "loss": 0.1544, "step": 286},
    {"epoch": 9.184, "grad_norm": 0.923413872718811, "learning_rate": 2.2258064516129028e-05, "loss": 0.2162, "step": 287},
    {"epoch": 9.216, "grad_norm": 0.8326953053474426, "learning_rate": 2.129032258064516e-05, "loss": 0.1926, "step": 288},
    {"epoch": 9.248, "grad_norm": 0.7642485499382019, "learning_rate": 2.0322580645161292e-05, "loss": 0.1555, "step": 289},
    {"epoch": 9.28, "grad_norm": 0.7902241945266724, "learning_rate": 1.9354838709677417e-05, "loss": 0.1459, "step": 290},
    {"epoch": 9.312, "grad_norm": 0.7414844036102295, "learning_rate": 1.8387096774193546e-05, "loss": 0.1425, "step": 291},
    {"epoch": 9.344, "grad_norm": 0.7870174646377563, "learning_rate": 1.7419354838709675e-05, "loss": 0.1853, "step": 292},
    {"epoch": 9.376, "grad_norm": 0.9091981649398804, "learning_rate": 1.6451612903225804e-05, "loss": 0.1666, "step": 293},
    {"epoch": 9.408, "grad_norm": 0.8651584386825562, "learning_rate": 1.5483870967741933e-05, "loss": 0.174, "step": 294},
    {"epoch": 9.44, "grad_norm": 0.7866891622543335, "learning_rate": 1.4516129032258063e-05, "loss": 0.1478, "step": 295},
    {"epoch": 9.472, "grad_norm": 0.717932403087616, "learning_rate": 1.3548387096774192e-05, "loss": 0.1425, "step": 296},
    {"epoch": 9.504, "grad_norm": 1.0217758417129517, "learning_rate": 1.2580645161290322e-05, "loss": 0.1574, "step": 297},
    {"epoch": 9.536, "grad_norm": 0.8149961829185486, "learning_rate": 1.1612903225806451e-05, "loss": 0.1422, "step": 298},
    {"epoch": 9.568, "grad_norm": 0.9206218719482422, "learning_rate": 1.064516129032258e-05, "loss": 0.1809, "step": 299},
    {"epoch": 9.6, "grad_norm": 0.6865082383155823, "learning_rate": 9.677419354838709e-06, "loss": 0.133, "step": 300},
    {"epoch": 9.632, "grad_norm": 0.7960584759712219, "learning_rate": 8.709677419354838e-06, "loss": 0.1289, "step": 301},
    {"epoch": 9.664, "grad_norm": 1.4710181951522827, "learning_rate": 7.741935483870966e-06, "loss": 0.1844, "step": 302},
    {"epoch": 9.696, "grad_norm": 0.7321292757987976, "learning_rate": 6.774193548387096e-06, "loss": 0.1356, "step": 303},
    {"epoch": 9.728, "grad_norm": 0.9279872179031372, "learning_rate": 5.8064516129032256e-06, "loss": 0.1842, "step": 304},
    {"epoch": 9.76, "grad_norm": 0.790213942527771, "learning_rate": 4.838709677419354e-06, "loss": 0.1341, "step": 305},
    {"epoch": 9.792, "grad_norm": 0.7292400598526001, "learning_rate": 3.870967741935483e-06, "loss": 0.1287, "step": 306},
    {"epoch": 9.824, "grad_norm": 0.8236159682273865, "learning_rate": 2.9032258064516128e-06, "loss": 0.1721, "step": 307},
    {"epoch": 9.856, "grad_norm": 1.0054924488067627, "learning_rate": 1.9354838709677416e-06, "loss": 0.19, "step": 308},
    {"epoch": 9.888, "grad_norm": 0.8466821312904358, "learning_rate": 9.677419354838708e-07, "loss": 0.1742, "step": 309},
    {"epoch": 9.92, "grad_norm": 0.7754448652267456, "learning_rate": 0.0, "loss": 0.1368, "step": 310}
  ],
  "logging_steps": 1,
  "max_steps": 310,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7.34728855486464e+16,
  "train_batch_size": 3,
  "trial_name": null,
  "trial_params": null
}