llava-v1.5-13b / checkpoint-224 /trainer_state.json
Bleking's picture
push llava-v1.5-13b
07ada15
{
"best_metric": 0.6895740032196045,
"best_model_checkpoint": "./checkpoints/llava-v1.5-13b/checkpoint-224",
"epoch": 7.0,
"eval_steps": 1.0,
"global_step": 224,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03125,
"grad_norm": 0.2380081706918525,
"learning_rate": 0.0,
"loss": 1.2458,
"step": 1
},
{
"epoch": 0.03125,
"eval_loss": 1.3161638975143433,
"eval_runtime": 50.8995,
"eval_samples_per_second": 3.929,
"eval_steps_per_second": 0.255,
"step": 1
},
{
"epoch": 0.0625,
"grad_norm": 0.20429495268987705,
"learning_rate": 8.613531161467863e-06,
"loss": 1.2003,
"step": 2
},
{
"epoch": 0.0625,
"eval_loss": 1.3161638975143433,
"eval_runtime": 47.4818,
"eval_samples_per_second": 4.212,
"eval_steps_per_second": 0.274,
"step": 2
},
{
"epoch": 0.09375,
"grad_norm": 0.20616215800420787,
"learning_rate": 1.3652123889719709e-05,
"loss": 1.2622,
"step": 3
},
{
"epoch": 0.09375,
"eval_loss": 1.309991478919983,
"eval_runtime": 47.4152,
"eval_samples_per_second": 4.218,
"eval_steps_per_second": 0.274,
"step": 3
},
{
"epoch": 0.125,
"grad_norm": 0.20155595022101944,
"learning_rate": 1.7227062322935725e-05,
"loss": 1.2845,
"step": 4
},
{
"epoch": 0.125,
"eval_loss": 1.3013781309127808,
"eval_runtime": 47.4814,
"eval_samples_per_second": 4.212,
"eval_steps_per_second": 0.274,
"step": 4
},
{
"epoch": 0.15625,
"grad_norm": 0.21113117474989132,
"learning_rate": 2e-05,
"loss": 1.246,
"step": 5
},
{
"epoch": 0.15625,
"eval_loss": 1.2892160415649414,
"eval_runtime": 47.7209,
"eval_samples_per_second": 4.191,
"eval_steps_per_second": 0.272,
"step": 5
},
{
"epoch": 0.1875,
"grad_norm": 0.21377946631015488,
"learning_rate": 2e-05,
"loss": 1.2684,
"step": 6
},
{
"epoch": 0.1875,
"eval_loss": 1.2754532098770142,
"eval_runtime": 47.5781,
"eval_samples_per_second": 4.204,
"eval_steps_per_second": 0.273,
"step": 6
},
{
"epoch": 0.21875,
"grad_norm": 0.2284268997618767,
"learning_rate": 2e-05,
"loss": 1.2681,
"step": 7
},
{
"epoch": 0.21875,
"eval_loss": 1.2605774402618408,
"eval_runtime": 47.5326,
"eval_samples_per_second": 4.208,
"eval_steps_per_second": 0.273,
"step": 7
},
{
"epoch": 0.25,
"grad_norm": 0.23585343568544442,
"learning_rate": 2e-05,
"loss": 1.2407,
"step": 8
},
{
"epoch": 0.25,
"eval_loss": 1.244718313217163,
"eval_runtime": 47.5001,
"eval_samples_per_second": 4.211,
"eval_steps_per_second": 0.274,
"step": 8
},
{
"epoch": 0.28125,
"grad_norm": 0.23051191992462533,
"learning_rate": 2e-05,
"loss": 1.2766,
"step": 9
},
{
"epoch": 0.28125,
"eval_loss": 1.2285138368606567,
"eval_runtime": 47.4631,
"eval_samples_per_second": 4.214,
"eval_steps_per_second": 0.274,
"step": 9
},
{
"epoch": 0.3125,
"grad_norm": 0.22726394327484983,
"learning_rate": 2e-05,
"loss": 1.2024,
"step": 10
},
{
"epoch": 0.3125,
"eval_loss": 1.2118008136749268,
"eval_runtime": 47.4991,
"eval_samples_per_second": 4.211,
"eval_steps_per_second": 0.274,
"step": 10
},
{
"epoch": 0.34375,
"grad_norm": 0.25404890894461285,
"learning_rate": 2e-05,
"loss": 1.2742,
"step": 11
},
{
"epoch": 0.34375,
"eval_loss": 1.1942989826202393,
"eval_runtime": 49.2609,
"eval_samples_per_second": 4.06,
"eval_steps_per_second": 0.264,
"step": 11
},
{
"epoch": 0.375,
"grad_norm": 0.26336210916526287,
"learning_rate": 2e-05,
"loss": 1.2258,
"step": 12
},
{
"epoch": 0.375,
"eval_loss": 1.176426649093628,
"eval_runtime": 49.0639,
"eval_samples_per_second": 4.076,
"eval_steps_per_second": 0.265,
"step": 12
},
{
"epoch": 0.40625,
"grad_norm": 0.29637148470746666,
"learning_rate": 2e-05,
"loss": 1.2345,
"step": 13
},
{
"epoch": 0.40625,
"eval_loss": 1.1577811241149902,
"eval_runtime": 49.1352,
"eval_samples_per_second": 4.07,
"eval_steps_per_second": 0.265,
"step": 13
},
{
"epoch": 0.4375,
"grad_norm": 0.2841880377627424,
"learning_rate": 2e-05,
"loss": 1.0765,
"step": 14
},
{
"epoch": 0.4375,
"eval_loss": 1.1381279230117798,
"eval_runtime": 49.25,
"eval_samples_per_second": 4.061,
"eval_steps_per_second": 0.264,
"step": 14
},
{
"epoch": 0.46875,
"grad_norm": 0.2773140636191091,
"learning_rate": 2e-05,
"loss": 1.1812,
"step": 15
},
{
"epoch": 0.46875,
"eval_loss": 1.1178216934204102,
"eval_runtime": 49.0879,
"eval_samples_per_second": 4.074,
"eval_steps_per_second": 0.265,
"step": 15
},
{
"epoch": 0.5,
"grad_norm": 0.3568607365552051,
"learning_rate": 2e-05,
"loss": 1.1327,
"step": 16
},
{
"epoch": 0.5,
"eval_loss": 1.0954149961471558,
"eval_runtime": 48.6546,
"eval_samples_per_second": 4.111,
"eval_steps_per_second": 0.267,
"step": 16
},
{
"epoch": 0.53125,
"grad_norm": 0.32574391414112897,
"learning_rate": 2e-05,
"loss": 1.1162,
"step": 17
},
{
"epoch": 0.53125,
"eval_loss": 1.071275234222412,
"eval_runtime": 48.5618,
"eval_samples_per_second": 4.118,
"eval_steps_per_second": 0.268,
"step": 17
},
{
"epoch": 0.5625,
"grad_norm": 0.4256864144638081,
"learning_rate": 2e-05,
"loss": 1.1138,
"step": 18
},
{
"epoch": 0.5625,
"eval_loss": 1.0455905199050903,
"eval_runtime": 48.4981,
"eval_samples_per_second": 4.124,
"eval_steps_per_second": 0.268,
"step": 18
},
{
"epoch": 0.59375,
"grad_norm": 0.31230014132112643,
"learning_rate": 2e-05,
"loss": 1.0011,
"step": 19
},
{
"epoch": 0.59375,
"eval_loss": 1.0208789110183716,
"eval_runtime": 48.4675,
"eval_samples_per_second": 4.126,
"eval_steps_per_second": 0.268,
"step": 19
},
{
"epoch": 0.625,
"grad_norm": 0.3025724039243594,
"learning_rate": 2e-05,
"loss": 1.109,
"step": 20
},
{
"epoch": 0.625,
"eval_loss": 1.002480149269104,
"eval_runtime": 48.5265,
"eval_samples_per_second": 4.121,
"eval_steps_per_second": 0.268,
"step": 20
},
{
"epoch": 0.65625,
"grad_norm": 0.27787879590501874,
"learning_rate": 2e-05,
"loss": 1.0291,
"step": 21
},
{
"epoch": 0.65625,
"eval_loss": 0.9933492541313171,
"eval_runtime": 50.0369,
"eval_samples_per_second": 3.997,
"eval_steps_per_second": 0.26,
"step": 21
},
{
"epoch": 0.6875,
"grad_norm": 0.4231294067130801,
"learning_rate": 2e-05,
"loss": 1.0779,
"step": 22
},
{
"epoch": 0.6875,
"eval_loss": 0.9850385785102844,
"eval_runtime": 50.0062,
"eval_samples_per_second": 4.0,
"eval_steps_per_second": 0.26,
"step": 22
},
{
"epoch": 0.71875,
"grad_norm": 0.42130097437373987,
"learning_rate": 2e-05,
"loss": 1.0897,
"step": 23
},
{
"epoch": 0.71875,
"eval_loss": 0.9758670330047607,
"eval_runtime": 50.1031,
"eval_samples_per_second": 3.992,
"eval_steps_per_second": 0.259,
"step": 23
},
{
"epoch": 0.75,
"grad_norm": 0.27711808063263893,
"learning_rate": 2e-05,
"loss": 1.0739,
"step": 24
},
{
"epoch": 0.75,
"eval_loss": 0.9674506187438965,
"eval_runtime": 50.0337,
"eval_samples_per_second": 3.997,
"eval_steps_per_second": 0.26,
"step": 24
},
{
"epoch": 0.78125,
"grad_norm": 0.2879649409281791,
"learning_rate": 2e-05,
"loss": 1.0182,
"step": 25
},
{
"epoch": 0.78125,
"eval_loss": 0.9592065215110779,
"eval_runtime": 50.0709,
"eval_samples_per_second": 3.994,
"eval_steps_per_second": 0.26,
"step": 25
},
{
"epoch": 0.8125,
"grad_norm": 0.19327450826076825,
"learning_rate": 2e-05,
"loss": 1.0413,
"step": 26
},
{
"epoch": 0.8125,
"eval_loss": 0.9518552422523499,
"eval_runtime": 50.0572,
"eval_samples_per_second": 3.995,
"eval_steps_per_second": 0.26,
"step": 26
},
{
"epoch": 0.84375,
"grad_norm": 0.19707021382445633,
"learning_rate": 2e-05,
"loss": 0.9525,
"step": 27
},
{
"epoch": 0.84375,
"eval_loss": 0.9449941515922546,
"eval_runtime": 50.0515,
"eval_samples_per_second": 3.996,
"eval_steps_per_second": 0.26,
"step": 27
},
{
"epoch": 0.875,
"grad_norm": 0.2420270757641518,
"learning_rate": 2e-05,
"loss": 0.9658,
"step": 28
},
{
"epoch": 0.875,
"eval_loss": 0.9378474354743958,
"eval_runtime": 49.9299,
"eval_samples_per_second": 4.006,
"eval_steps_per_second": 0.26,
"step": 28
},
{
"epoch": 0.90625,
"grad_norm": 0.18074632782127534,
"learning_rate": 2e-05,
"loss": 0.9866,
"step": 29
},
{
"epoch": 0.90625,
"eval_loss": 0.93099045753479,
"eval_runtime": 50.0096,
"eval_samples_per_second": 3.999,
"eval_steps_per_second": 0.26,
"step": 29
},
{
"epoch": 0.9375,
"grad_norm": 0.1936051126921734,
"learning_rate": 2e-05,
"loss": 1.0128,
"step": 30
},
{
"epoch": 0.9375,
"eval_loss": 0.9244199991226196,
"eval_runtime": 50.2469,
"eval_samples_per_second": 3.98,
"eval_steps_per_second": 0.259,
"step": 30
},
{
"epoch": 0.96875,
"grad_norm": 0.26164254459782943,
"learning_rate": 2e-05,
"loss": 0.88,
"step": 31
},
{
"epoch": 0.96875,
"eval_loss": 0.9175177216529846,
"eval_runtime": 50.1695,
"eval_samples_per_second": 3.986,
"eval_steps_per_second": 0.259,
"step": 31
},
{
"epoch": 1.0,
"grad_norm": 0.18677152741688485,
"learning_rate": 2e-05,
"loss": 0.9569,
"step": 32
},
{
"epoch": 1.0,
"eval_loss": 0.9108598828315735,
"eval_runtime": 50.0387,
"eval_samples_per_second": 3.997,
"eval_steps_per_second": 0.26,
"step": 32
},
{
"epoch": 1.03125,
"grad_norm": 0.20486279036126417,
"learning_rate": 2e-05,
"loss": 1.0208,
"step": 33
},
{
"epoch": 1.03125,
"eval_loss": 0.9042049646377563,
"eval_runtime": 50.1472,
"eval_samples_per_second": 3.988,
"eval_steps_per_second": 0.259,
"step": 33
},
{
"epoch": 1.0625,
"grad_norm": 0.2004946169291112,
"learning_rate": 2e-05,
"loss": 0.9931,
"step": 34
},
{
"epoch": 1.0625,
"eval_loss": 0.8980298042297363,
"eval_runtime": 50.245,
"eval_samples_per_second": 3.98,
"eval_steps_per_second": 0.259,
"step": 34
},
{
"epoch": 1.09375,
"grad_norm": 0.1645872432258401,
"learning_rate": 2e-05,
"loss": 1.0184,
"step": 35
},
{
"epoch": 1.09375,
"eval_loss": 0.8924428820610046,
"eval_runtime": 50.3703,
"eval_samples_per_second": 3.971,
"eval_steps_per_second": 0.258,
"step": 35
},
{
"epoch": 1.125,
"grad_norm": 0.18293519304435016,
"learning_rate": 2e-05,
"loss": 1.0026,
"step": 36
},
{
"epoch": 1.125,
"eval_loss": 0.8870412707328796,
"eval_runtime": 50.0483,
"eval_samples_per_second": 3.996,
"eval_steps_per_second": 0.26,
"step": 36
},
{
"epoch": 1.15625,
"grad_norm": 0.17712548516246762,
"learning_rate": 2e-05,
"loss": 0.9387,
"step": 37
},
{
"epoch": 1.15625,
"eval_loss": 0.881915271282196,
"eval_runtime": 49.9751,
"eval_samples_per_second": 4.002,
"eval_steps_per_second": 0.26,
"step": 37
},
{
"epoch": 1.1875,
"grad_norm": 0.21472689311609464,
"learning_rate": 2e-05,
"loss": 0.958,
"step": 38
},
{
"epoch": 1.1875,
"eval_loss": 0.8768754601478577,
"eval_runtime": 50.1204,
"eval_samples_per_second": 3.99,
"eval_steps_per_second": 0.259,
"step": 38
},
{
"epoch": 1.21875,
"grad_norm": 0.21117297910005806,
"learning_rate": 2e-05,
"loss": 0.9922,
"step": 39
},
{
"epoch": 1.21875,
"eval_loss": 0.8718628883361816,
"eval_runtime": 50.1732,
"eval_samples_per_second": 3.986,
"eval_steps_per_second": 0.259,
"step": 39
},
{
"epoch": 1.25,
"grad_norm": 0.17835587003909165,
"learning_rate": 2e-05,
"loss": 0.9776,
"step": 40
},
{
"epoch": 1.25,
"eval_loss": 0.8669865131378174,
"eval_runtime": 50.1148,
"eval_samples_per_second": 3.991,
"eval_steps_per_second": 0.259,
"step": 40
},
{
"epoch": 1.28125,
"grad_norm": 0.2092736372483734,
"learning_rate": 2e-05,
"loss": 0.9731,
"step": 41
},
{
"epoch": 1.28125,
"eval_loss": 0.8619834780693054,
"eval_runtime": 50.052,
"eval_samples_per_second": 3.996,
"eval_steps_per_second": 0.26,
"step": 41
},
{
"epoch": 1.3125,
"grad_norm": 0.2338857391910308,
"learning_rate": 2e-05,
"loss": 0.9319,
"step": 42
},
{
"epoch": 1.3125,
"eval_loss": 0.8572126030921936,
"eval_runtime": 50.1212,
"eval_samples_per_second": 3.99,
"eval_steps_per_second": 0.259,
"step": 42
},
{
"epoch": 1.34375,
"grad_norm": 0.19168719284572813,
"learning_rate": 2e-05,
"loss": 0.9083,
"step": 43
},
{
"epoch": 1.34375,
"eval_loss": 0.8525611758232117,
"eval_runtime": 50.1733,
"eval_samples_per_second": 3.986,
"eval_steps_per_second": 0.259,
"step": 43
},
{
"epoch": 1.375,
"grad_norm": 0.20004868138433377,
"learning_rate": 2e-05,
"loss": 0.9118,
"step": 44
},
{
"epoch": 1.375,
"eval_loss": 0.8483461141586304,
"eval_runtime": 50.1083,
"eval_samples_per_second": 3.991,
"eval_steps_per_second": 0.259,
"step": 44
},
{
"epoch": 1.40625,
"grad_norm": 0.19012965506122342,
"learning_rate": 2e-05,
"loss": 0.8888,
"step": 45
},
{
"epoch": 1.40625,
"eval_loss": 0.8446614742279053,
"eval_runtime": 50.1171,
"eval_samples_per_second": 3.991,
"eval_steps_per_second": 0.259,
"step": 45
},
{
"epoch": 1.4375,
"grad_norm": 0.21187005706805245,
"learning_rate": 2e-05,
"loss": 0.9319,
"step": 46
},
{
"epoch": 1.4375,
"eval_loss": 0.8412036299705505,
"eval_runtime": 50.0918,
"eval_samples_per_second": 3.993,
"eval_steps_per_second": 0.26,
"step": 46
},
{
"epoch": 1.46875,
"grad_norm": 0.19673832205926584,
"learning_rate": 2e-05,
"loss": 0.9359,
"step": 47
},
{
"epoch": 1.46875,
"eval_loss": 0.8380417823791504,
"eval_runtime": 50.2214,
"eval_samples_per_second": 3.982,
"eval_steps_per_second": 0.259,
"step": 47
},
{
"epoch": 1.5,
"grad_norm": 0.21712294106174318,
"learning_rate": 2e-05,
"loss": 0.8511,
"step": 48
},
{
"epoch": 1.5,
"eval_loss": 0.8353021740913391,
"eval_runtime": 50.1617,
"eval_samples_per_second": 3.987,
"eval_steps_per_second": 0.259,
"step": 48
},
{
"epoch": 1.53125,
"grad_norm": 0.2138924779700934,
"learning_rate": 2e-05,
"loss": 0.8695,
"step": 49
},
{
"epoch": 1.53125,
"eval_loss": 0.8327407836914062,
"eval_runtime": 50.1442,
"eval_samples_per_second": 3.988,
"eval_steps_per_second": 0.259,
"step": 49
},
{
"epoch": 1.5625,
"grad_norm": 0.22387442384578618,
"learning_rate": 2e-05,
"loss": 0.8518,
"step": 50
},
{
"epoch": 1.5625,
"eval_loss": 0.8301742076873779,
"eval_runtime": 50.1867,
"eval_samples_per_second": 3.985,
"eval_steps_per_second": 0.259,
"step": 50
},
{
"epoch": 1.59375,
"grad_norm": 0.1975577146517192,
"learning_rate": 2e-05,
"loss": 0.8868,
"step": 51
},
{
"epoch": 1.59375,
"eval_loss": 0.8275265693664551,
"eval_runtime": 51.2257,
"eval_samples_per_second": 3.904,
"eval_steps_per_second": 0.254,
"step": 51
},
{
"epoch": 1.625,
"grad_norm": 0.21474817057286624,
"learning_rate": 2e-05,
"loss": 0.767,
"step": 52
},
{
"epoch": 1.625,
"eval_loss": 0.824796736240387,
"eval_runtime": 51.276,
"eval_samples_per_second": 3.9,
"eval_steps_per_second": 0.254,
"step": 52
},
{
"epoch": 1.65625,
"grad_norm": 0.21105651676755652,
"learning_rate": 2e-05,
"loss": 0.9219,
"step": 53
},
{
"epoch": 1.65625,
"eval_loss": 0.8221166729927063,
"eval_runtime": 51.141,
"eval_samples_per_second": 3.911,
"eval_steps_per_second": 0.254,
"step": 53
},
{
"epoch": 1.6875,
"grad_norm": 0.20706475184742085,
"learning_rate": 2e-05,
"loss": 0.8873,
"step": 54
},
{
"epoch": 1.6875,
"eval_loss": 0.819589376449585,
"eval_runtime": 51.0045,
"eval_samples_per_second": 3.921,
"eval_steps_per_second": 0.255,
"step": 54
},
{
"epoch": 1.71875,
"grad_norm": 0.21722220033855957,
"learning_rate": 2e-05,
"loss": 0.8956,
"step": 55
},
{
"epoch": 1.71875,
"eval_loss": 0.8176340460777283,
"eval_runtime": 51.1941,
"eval_samples_per_second": 3.907,
"eval_steps_per_second": 0.254,
"step": 55
},
{
"epoch": 1.75,
"grad_norm": 0.20669001221665667,
"learning_rate": 2e-05,
"loss": 0.9506,
"step": 56
},
{
"epoch": 1.75,
"eval_loss": 0.8158826231956482,
"eval_runtime": 52.1162,
"eval_samples_per_second": 3.838,
"eval_steps_per_second": 0.249,
"step": 56
},
{
"epoch": 1.78125,
"grad_norm": 0.22189732090066341,
"learning_rate": 2e-05,
"loss": 0.8955,
"step": 57
},
{
"epoch": 1.78125,
"eval_loss": 0.814656674861908,
"eval_runtime": 52.1361,
"eval_samples_per_second": 3.836,
"eval_steps_per_second": 0.249,
"step": 57
},
{
"epoch": 1.8125,
"grad_norm": 0.2030113892848459,
"learning_rate": 2e-05,
"loss": 0.9108,
"step": 58
},
{
"epoch": 1.8125,
"eval_loss": 0.813343346118927,
"eval_runtime": 52.2552,
"eval_samples_per_second": 3.827,
"eval_steps_per_second": 0.249,
"step": 58
},
{
"epoch": 1.84375,
"grad_norm": 0.2123201057569791,
"learning_rate": 2e-05,
"loss": 0.8779,
"step": 59
},
{
"epoch": 1.84375,
"eval_loss": 0.8116877675056458,
"eval_runtime": 52.1233,
"eval_samples_per_second": 3.837,
"eval_steps_per_second": 0.249,
"step": 59
},
{
"epoch": 1.875,
"grad_norm": 0.211551126937912,
"learning_rate": 2e-05,
"loss": 0.9294,
"step": 60
},
{
"epoch": 1.875,
"eval_loss": 0.8098442554473877,
"eval_runtime": 52.1091,
"eval_samples_per_second": 3.838,
"eval_steps_per_second": 0.249,
"step": 60
},
{
"epoch": 1.90625,
"grad_norm": 0.24981344981629752,
"learning_rate": 2e-05,
"loss": 0.8409,
"step": 61
},
{
"epoch": 1.90625,
"eval_loss": 0.8070770502090454,
"eval_runtime": 53.4187,
"eval_samples_per_second": 3.744,
"eval_steps_per_second": 0.243,
"step": 61
},
{
"epoch": 1.9375,
"grad_norm": 0.2341550589775159,
"learning_rate": 2e-05,
"loss": 0.888,
"step": 62
},
{
"epoch": 1.9375,
"eval_loss": 0.8040286898612976,
"eval_runtime": 53.2197,
"eval_samples_per_second": 3.758,
"eval_steps_per_second": 0.244,
"step": 62
},
{
"epoch": 1.96875,
"grad_norm": 0.2336241775649256,
"learning_rate": 2e-05,
"loss": 0.913,
"step": 63
},
{
"epoch": 1.96875,
"eval_loss": 0.8013430833816528,
"eval_runtime": 53.1784,
"eval_samples_per_second": 3.761,
"eval_steps_per_second": 0.244,
"step": 63
},
{
"epoch": 2.0,
"grad_norm": 0.2414390628081758,
"learning_rate": 2e-05,
"loss": 0.8754,
"step": 64
},
{
"epoch": 2.0,
"eval_loss": 0.7985894680023193,
"eval_runtime": 53.2454,
"eval_samples_per_second": 3.756,
"eval_steps_per_second": 0.244,
"step": 64
},
{
"epoch": 2.03125,
"grad_norm": 0.2484104465653703,
"learning_rate": 2e-05,
"loss": 0.8497,
"step": 65
},
{
"epoch": 2.03125,
"eval_loss": 0.7954932451248169,
"eval_runtime": 53.3794,
"eval_samples_per_second": 3.747,
"eval_steps_per_second": 0.244,
"step": 65
},
{
"epoch": 2.0625,
"grad_norm": 0.23859744120942086,
"learning_rate": 2e-05,
"loss": 0.8567,
"step": 66
},
{
"epoch": 2.0625,
"eval_loss": 0.7929843068122864,
"eval_runtime": 55.517,
"eval_samples_per_second": 3.602,
"eval_steps_per_second": 0.234,
"step": 66
},
{
"epoch": 2.09375,
"grad_norm": 0.24584758647855462,
"learning_rate": 2e-05,
"loss": 0.8489,
"step": 67
},
{
"epoch": 2.09375,
"eval_loss": 0.7903321981430054,
"eval_runtime": 55.4151,
"eval_samples_per_second": 3.609,
"eval_steps_per_second": 0.235,
"step": 67
},
{
"epoch": 2.125,
"grad_norm": 0.2484917818304153,
"learning_rate": 2e-05,
"loss": 0.9122,
"step": 68
},
{
"epoch": 2.125,
"eval_loss": 0.7877185344696045,
"eval_runtime": 55.4069,
"eval_samples_per_second": 3.61,
"eval_steps_per_second": 0.235,
"step": 68
},
{
"epoch": 2.15625,
"grad_norm": 0.2184614083026819,
"learning_rate": 2e-05,
"loss": 0.8355,
"step": 69
},
{
"epoch": 2.15625,
"eval_loss": 0.7852210998535156,
"eval_runtime": 55.3381,
"eval_samples_per_second": 3.614,
"eval_steps_per_second": 0.235,
"step": 69
},
{
"epoch": 2.1875,
"grad_norm": 0.24978410070800153,
"learning_rate": 2e-05,
"loss": 0.7968,
"step": 70
},
{
"epoch": 2.1875,
"eval_loss": 0.7827157378196716,
"eval_runtime": 55.3708,
"eval_samples_per_second": 3.612,
"eval_steps_per_second": 0.235,
"step": 70
},
{
"epoch": 2.21875,
"grad_norm": 0.23059883325890385,
"learning_rate": 2e-05,
"loss": 0.8783,
"step": 71
},
{
"epoch": 2.21875,
"eval_loss": 0.7805906534194946,
"eval_runtime": 55.6033,
"eval_samples_per_second": 3.597,
"eval_steps_per_second": 0.234,
"step": 71
},
{
"epoch": 2.25,
"grad_norm": 0.23261007334915096,
"learning_rate": 2e-05,
"loss": 0.7956,
"step": 72
},
{
"epoch": 2.25,
"eval_loss": 0.7786691784858704,
"eval_runtime": 55.0913,
"eval_samples_per_second": 3.63,
"eval_steps_per_second": 0.236,
"step": 72
},
{
"epoch": 2.28125,
"grad_norm": 0.25779598356574085,
"learning_rate": 2e-05,
"loss": 0.8426,
"step": 73
},
{
"epoch": 2.28125,
"eval_loss": 0.7771151661872864,
"eval_runtime": 55.0698,
"eval_samples_per_second": 3.632,
"eval_steps_per_second": 0.236,
"step": 73
},
{
"epoch": 2.3125,
"grad_norm": 0.2288243335971112,
"learning_rate": 2e-05,
"loss": 0.8381,
"step": 74
},
{
"epoch": 2.3125,
"eval_loss": 0.7756838202476501,
"eval_runtime": 54.8412,
"eval_samples_per_second": 3.647,
"eval_steps_per_second": 0.237,
"step": 74
},
{
"epoch": 2.34375,
"grad_norm": 0.24235644907977733,
"learning_rate": 2e-05,
"loss": 0.887,
"step": 75
},
{
"epoch": 2.34375,
"eval_loss": 0.7739972472190857,
"eval_runtime": 54.9718,
"eval_samples_per_second": 3.638,
"eval_steps_per_second": 0.236,
"step": 75
},
{
"epoch": 2.375,
"grad_norm": 0.23666820017867402,
"learning_rate": 2e-05,
"loss": 0.8007,
"step": 76
},
{
"epoch": 2.375,
"eval_loss": 0.7724328637123108,
"eval_runtime": 55.0225,
"eval_samples_per_second": 3.635,
"eval_steps_per_second": 0.236,
"step": 76
},
{
"epoch": 2.40625,
"grad_norm": 0.22815737396609181,
"learning_rate": 2e-05,
"loss": 0.8529,
"step": 77
},
{
"epoch": 2.40625,
"eval_loss": 0.7710004448890686,
"eval_runtime": 55.321,
"eval_samples_per_second": 3.615,
"eval_steps_per_second": 0.235,
"step": 77
},
{
"epoch": 2.4375,
"grad_norm": 0.2701264871470739,
"learning_rate": 2e-05,
"loss": 0.8515,
"step": 78
},
{
"epoch": 2.4375,
"eval_loss": 0.7695322632789612,
"eval_runtime": 55.3045,
"eval_samples_per_second": 3.616,
"eval_steps_per_second": 0.235,
"step": 78
},
{
"epoch": 2.46875,
"grad_norm": 0.24363813951328234,
"learning_rate": 2e-05,
"loss": 0.8587,
"step": 79
},
{
"epoch": 2.46875,
"eval_loss": 0.7689024209976196,
"eval_runtime": 55.3009,
"eval_samples_per_second": 3.617,
"eval_steps_per_second": 0.235,
"step": 79
},
{
"epoch": 2.5,
"grad_norm": 0.30924701355253065,
"learning_rate": 2e-05,
"loss": 0.9076,
"step": 80
},
{
"epoch": 2.5,
"eval_loss": 0.7676254510879517,
"eval_runtime": 55.2365,
"eval_samples_per_second": 3.621,
"eval_steps_per_second": 0.235,
"step": 80
},
{
"epoch": 2.53125,
"grad_norm": 0.2665188280221636,
"learning_rate": 2e-05,
"loss": 0.8445,
"step": 81
},
{
"epoch": 2.53125,
"eval_loss": 0.7661146521568298,
"eval_runtime": 55.2775,
"eval_samples_per_second": 3.618,
"eval_steps_per_second": 0.235,
"step": 81
},
{
"epoch": 2.5625,
"grad_norm": 0.24674191720675534,
"learning_rate": 2e-05,
"loss": 0.8882,
"step": 82
},
{
"epoch": 2.5625,
"eval_loss": 0.76513671875,
"eval_runtime": 55.0857,
"eval_samples_per_second": 3.631,
"eval_steps_per_second": 0.236,
"step": 82
},
{
"epoch": 2.59375,
"grad_norm": 0.2736689405531704,
"learning_rate": 2e-05,
"loss": 0.8336,
"step": 83
},
{
"epoch": 2.59375,
"eval_loss": 0.764373779296875,
"eval_runtime": 55.2069,
"eval_samples_per_second": 3.623,
"eval_steps_per_second": 0.235,
"step": 83
},
{
"epoch": 2.625,
"grad_norm": 0.290841287198557,
"learning_rate": 2e-05,
"loss": 0.795,
"step": 84
},
{
"epoch": 2.625,
"eval_loss": 0.7632084488868713,
"eval_runtime": 55.1009,
"eval_samples_per_second": 3.63,
"eval_steps_per_second": 0.236,
"step": 84
},
{
"epoch": 2.65625,
"grad_norm": 0.2912051076836381,
"learning_rate": 2e-05,
"loss": 0.772,
"step": 85
},
{
"epoch": 2.65625,
"eval_loss": 0.7618446350097656,
"eval_runtime": 55.3717,
"eval_samples_per_second": 3.612,
"eval_steps_per_second": 0.235,
"step": 85
},
{
"epoch": 2.6875,
"grad_norm": 0.3169908538809109,
"learning_rate": 2e-05,
"loss": 0.8148,
"step": 86
},
{
"epoch": 2.6875,
"eval_loss": 0.7599577307701111,
"eval_runtime": 55.3931,
"eval_samples_per_second": 3.611,
"eval_steps_per_second": 0.235,
"step": 86
},
{
"epoch": 2.71875,
"grad_norm": 0.28780549186847426,
"learning_rate": 2e-05,
"loss": 0.8154,
"step": 87
},
{
"epoch": 2.71875,
"eval_loss": 0.7583369612693787,
"eval_runtime": 55.1679,
"eval_samples_per_second": 3.625,
"eval_steps_per_second": 0.236,
"step": 87
},
{
"epoch": 2.75,
"grad_norm": 0.30695250620091474,
"learning_rate": 2e-05,
"loss": 0.9032,
"step": 88
},
{
"epoch": 2.75,
"eval_loss": 0.7571613192558289,
"eval_runtime": 55.1779,
"eval_samples_per_second": 3.625,
"eval_steps_per_second": 0.236,
"step": 88
},
{
"epoch": 2.78125,
"grad_norm": 0.2693887416759828,
"learning_rate": 2e-05,
"loss": 0.8106,
"step": 89
},
{
"epoch": 2.78125,
"eval_loss": 0.7566004991531372,
"eval_runtime": 55.1107,
"eval_samples_per_second": 3.629,
"eval_steps_per_second": 0.236,
"step": 89
},
{
"epoch": 2.8125,
"grad_norm": 0.2887583627563198,
"learning_rate": 2e-05,
"loss": 0.8518,
"step": 90
},
{
"epoch": 2.8125,
"eval_loss": 0.7558963298797607,
"eval_runtime": 55.2153,
"eval_samples_per_second": 3.622,
"eval_steps_per_second": 0.235,
"step": 90
},
{
"epoch": 2.84375,
"grad_norm": 0.3059402168979351,
"learning_rate": 2e-05,
"loss": 0.7727,
"step": 91
},
{
"epoch": 2.84375,
"eval_loss": 0.7545350790023804,
"eval_runtime": 55.3225,
"eval_samples_per_second": 3.615,
"eval_steps_per_second": 0.235,
"step": 91
},
{
"epoch": 2.875,
"grad_norm": 0.3096260477909968,
"learning_rate": 2e-05,
"loss": 0.8477,
"step": 92
},
{
"epoch": 2.875,
"eval_loss": 0.7526452541351318,
"eval_runtime": 55.4311,
"eval_samples_per_second": 3.608,
"eval_steps_per_second": 0.235,
"step": 92
},
{
"epoch": 2.90625,
"grad_norm": 0.31498884686525297,
"learning_rate": 2e-05,
"loss": 0.7982,
"step": 93
},
{
"epoch": 2.90625,
"eval_loss": 0.7510760426521301,
"eval_runtime": 55.4361,
"eval_samples_per_second": 3.608,
"eval_steps_per_second": 0.235,
"step": 93
},
{
"epoch": 2.9375,
"grad_norm": 0.31302830623184313,
"learning_rate": 2e-05,
"loss": 0.871,
"step": 94
},
{
"epoch": 2.9375,
"eval_loss": 0.7500898838043213,
"eval_runtime": 55.3025,
"eval_samples_per_second": 3.616,
"eval_steps_per_second": 0.235,
"step": 94
},
{
"epoch": 2.96875,
"grad_norm": 0.3132608568779145,
"learning_rate": 2e-05,
"loss": 0.8094,
"step": 95
},
{
"epoch": 2.96875,
"eval_loss": 0.7498895525932312,
"eval_runtime": 55.2402,
"eval_samples_per_second": 3.621,
"eval_steps_per_second": 0.235,
"step": 95
},
{
"epoch": 3.0,
"grad_norm": 0.298645350091386,
"learning_rate": 2e-05,
"loss": 0.7673,
"step": 96
},
{
"epoch": 3.0,
"eval_loss": 0.7493192553520203,
"eval_runtime": 54.8718,
"eval_samples_per_second": 3.645,
"eval_steps_per_second": 0.237,
"step": 96
},
{
"epoch": 3.03125,
"grad_norm": 0.34042584783125357,
"learning_rate": 2e-05,
"loss": 0.7336,
"step": 97
},
{
"epoch": 3.03125,
"eval_loss": 0.7476670742034912,
"eval_runtime": 54.9305,
"eval_samples_per_second": 3.641,
"eval_steps_per_second": 0.237,
"step": 97
},
{
"epoch": 3.0625,
"grad_norm": 0.293099043801068,
"learning_rate": 2e-05,
"loss": 0.8088,
"step": 98
},
{
"epoch": 3.0625,
"eval_loss": 0.745802640914917,
"eval_runtime": 55.2051,
"eval_samples_per_second": 3.623,
"eval_steps_per_second": 0.235,
"step": 98
},
{
"epoch": 3.09375,
"grad_norm": 0.3042839507858426,
"learning_rate": 2e-05,
"loss": 0.787,
"step": 99
},
{
"epoch": 3.09375,
"eval_loss": 0.7439618110656738,
"eval_runtime": 55.0065,
"eval_samples_per_second": 3.636,
"eval_steps_per_second": 0.236,
"step": 99
},
{
"epoch": 3.125,
"grad_norm": 0.32992077073227005,
"learning_rate": 2e-05,
"loss": 0.8296,
"step": 100
},
{
"epoch": 3.125,
"eval_loss": 0.7424842715263367,
"eval_runtime": 55.1254,
"eval_samples_per_second": 3.628,
"eval_steps_per_second": 0.236,
"step": 100
},
{
"epoch": 3.15625,
"grad_norm": 0.2798839747424062,
"learning_rate": 2e-05,
"loss": 0.7642,
"step": 101
},
{
"epoch": 3.15625,
"eval_loss": 0.7414796948432922,
"eval_runtime": 49.183,
"eval_samples_per_second": 4.066,
"eval_steps_per_second": 0.264,
"step": 101
},
{
"epoch": 3.1875,
"grad_norm": 0.3046631191964983,
"learning_rate": 2e-05,
"loss": 0.8203,
"step": 102
},
{
"epoch": 3.1875,
"eval_loss": 0.7410265207290649,
"eval_runtime": 48.1541,
"eval_samples_per_second": 4.153,
"eval_steps_per_second": 0.27,
"step": 102
},
{
"epoch": 3.21875,
"grad_norm": 0.3117517214859861,
"learning_rate": 2e-05,
"loss": 0.8222,
"step": 103
},
{
"epoch": 3.21875,
"eval_loss": 0.7405675649642944,
"eval_runtime": 47.7145,
"eval_samples_per_second": 4.192,
"eval_steps_per_second": 0.272,
"step": 103
},
{
"epoch": 3.25,
"grad_norm": 0.3412709249466801,
"learning_rate": 2e-05,
"loss": 0.7459,
"step": 104
},
{
"epoch": 3.25,
"eval_loss": 0.7395681738853455,
"eval_runtime": 47.5855,
"eval_samples_per_second": 4.203,
"eval_steps_per_second": 0.273,
"step": 104
},
{
"epoch": 3.28125,
"grad_norm": 0.2917443566507923,
"learning_rate": 2e-05,
"loss": 0.7849,
"step": 105
},
{
"epoch": 3.28125,
"eval_loss": 0.7387100458145142,
"eval_runtime": 47.6344,
"eval_samples_per_second": 4.199,
"eval_steps_per_second": 0.273,
"step": 105
},
{
"epoch": 3.3125,
"grad_norm": 0.3054484743574741,
"learning_rate": 2e-05,
"loss": 0.8354,
"step": 106
},
{
"epoch": 3.3125,
"eval_loss": 0.7384718060493469,
"eval_runtime": 47.8373,
"eval_samples_per_second": 4.181,
"eval_steps_per_second": 0.272,
"step": 106
},
{
"epoch": 3.34375,
"grad_norm": 0.34986630381114014,
"learning_rate": 2e-05,
"loss": 0.7069,
"step": 107
},
{
"epoch": 3.34375,
"eval_loss": 0.737342357635498,
"eval_runtime": 47.5763,
"eval_samples_per_second": 4.204,
"eval_steps_per_second": 0.273,
"step": 107
},
{
"epoch": 3.375,
"grad_norm": 0.32324403145716496,
"learning_rate": 2e-05,
"loss": 0.767,
"step": 108
},
{
"epoch": 3.375,
"eval_loss": 0.7360101938247681,
"eval_runtime": 47.5774,
"eval_samples_per_second": 4.204,
"eval_steps_per_second": 0.273,
"step": 108
},
{
"epoch": 3.40625,
"grad_norm": 0.3795969851258545,
"learning_rate": 2e-05,
"loss": 0.7556,
"step": 109
},
{
"epoch": 3.40625,
"eval_loss": 0.7339167594909668,
"eval_runtime": 47.5818,
"eval_samples_per_second": 4.203,
"eval_steps_per_second": 0.273,
"step": 109
},
{
"epoch": 3.4375,
"grad_norm": 0.34401062275458993,
"learning_rate": 2e-05,
"loss": 0.7494,
"step": 110
},
{
"epoch": 3.4375,
"eval_loss": 0.7321068644523621,
"eval_runtime": 47.7643,
"eval_samples_per_second": 4.187,
"eval_steps_per_second": 0.272,
"step": 110
},
{
"epoch": 3.46875,
"grad_norm": 0.3248480010385237,
"learning_rate": 2e-05,
"loss": 0.8103,
"step": 111
},
{
"epoch": 3.46875,
"eval_loss": 0.7309197783470154,
"eval_runtime": 49.5841,
"eval_samples_per_second": 4.034,
"eval_steps_per_second": 0.262,
"step": 111
},
{
"epoch": 3.5,
"grad_norm": 0.3572409124813593,
"learning_rate": 2e-05,
"loss": 0.7972,
"step": 112
},
{
"epoch": 3.5,
"eval_loss": 0.7301727533340454,
"eval_runtime": 49.3728,
"eval_samples_per_second": 4.051,
"eval_steps_per_second": 0.263,
"step": 112
},
{
"epoch": 3.53125,
"grad_norm": 0.37348522775103665,
"learning_rate": 2e-05,
"loss": 0.88,
"step": 113
},
{
"epoch": 3.53125,
"eval_loss": 0.7292957305908203,
"eval_runtime": 49.2192,
"eval_samples_per_second": 4.063,
"eval_steps_per_second": 0.264,
"step": 113
},
{
"epoch": 3.5625,
"grad_norm": 0.37667450960329546,
"learning_rate": 2e-05,
"loss": 0.7518,
"step": 114
},
{
"epoch": 3.5625,
"eval_loss": 0.728556215763092,
"eval_runtime": 49.0971,
"eval_samples_per_second": 4.074,
"eval_steps_per_second": 0.265,
"step": 114
},
{
"epoch": 3.59375,
"grad_norm": 0.3163628607304638,
"learning_rate": 2e-05,
"loss": 0.7948,
"step": 115
},
{
"epoch": 3.59375,
"eval_loss": 0.7287828326225281,
"eval_runtime": 49.0213,
"eval_samples_per_second": 4.08,
"eval_steps_per_second": 0.265,
"step": 115
},
{
"epoch": 3.625,
"grad_norm": 0.3038899302084592,
"learning_rate": 2e-05,
"loss": 0.7791,
"step": 116
},
{
"epoch": 3.625,
"eval_loss": 0.7294514179229736,
"eval_runtime": 51.9137,
"eval_samples_per_second": 3.853,
"eval_steps_per_second": 0.25,
"step": 116
},
{
"epoch": 3.65625,
"grad_norm": 0.3746448663122327,
"learning_rate": 2e-05,
"loss": 0.7863,
"step": 117
},
{
"epoch": 3.65625,
"eval_loss": 0.7289304137229919,
"eval_runtime": 51.3023,
"eval_samples_per_second": 3.898,
"eval_steps_per_second": 0.253,
"step": 117
},
{
"epoch": 3.6875,
"grad_norm": 0.4058937381299434,
"learning_rate": 2e-05,
"loss": 0.7907,
"step": 118
},
{
"epoch": 3.6875,
"eval_loss": 0.7281011343002319,
"eval_runtime": 50.8635,
"eval_samples_per_second": 3.932,
"eval_steps_per_second": 0.256,
"step": 118
},
{
"epoch": 3.71875,
"grad_norm": 0.31608065583227885,
"learning_rate": 2e-05,
"loss": 0.8348,
"step": 119
},
{
"epoch": 3.71875,
"eval_loss": 0.7280247211456299,
"eval_runtime": 50.4903,
"eval_samples_per_second": 3.961,
"eval_steps_per_second": 0.257,
"step": 119
},
{
"epoch": 3.75,
"grad_norm": 0.3375768031046084,
"learning_rate": 2e-05,
"loss": 0.7783,
"step": 120
},
{
"epoch": 3.75,
"eval_loss": 0.7281913757324219,
"eval_runtime": 50.5906,
"eval_samples_per_second": 3.953,
"eval_steps_per_second": 0.257,
"step": 120
},
{
"epoch": 3.78125,
"grad_norm": 0.36047493494859845,
"learning_rate": 2e-05,
"loss": 0.765,
"step": 121
},
{
"epoch": 3.78125,
"eval_loss": 0.7269737124443054,
"eval_runtime": 53.4722,
"eval_samples_per_second": 3.74,
"eval_steps_per_second": 0.243,
"step": 121
},
{
"epoch": 3.8125,
"grad_norm": 0.389743860171921,
"learning_rate": 2e-05,
"loss": 0.8269,
"step": 122
},
{
"epoch": 3.8125,
"eval_loss": 0.7251996397972107,
"eval_runtime": 53.4986,
"eval_samples_per_second": 3.738,
"eval_steps_per_second": 0.243,
"step": 122
},
{
"epoch": 3.84375,
"grad_norm": 0.33850935145960215,
"learning_rate": 2e-05,
"loss": 0.7497,
"step": 123
},
{
"epoch": 3.84375,
"eval_loss": 0.723595142364502,
"eval_runtime": 53.4196,
"eval_samples_per_second": 3.744,
"eval_steps_per_second": 0.243,
"step": 123
},
{
"epoch": 3.875,
"grad_norm": 0.3166770012114478,
"learning_rate": 2e-05,
"loss": 0.7648,
"step": 124
},
{
"epoch": 3.875,
"eval_loss": 0.7223578095436096,
"eval_runtime": 52.6143,
"eval_samples_per_second": 3.801,
"eval_steps_per_second": 0.247,
"step": 124
},
{
"epoch": 3.90625,
"grad_norm": 0.41948670305268276,
"learning_rate": 2e-05,
"loss": 0.8306,
"step": 125
},
{
"epoch": 3.90625,
"eval_loss": 0.7206680774688721,
"eval_runtime": 52.3885,
"eval_samples_per_second": 3.818,
"eval_steps_per_second": 0.248,
"step": 125
},
{
"epoch": 3.9375,
"grad_norm": 0.35580041105853477,
"learning_rate": 2e-05,
"loss": 0.7945,
"step": 126
},
{
"epoch": 3.9375,
"eval_loss": 0.7196171283721924,
"eval_runtime": 55.1225,
"eval_samples_per_second": 3.628,
"eval_steps_per_second": 0.236,
"step": 126
},
{
"epoch": 3.96875,
"grad_norm": 0.38411890663257114,
"learning_rate": 2e-05,
"loss": 0.7466,
"step": 127
},
{
"epoch": 3.96875,
"eval_loss": 0.7188088297843933,
"eval_runtime": 55.3068,
"eval_samples_per_second": 3.616,
"eval_steps_per_second": 0.235,
"step": 127
},
{
"epoch": 4.0,
"grad_norm": 0.3682220575203032,
"learning_rate": 2e-05,
"loss": 0.6752,
"step": 128
},
{
"epoch": 4.0,
"eval_loss": 0.7181470990180969,
"eval_runtime": 53.9116,
"eval_samples_per_second": 3.71,
"eval_steps_per_second": 0.241,
"step": 128
},
{
"epoch": 4.03125,
"grad_norm": 0.34160763542661665,
"learning_rate": 2e-05,
"loss": 0.7788,
"step": 129
},
{
"epoch": 4.03125,
"eval_loss": 0.717949390411377,
"eval_runtime": 53.8446,
"eval_samples_per_second": 3.714,
"eval_steps_per_second": 0.241,
"step": 129
},
{
"epoch": 4.0625,
"grad_norm": 0.35709301353799944,
"learning_rate": 2e-05,
"loss": 0.8002,
"step": 130
},
{
"epoch": 4.0625,
"eval_loss": 0.7179380655288696,
"eval_runtime": 53.9299,
"eval_samples_per_second": 3.709,
"eval_steps_per_second": 0.241,
"step": 130
},
{
"epoch": 4.09375,
"grad_norm": 0.3503147340749238,
"learning_rate": 2e-05,
"loss": 0.7789,
"step": 131
},
{
"epoch": 4.09375,
"eval_loss": 0.7180312871932983,
"eval_runtime": 53.4091,
"eval_samples_per_second": 3.745,
"eval_steps_per_second": 0.243,
"step": 131
},
{
"epoch": 4.125,
"grad_norm": 0.3931715546229069,
"learning_rate": 2e-05,
"loss": 0.762,
"step": 132
},
{
"epoch": 4.125,
"eval_loss": 0.717825710773468,
"eval_runtime": 53.6366,
"eval_samples_per_second": 3.729,
"eval_steps_per_second": 0.242,
"step": 132
},
{
"epoch": 4.15625,
"grad_norm": 0.36864033862644363,
"learning_rate": 2e-05,
"loss": 0.829,
"step": 133
},
{
"epoch": 4.15625,
"eval_loss": 0.7178698182106018,
"eval_runtime": 53.4891,
"eval_samples_per_second": 3.739,
"eval_steps_per_second": 0.243,
"step": 133
},
{
"epoch": 4.1875,
"grad_norm": 0.41393587587462155,
"learning_rate": 2e-05,
"loss": 0.7624,
"step": 134
},
{
"epoch": 4.1875,
"eval_loss": 0.7181968092918396,
"eval_runtime": 53.5395,
"eval_samples_per_second": 3.736,
"eval_steps_per_second": 0.243,
"step": 134
},
{
"epoch": 4.21875,
"grad_norm": 0.36727603900023204,
"learning_rate": 2e-05,
"loss": 0.7572,
"step": 135
},
{
"epoch": 4.21875,
"eval_loss": 0.7187527418136597,
"eval_runtime": 53.4818,
"eval_samples_per_second": 3.74,
"eval_steps_per_second": 0.243,
"step": 135
},
{
"epoch": 4.25,
"grad_norm": 0.3684078795455007,
"learning_rate": 2e-05,
"loss": 0.7352,
"step": 136
},
{
"epoch": 4.25,
"eval_loss": 0.7194793820381165,
"eval_runtime": 53.4694,
"eval_samples_per_second": 3.74,
"eval_steps_per_second": 0.243,
"step": 136
},
{
"epoch": 4.28125,
"grad_norm": 0.42414766562621153,
"learning_rate": 2e-05,
"loss": 0.7433,
"step": 137
},
{
"epoch": 4.28125,
"eval_loss": 0.7189603447914124,
"eval_runtime": 53.8049,
"eval_samples_per_second": 3.717,
"eval_steps_per_second": 0.242,
"step": 137
},
{
"epoch": 4.3125,
"grad_norm": 0.40420796619211563,
"learning_rate": 2e-05,
"loss": 0.7466,
"step": 138
},
{
"epoch": 4.3125,
"eval_loss": 0.7173956036567688,
"eval_runtime": 53.4014,
"eval_samples_per_second": 3.745,
"eval_steps_per_second": 0.243,
"step": 138
},
{
"epoch": 4.34375,
"grad_norm": 0.36419740641344456,
"learning_rate": 2e-05,
"loss": 0.7045,
"step": 139
},
{
"epoch": 4.34375,
"eval_loss": 0.7153105139732361,
"eval_runtime": 53.285,
"eval_samples_per_second": 3.753,
"eval_steps_per_second": 0.244,
"step": 139
},
{
"epoch": 4.375,
"grad_norm": 0.384927357409491,
"learning_rate": 2e-05,
"loss": 0.7437,
"step": 140
},
{
"epoch": 4.375,
"eval_loss": 0.7135314345359802,
"eval_runtime": 53.4056,
"eval_samples_per_second": 3.745,
"eval_steps_per_second": 0.243,
"step": 140
},
{
"epoch": 4.40625,
"grad_norm": 0.37218579680263697,
"learning_rate": 2e-05,
"loss": 0.7693,
"step": 141
},
{
"epoch": 4.40625,
"eval_loss": 0.7120725512504578,
"eval_runtime": 53.5467,
"eval_samples_per_second": 3.735,
"eval_steps_per_second": 0.243,
"step": 141
},
{
"epoch": 4.4375,
"grad_norm": 0.38541382926033946,
"learning_rate": 2e-05,
"loss": 0.708,
"step": 142
},
{
"epoch": 4.4375,
"eval_loss": 0.7110380530357361,
"eval_runtime": 53.4119,
"eval_samples_per_second": 3.744,
"eval_steps_per_second": 0.243,
"step": 142
},
{
"epoch": 4.46875,
"grad_norm": 0.4028726453247759,
"learning_rate": 2e-05,
"loss": 0.7263,
"step": 143
},
{
"epoch": 4.46875,
"eval_loss": 0.7100683450698853,
"eval_runtime": 53.4337,
"eval_samples_per_second": 3.743,
"eval_steps_per_second": 0.243,
"step": 143
},
{
"epoch": 4.5,
"grad_norm": 0.3736204162232246,
"learning_rate": 2e-05,
"loss": 0.698,
"step": 144
},
{
"epoch": 4.5,
"eval_loss": 0.7093971371650696,
"eval_runtime": 53.4582,
"eval_samples_per_second": 3.741,
"eval_steps_per_second": 0.243,
"step": 144
},
{
"epoch": 4.53125,
"grad_norm": 0.4179284798304916,
"learning_rate": 2e-05,
"loss": 0.7611,
"step": 145
},
{
"epoch": 4.53125,
"eval_loss": 0.7089446783065796,
"eval_runtime": 53.4752,
"eval_samples_per_second": 3.74,
"eval_steps_per_second": 0.243,
"step": 145
},
{
"epoch": 4.5625,
"grad_norm": 0.4038858950888911,
"learning_rate": 2e-05,
"loss": 0.6652,
"step": 146
},
{
"epoch": 4.5625,
"eval_loss": 0.7089542150497437,
"eval_runtime": 53.4741,
"eval_samples_per_second": 3.74,
"eval_steps_per_second": 0.243,
"step": 146
},
{
"epoch": 4.59375,
"grad_norm": 0.41740068710674544,
"learning_rate": 2e-05,
"loss": 0.7319,
"step": 147
},
{
"epoch": 4.59375,
"eval_loss": 0.7090431451797485,
"eval_runtime": 53.2419,
"eval_samples_per_second": 3.756,
"eval_steps_per_second": 0.244,
"step": 147
},
{
"epoch": 4.625,
"grad_norm": 0.4288335811568808,
"learning_rate": 2e-05,
"loss": 0.6837,
"step": 148
},
{
"epoch": 4.625,
"eval_loss": 0.7088204026222229,
"eval_runtime": 53.3614,
"eval_samples_per_second": 3.748,
"eval_steps_per_second": 0.244,
"step": 148
},
{
"epoch": 4.65625,
"grad_norm": 0.399955010119186,
"learning_rate": 2e-05,
"loss": 0.7989,
"step": 149
},
{
"epoch": 4.65625,
"eval_loss": 0.7084855437278748,
"eval_runtime": 53.4923,
"eval_samples_per_second": 3.739,
"eval_steps_per_second": 0.243,
"step": 149
},
{
"epoch": 4.6875,
"grad_norm": 0.41794643164255846,
"learning_rate": 2e-05,
"loss": 0.7194,
"step": 150
},
{
"epoch": 4.6875,
"eval_loss": 0.7080708146095276,
"eval_runtime": 53.639,
"eval_samples_per_second": 3.729,
"eval_steps_per_second": 0.242,
"step": 150
},
{
"epoch": 4.71875,
"grad_norm": 0.40953367303148197,
"learning_rate": 2e-05,
"loss": 0.7354,
"step": 151
},
{
"epoch": 4.71875,
"eval_loss": 0.7077429890632629,
"eval_runtime": 53.3837,
"eval_samples_per_second": 3.746,
"eval_steps_per_second": 0.244,
"step": 151
},
{
"epoch": 4.75,
"grad_norm": 0.5012282841513718,
"learning_rate": 2e-05,
"loss": 0.7662,
"step": 152
},
{
"epoch": 4.75,
"eval_loss": 0.7064151167869568,
"eval_runtime": 53.3549,
"eval_samples_per_second": 3.748,
"eval_steps_per_second": 0.244,
"step": 152
},
{
"epoch": 4.78125,
"grad_norm": 0.4210784420989087,
"learning_rate": 2e-05,
"loss": 0.7133,
"step": 153
},
{
"epoch": 4.78125,
"eval_loss": 0.7052726745605469,
"eval_runtime": 53.5059,
"eval_samples_per_second": 3.738,
"eval_steps_per_second": 0.243,
"step": 153
},
{
"epoch": 4.8125,
"grad_norm": 0.43520348530514996,
"learning_rate": 2e-05,
"loss": 0.729,
"step": 154
},
{
"epoch": 4.8125,
"eval_loss": 0.7045274972915649,
"eval_runtime": 53.8352,
"eval_samples_per_second": 3.715,
"eval_steps_per_second": 0.241,
"step": 154
},
{
"epoch": 4.84375,
"grad_norm": 0.4287647569802656,
"learning_rate": 2e-05,
"loss": 0.6727,
"step": 155
},
{
"epoch": 4.84375,
"eval_loss": 0.7041358947753906,
"eval_runtime": 53.7435,
"eval_samples_per_second": 3.721,
"eval_steps_per_second": 0.242,
"step": 155
},
{
"epoch": 4.875,
"grad_norm": 0.41883715320456333,
"learning_rate": 2e-05,
"loss": 0.7755,
"step": 156
},
{
"epoch": 4.875,
"eval_loss": 0.7037128210067749,
"eval_runtime": 53.8035,
"eval_samples_per_second": 3.717,
"eval_steps_per_second": 0.242,
"step": 156
},
{
"epoch": 4.90625,
"grad_norm": 0.40617584505395354,
"learning_rate": 2e-05,
"loss": 0.7776,
"step": 157
},
{
"epoch": 4.90625,
"eval_loss": 0.703965425491333,
"eval_runtime": 53.8731,
"eval_samples_per_second": 3.712,
"eval_steps_per_second": 0.241,
"step": 157
},
{
"epoch": 4.9375,
"grad_norm": 0.4085802225532245,
"learning_rate": 2e-05,
"loss": 0.7628,
"step": 158
},
{
"epoch": 4.9375,
"eval_loss": 0.7040860056877136,
"eval_runtime": 53.9059,
"eval_samples_per_second": 3.71,
"eval_steps_per_second": 0.241,
"step": 158
},
{
"epoch": 4.96875,
"grad_norm": 0.418039298119887,
"learning_rate": 2e-05,
"loss": 0.7221,
"step": 159
},
{
"epoch": 4.96875,
"eval_loss": 0.7039948105812073,
"eval_runtime": 53.7323,
"eval_samples_per_second": 3.722,
"eval_steps_per_second": 0.242,
"step": 159
},
{
"epoch": 5.0,
"grad_norm": 0.46118870048713073,
"learning_rate": 2e-05,
"loss": 0.7029,
"step": 160
},
{
"epoch": 5.0,
"eval_loss": 0.703814685344696,
"eval_runtime": 53.8975,
"eval_samples_per_second": 3.711,
"eval_steps_per_second": 0.241,
"step": 160
},
{
"epoch": 5.03125,
"grad_norm": 0.431474386110294,
"learning_rate": 2e-05,
"loss": 0.6772,
"step": 161
},
{
"epoch": 5.03125,
"eval_loss": 0.7034456133842468,
"eval_runtime": 51.1105,
"eval_samples_per_second": 3.913,
"eval_steps_per_second": 0.254,
"step": 161
},
{
"epoch": 5.0625,
"grad_norm": 0.39618929325750435,
"learning_rate": 2e-05,
"loss": 0.8219,
"step": 162
},
{
"epoch": 5.0625,
"eval_loss": 0.7042189240455627,
"eval_runtime": 47.2927,
"eval_samples_per_second": 4.229,
"eval_steps_per_second": 0.275,
"step": 162
},
{
"epoch": 5.09375,
"grad_norm": 0.4489132713249424,
"learning_rate": 2e-05,
"loss": 0.6387,
"step": 163
},
{
"epoch": 5.09375,
"eval_loss": 0.7061256170272827,
"eval_runtime": 47.387,
"eval_samples_per_second": 4.221,
"eval_steps_per_second": 0.274,
"step": 163
},
{
"epoch": 5.125,
"grad_norm": 0.5100329637159183,
"learning_rate": 2e-05,
"loss": 0.7677,
"step": 164
},
{
"epoch": 5.125,
"eval_loss": 0.708121657371521,
"eval_runtime": 47.3311,
"eval_samples_per_second": 4.226,
"eval_steps_per_second": 0.275,
"step": 164
},
{
"epoch": 5.15625,
"grad_norm": 0.525511631981176,
"learning_rate": 2e-05,
"loss": 0.5956,
"step": 165
},
{
"epoch": 5.15625,
"eval_loss": 0.7091134786605835,
"eval_runtime": 47.2978,
"eval_samples_per_second": 4.229,
"eval_steps_per_second": 0.275,
"step": 165
},
{
"epoch": 5.1875,
"grad_norm": 0.534675354231597,
"learning_rate": 2e-05,
"loss": 0.7097,
"step": 166
},
{
"epoch": 5.1875,
"eval_loss": 0.7097848653793335,
"eval_runtime": 47.4095,
"eval_samples_per_second": 4.219,
"eval_steps_per_second": 0.274,
"step": 166
},
{
"epoch": 5.21875,
"grad_norm": 0.47286903698857446,
"learning_rate": 2e-05,
"loss": 0.7371,
"step": 167
},
{
"epoch": 5.21875,
"eval_loss": 0.7090296745300293,
"eval_runtime": 47.4487,
"eval_samples_per_second": 4.215,
"eval_steps_per_second": 0.274,
"step": 167
},
{
"epoch": 5.25,
"grad_norm": 0.4734705066820788,
"learning_rate": 2e-05,
"loss": 0.7652,
"step": 168
},
{
"epoch": 5.25,
"eval_loss": 0.7079525589942932,
"eval_runtime": 47.4101,
"eval_samples_per_second": 4.219,
"eval_steps_per_second": 0.274,
"step": 168
},
{
"epoch": 5.28125,
"grad_norm": 0.46209764763985184,
"learning_rate": 2e-05,
"loss": 0.6852,
"step": 169
},
{
"epoch": 5.28125,
"eval_loss": 0.7072803974151611,
"eval_runtime": 47.3704,
"eval_samples_per_second": 4.222,
"eval_steps_per_second": 0.274,
"step": 169
},
{
"epoch": 5.3125,
"grad_norm": 0.4828284708486433,
"learning_rate": 2e-05,
"loss": 0.6609,
"step": 170
},
{
"epoch": 5.3125,
"eval_loss": 0.7068901062011719,
"eval_runtime": 47.425,
"eval_samples_per_second": 4.217,
"eval_steps_per_second": 0.274,
"step": 170
},
{
"epoch": 5.34375,
"grad_norm": 0.5230116179180577,
"learning_rate": 2e-05,
"loss": 0.6872,
"step": 171
},
{
"epoch": 5.34375,
"eval_loss": 0.7058187127113342,
"eval_runtime": 47.5711,
"eval_samples_per_second": 4.204,
"eval_steps_per_second": 0.273,
"step": 171
},
{
"epoch": 5.375,
"grad_norm": 0.48081340678536255,
"learning_rate": 2e-05,
"loss": 0.7694,
"step": 172
},
{
"epoch": 5.375,
"eval_loss": 0.7044984698295593,
"eval_runtime": 47.4233,
"eval_samples_per_second": 4.217,
"eval_steps_per_second": 0.274,
"step": 172
},
{
"epoch": 5.40625,
"grad_norm": 0.4787525602476421,
"learning_rate": 2e-05,
"loss": 0.7342,
"step": 173
},
{
"epoch": 5.40625,
"eval_loss": 0.7032212018966675,
"eval_runtime": 47.3534,
"eval_samples_per_second": 4.224,
"eval_steps_per_second": 0.275,
"step": 173
},
{
"epoch": 5.4375,
"grad_norm": 0.4871847582306217,
"learning_rate": 2e-05,
"loss": 0.7562,
"step": 174
},
{
"epoch": 5.4375,
"eval_loss": 0.7019696235656738,
"eval_runtime": 47.382,
"eval_samples_per_second": 4.221,
"eval_steps_per_second": 0.274,
"step": 174
},
{
"epoch": 5.46875,
"grad_norm": 0.47999745025553603,
"learning_rate": 2e-05,
"loss": 0.7534,
"step": 175
},
{
"epoch": 5.46875,
"eval_loss": 0.7014529705047607,
"eval_runtime": 47.4435,
"eval_samples_per_second": 4.216,
"eval_steps_per_second": 0.274,
"step": 175
},
{
"epoch": 5.5,
"grad_norm": 0.5168030891996357,
"learning_rate": 2e-05,
"loss": 0.707,
"step": 176
},
{
"epoch": 5.5,
"eval_loss": 0.6993884444236755,
"eval_runtime": 47.4943,
"eval_samples_per_second": 4.211,
"eval_steps_per_second": 0.274,
"step": 176
},
{
"epoch": 5.53125,
"grad_norm": 0.536450206978984,
"learning_rate": 2e-05,
"loss": 0.7318,
"step": 177
},
{
"epoch": 5.53125,
"eval_loss": 0.6971662640571594,
"eval_runtime": 47.4193,
"eval_samples_per_second": 4.218,
"eval_steps_per_second": 0.274,
"step": 177
},
{
"epoch": 5.5625,
"grad_norm": 0.45352543205020696,
"learning_rate": 2e-05,
"loss": 0.7421,
"step": 178
},
{
"epoch": 5.5625,
"eval_loss": 0.6962605118751526,
"eval_runtime": 47.3798,
"eval_samples_per_second": 4.221,
"eval_steps_per_second": 0.274,
"step": 178
},
{
"epoch": 5.59375,
"grad_norm": 0.5054883443109318,
"learning_rate": 2e-05,
"loss": 0.6668,
"step": 179
},
{
"epoch": 5.59375,
"eval_loss": 0.6970357298851013,
"eval_runtime": 47.3311,
"eval_samples_per_second": 4.226,
"eval_steps_per_second": 0.275,
"step": 179
},
{
"epoch": 5.625,
"grad_norm": 0.49584660418833293,
"learning_rate": 2e-05,
"loss": 0.6548,
"step": 180
},
{
"epoch": 5.625,
"eval_loss": 0.6980059146881104,
"eval_runtime": 47.299,
"eval_samples_per_second": 4.228,
"eval_steps_per_second": 0.275,
"step": 180
},
{
"epoch": 5.65625,
"grad_norm": 0.5114381326491793,
"learning_rate": 2e-05,
"loss": 0.6691,
"step": 181
},
{
"epoch": 5.65625,
"eval_loss": 0.6995040774345398,
"eval_runtime": 47.3887,
"eval_samples_per_second": 4.22,
"eval_steps_per_second": 0.274,
"step": 181
},
{
"epoch": 5.6875,
"grad_norm": 0.48550125668870825,
"learning_rate": 2e-05,
"loss": 0.6525,
"step": 182
},
{
"epoch": 5.6875,
"eval_loss": 0.7020326256752014,
"eval_runtime": 47.3838,
"eval_samples_per_second": 4.221,
"eval_steps_per_second": 0.274,
"step": 182
},
{
"epoch": 5.71875,
"grad_norm": 0.5860847796671736,
"learning_rate": 2e-05,
"loss": 0.674,
"step": 183
},
{
"epoch": 5.71875,
"eval_loss": 0.7027825713157654,
"eval_runtime": 47.3875,
"eval_samples_per_second": 4.221,
"eval_steps_per_second": 0.274,
"step": 183
},
{
"epoch": 5.75,
"grad_norm": 0.5535582209035479,
"learning_rate": 2e-05,
"loss": 0.6643,
"step": 184
},
{
"epoch": 5.75,
"eval_loss": 0.7025408148765564,
"eval_runtime": 47.5534,
"eval_samples_per_second": 4.206,
"eval_steps_per_second": 0.273,
"step": 184
},
{
"epoch": 5.78125,
"grad_norm": 0.5443574176405931,
"learning_rate": 2e-05,
"loss": 0.709,
"step": 185
},
{
"epoch": 5.78125,
"eval_loss": 0.7007840871810913,
"eval_runtime": 47.4469,
"eval_samples_per_second": 4.215,
"eval_steps_per_second": 0.274,
"step": 185
},
{
"epoch": 5.8125,
"grad_norm": 0.563830259704143,
"learning_rate": 2e-05,
"loss": 0.6884,
"step": 186
},
{
"epoch": 5.8125,
"eval_loss": 0.6979361176490784,
"eval_runtime": 49.1203,
"eval_samples_per_second": 4.072,
"eval_steps_per_second": 0.265,
"step": 186
},
{
"epoch": 5.84375,
"grad_norm": 0.5094956892765212,
"learning_rate": 2e-05,
"loss": 0.7318,
"step": 187
},
{
"epoch": 5.84375,
"eval_loss": 0.6962587237358093,
"eval_runtime": 49.1831,
"eval_samples_per_second": 4.066,
"eval_steps_per_second": 0.264,
"step": 187
},
{
"epoch": 5.875,
"grad_norm": 0.5264819980742595,
"learning_rate": 2e-05,
"loss": 0.6746,
"step": 188
},
{
"epoch": 5.875,
"eval_loss": 0.694776713848114,
"eval_runtime": 49.1994,
"eval_samples_per_second": 4.065,
"eval_steps_per_second": 0.264,
"step": 188
},
{
"epoch": 5.90625,
"grad_norm": 0.4737429304023209,
"learning_rate": 2e-05,
"loss": 0.664,
"step": 189
},
{
"epoch": 5.90625,
"eval_loss": 0.6939517855644226,
"eval_runtime": 49.2438,
"eval_samples_per_second": 4.061,
"eval_steps_per_second": 0.264,
"step": 189
},
{
"epoch": 5.9375,
"grad_norm": 0.494163934813738,
"learning_rate": 2e-05,
"loss": 0.6978,
"step": 190
},
{
"epoch": 5.9375,
"eval_loss": 0.6933834552764893,
"eval_runtime": 49.3494,
"eval_samples_per_second": 4.053,
"eval_steps_per_second": 0.263,
"step": 190
},
{
"epoch": 5.96875,
"grad_norm": 0.4945972278087299,
"learning_rate": 2e-05,
"loss": 0.6909,
"step": 191
},
{
"epoch": 5.96875,
"eval_loss": 0.6924250721931458,
"eval_runtime": 50.3255,
"eval_samples_per_second": 3.974,
"eval_steps_per_second": 0.258,
"step": 191
},
{
"epoch": 6.0,
"grad_norm": 0.48872556688745233,
"learning_rate": 2e-05,
"loss": 0.6622,
"step": 192
},
{
"epoch": 6.0,
"eval_loss": 0.6922193765640259,
"eval_runtime": 50.4561,
"eval_samples_per_second": 3.964,
"eval_steps_per_second": 0.258,
"step": 192
},
{
"epoch": 6.03125,
"grad_norm": 0.5013452255378538,
"learning_rate": 2e-05,
"loss": 0.7458,
"step": 193
},
{
"epoch": 6.03125,
"eval_loss": 0.6931161284446716,
"eval_runtime": 50.5049,
"eval_samples_per_second": 3.96,
"eval_steps_per_second": 0.257,
"step": 193
},
{
"epoch": 6.0625,
"grad_norm": 0.48271161232093784,
"learning_rate": 2e-05,
"loss": 0.7171,
"step": 194
},
{
"epoch": 6.0625,
"eval_loss": 0.6959040760993958,
"eval_runtime": 50.2441,
"eval_samples_per_second": 3.981,
"eval_steps_per_second": 0.259,
"step": 194
},
{
"epoch": 6.09375,
"grad_norm": 0.5414562703154852,
"learning_rate": 2e-05,
"loss": 0.6419,
"step": 195
},
{
"epoch": 6.09375,
"eval_loss": 0.7000604271888733,
"eval_runtime": 50.4261,
"eval_samples_per_second": 3.966,
"eval_steps_per_second": 0.258,
"step": 195
},
{
"epoch": 6.125,
"grad_norm": 0.5074661247335385,
"learning_rate": 2e-05,
"loss": 0.6881,
"step": 196
},
{
"epoch": 6.125,
"eval_loss": 0.7039622664451599,
"eval_runtime": 51.5214,
"eval_samples_per_second": 3.882,
"eval_steps_per_second": 0.252,
"step": 196
},
{
"epoch": 6.15625,
"grad_norm": 0.5603468534764365,
"learning_rate": 2e-05,
"loss": 0.7085,
"step": 197
},
{
"epoch": 6.15625,
"eval_loss": 0.7055023312568665,
"eval_runtime": 51.7102,
"eval_samples_per_second": 3.868,
"eval_steps_per_second": 0.251,
"step": 197
},
{
"epoch": 6.1875,
"grad_norm": 0.5992190802422799,
"learning_rate": 2e-05,
"loss": 0.7614,
"step": 198
},
{
"epoch": 6.1875,
"eval_loss": 0.7046856880187988,
"eval_runtime": 51.5464,
"eval_samples_per_second": 3.88,
"eval_steps_per_second": 0.252,
"step": 198
},
{
"epoch": 6.21875,
"grad_norm": 0.6293684167527106,
"learning_rate": 2e-05,
"loss": 0.6435,
"step": 199
},
{
"epoch": 6.21875,
"eval_loss": 0.7021151781082153,
"eval_runtime": 51.5328,
"eval_samples_per_second": 3.881,
"eval_steps_per_second": 0.252,
"step": 199
},
{
"epoch": 6.25,
"grad_norm": 0.591265449241434,
"learning_rate": 2e-05,
"loss": 0.688,
"step": 200
},
{
"epoch": 6.25,
"eval_loss": 0.7002359628677368,
"eval_runtime": 51.5812,
"eval_samples_per_second": 3.877,
"eval_steps_per_second": 0.252,
"step": 200
},
{
"epoch": 6.28125,
"grad_norm": 0.543141536526749,
"learning_rate": 2e-05,
"loss": 0.7027,
"step": 201
},
{
"epoch": 6.28125,
"eval_loss": 0.6986366510391235,
"eval_runtime": 52.6956,
"eval_samples_per_second": 3.795,
"eval_steps_per_second": 0.247,
"step": 201
},
{
"epoch": 6.3125,
"grad_norm": 0.5679656300203245,
"learning_rate": 2e-05,
"loss": 0.625,
"step": 202
},
{
"epoch": 6.3125,
"eval_loss": 0.698679506778717,
"eval_runtime": 52.5102,
"eval_samples_per_second": 3.809,
"eval_steps_per_second": 0.248,
"step": 202
},
{
"epoch": 6.34375,
"grad_norm": 0.5285839896523021,
"learning_rate": 2e-05,
"loss": 0.7687,
"step": 203
},
{
"epoch": 6.34375,
"eval_loss": 0.7005956768989563,
"eval_runtime": 52.6067,
"eval_samples_per_second": 3.802,
"eval_steps_per_second": 0.247,
"step": 203
},
{
"epoch": 6.375,
"grad_norm": 0.6512964945211068,
"learning_rate": 2e-05,
"loss": 0.623,
"step": 204
},
{
"epoch": 6.375,
"eval_loss": 0.7013595104217529,
"eval_runtime": 52.5428,
"eval_samples_per_second": 3.806,
"eval_steps_per_second": 0.247,
"step": 204
},
{
"epoch": 6.40625,
"grad_norm": 0.5295248631519638,
"learning_rate": 2e-05,
"loss": 0.5941,
"step": 205
},
{
"epoch": 6.40625,
"eval_loss": 0.7016547322273254,
"eval_runtime": 52.6142,
"eval_samples_per_second": 3.801,
"eval_steps_per_second": 0.247,
"step": 205
},
{
"epoch": 6.4375,
"grad_norm": 0.6134157701434021,
"learning_rate": 2e-05,
"loss": 0.6506,
"step": 206
},
{
"epoch": 6.4375,
"eval_loss": 0.7009623646736145,
"eval_runtime": 52.1942,
"eval_samples_per_second": 3.832,
"eval_steps_per_second": 0.249,
"step": 206
},
{
"epoch": 6.46875,
"grad_norm": 0.57886797614996,
"learning_rate": 2e-05,
"loss": 0.6983,
"step": 207
},
{
"epoch": 6.46875,
"eval_loss": 0.6988092064857483,
"eval_runtime": 52.2577,
"eval_samples_per_second": 3.827,
"eval_steps_per_second": 0.249,
"step": 207
},
{
"epoch": 6.5,
"grad_norm": 0.5593482836944472,
"learning_rate": 2e-05,
"loss": 0.6348,
"step": 208
},
{
"epoch": 6.5,
"eval_loss": 0.698823094367981,
"eval_runtime": 52.2296,
"eval_samples_per_second": 3.829,
"eval_steps_per_second": 0.249,
"step": 208
},
{
"epoch": 6.53125,
"grad_norm": 0.662802162179718,
"learning_rate": 2e-05,
"loss": 0.6206,
"step": 209
},
{
"epoch": 6.53125,
"eval_loss": 0.6990167498588562,
"eval_runtime": 52.4316,
"eval_samples_per_second": 3.814,
"eval_steps_per_second": 0.248,
"step": 209
},
{
"epoch": 6.5625,
"grad_norm": 0.6874374231122908,
"learning_rate": 2e-05,
"loss": 0.6033,
"step": 210
},
{
"epoch": 6.5625,
"eval_loss": 0.699796736240387,
"eval_runtime": 52.3193,
"eval_samples_per_second": 3.823,
"eval_steps_per_second": 0.248,
"step": 210
},
{
"epoch": 6.59375,
"grad_norm": 0.6625766736772473,
"learning_rate": 2e-05,
"loss": 0.6398,
"step": 211
},
{
"epoch": 6.59375,
"eval_loss": 0.6989737153053284,
"eval_runtime": 52.1885,
"eval_samples_per_second": 3.832,
"eval_steps_per_second": 0.249,
"step": 211
},
{
"epoch": 6.625,
"grad_norm": 0.6563419096027812,
"learning_rate": 2e-05,
"loss": 0.6119,
"step": 212
},
{
"epoch": 6.625,
"eval_loss": 0.6973609924316406,
"eval_runtime": 52.1628,
"eval_samples_per_second": 3.834,
"eval_steps_per_second": 0.249,
"step": 212
},
{
"epoch": 6.65625,
"grad_norm": 0.5796353226697397,
"learning_rate": 2e-05,
"loss": 0.7041,
"step": 213
},
{
"epoch": 6.65625,
"eval_loss": 0.6957942247390747,
"eval_runtime": 52.2028,
"eval_samples_per_second": 3.831,
"eval_steps_per_second": 0.249,
"step": 213
},
{
"epoch": 6.6875,
"grad_norm": 0.5711947110504899,
"learning_rate": 2e-05,
"loss": 0.6465,
"step": 214
},
{
"epoch": 6.6875,
"eval_loss": 0.696739673614502,
"eval_runtime": 52.1849,
"eval_samples_per_second": 3.833,
"eval_steps_per_second": 0.249,
"step": 214
},
{
"epoch": 6.71875,
"grad_norm": 0.6619502413653232,
"learning_rate": 2e-05,
"loss": 0.6563,
"step": 215
},
{
"epoch": 6.71875,
"eval_loss": 0.6960940361022949,
"eval_runtime": 52.0996,
"eval_samples_per_second": 3.839,
"eval_steps_per_second": 0.25,
"step": 215
},
{
"epoch": 6.75,
"grad_norm": 0.6587126256919645,
"learning_rate": 2e-05,
"loss": 0.6505,
"step": 216
},
{
"epoch": 6.75,
"eval_loss": 0.6959022283554077,
"eval_runtime": 52.1062,
"eval_samples_per_second": 3.838,
"eval_steps_per_second": 0.249,
"step": 216
},
{
"epoch": 6.78125,
"grad_norm": 0.648164277941964,
"learning_rate": 2e-05,
"loss": 0.5969,
"step": 217
},
{
"epoch": 6.78125,
"eval_loss": 0.6999121308326721,
"eval_runtime": 51.9356,
"eval_samples_per_second": 3.851,
"eval_steps_per_second": 0.25,
"step": 217
},
{
"epoch": 6.8125,
"grad_norm": 0.6595860789738482,
"learning_rate": 2e-05,
"loss": 0.5945,
"step": 218
},
{
"epoch": 6.8125,
"eval_loss": 0.7028067111968994,
"eval_runtime": 52.2232,
"eval_samples_per_second": 3.83,
"eval_steps_per_second": 0.249,
"step": 218
},
{
"epoch": 6.84375,
"grad_norm": 0.7116894779822719,
"learning_rate": 2e-05,
"loss": 0.7027,
"step": 219
},
{
"epoch": 6.84375,
"eval_loss": 0.7035638689994812,
"eval_runtime": 52.1471,
"eval_samples_per_second": 3.835,
"eval_steps_per_second": 0.249,
"step": 219
},
{
"epoch": 6.875,
"grad_norm": 0.7581142336087988,
"learning_rate": 2e-05,
"loss": 0.7171,
"step": 220
},
{
"epoch": 6.875,
"eval_loss": 0.6981176733970642,
"eval_runtime": 52.1366,
"eval_samples_per_second": 3.836,
"eval_steps_per_second": 0.249,
"step": 220
},
{
"epoch": 6.90625,
"grad_norm": 0.6261292745909233,
"learning_rate": 2e-05,
"loss": 0.658,
"step": 221
},
{
"epoch": 6.90625,
"eval_loss": 0.6939045786857605,
"eval_runtime": 52.2211,
"eval_samples_per_second": 3.83,
"eval_steps_per_second": 0.249,
"step": 221
},
{
"epoch": 6.9375,
"grad_norm": 0.7256427809370966,
"learning_rate": 2e-05,
"loss": 0.6576,
"step": 222
},
{
"epoch": 6.9375,
"eval_loss": 0.6904327273368835,
"eval_runtime": 52.1829,
"eval_samples_per_second": 3.833,
"eval_steps_per_second": 0.249,
"step": 222
},
{
"epoch": 6.96875,
"grad_norm": 0.6653711103404113,
"learning_rate": 2e-05,
"loss": 0.6938,
"step": 223
},
{
"epoch": 6.96875,
"eval_loss": 0.6893274188041687,
"eval_runtime": 51.899,
"eval_samples_per_second": 3.854,
"eval_steps_per_second": 0.25,
"step": 223
},
{
"epoch": 7.0,
"grad_norm": 0.6730688267524797,
"learning_rate": 2e-05,
"loss": 0.7397,
"step": 224
},
{
"epoch": 7.0,
"eval_loss": 0.6895740032196045,
"eval_runtime": 52.1977,
"eval_samples_per_second": 3.832,
"eval_steps_per_second": 0.249,
"step": 224
}
],
"logging_steps": 1.0,
"max_steps": 224,
"num_input_tokens_seen": 0,
"num_train_epochs": 7,
"save_steps": 5,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 322567586447360.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}