cosmicroberta / trainer_state.json
PaulDrm
first commit model
9d26fad
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 35.026963262554766,
"global_step": 12960,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.32,
"learning_rate": 8e-05,
"loss": 1.9241,
"step": 120
},
{
"epoch": 0.65,
"learning_rate": 0.00016,
"loss": 1.8026,
"step": 240
},
{
"epoch": 0.65,
"eval_loss": 1.7006735801696777,
"eval_runtime": 120.5725,
"eval_samples_per_second": 46.437,
"eval_steps_per_second": 0.73,
"step": 240
},
{
"epoch": 0.97,
"learning_rate": 0.0002,
"loss": 1.7588,
"step": 360
},
{
"epoch": 1.3,
"learning_rate": 0.0002,
"loss": 1.7242,
"step": 480
},
{
"epoch": 1.3,
"eval_loss": 1.6368365287780762,
"eval_runtime": 123.5326,
"eval_samples_per_second": 45.324,
"eval_steps_per_second": 0.712,
"step": 480
},
{
"epoch": 1.62,
"learning_rate": 0.0002,
"loss": 1.6797,
"step": 600
},
{
"epoch": 1.94,
"learning_rate": 0.0002,
"loss": 1.6544,
"step": 720
},
{
"epoch": 1.94,
"eval_loss": 1.589858889579773,
"eval_runtime": 121.9204,
"eval_samples_per_second": 45.923,
"eval_steps_per_second": 0.722,
"step": 720
},
{
"epoch": 2.27,
"learning_rate": 0.0002,
"loss": 1.639,
"step": 840
},
{
"epoch": 2.59,
"learning_rate": 0.0002,
"loss": 1.6103,
"step": 960
},
{
"epoch": 2.59,
"eval_loss": 1.559193730354309,
"eval_runtime": 118.3836,
"eval_samples_per_second": 47.295,
"eval_steps_per_second": 0.743,
"step": 960
},
{
"epoch": 2.92,
"learning_rate": 0.0002,
"loss": 1.5982,
"step": 1080
},
{
"epoch": 3.24,
"learning_rate": 0.0002,
"loss": 1.5858,
"step": 1200
},
{
"epoch": 3.24,
"eval_loss": 1.5362491607666016,
"eval_runtime": 123.4422,
"eval_samples_per_second": 45.357,
"eval_steps_per_second": 0.713,
"step": 1200
},
{
"epoch": 3.57,
"learning_rate": 0.0002,
"loss": 1.5684,
"step": 1320
},
{
"epoch": 3.89,
"learning_rate": 0.0002,
"loss": 1.5566,
"step": 1440
},
{
"epoch": 3.89,
"eval_loss": 1.51528799533844,
"eval_runtime": 120.2858,
"eval_samples_per_second": 46.547,
"eval_steps_per_second": 0.732,
"step": 1440
},
{
"epoch": 4.22,
"learning_rate": 0.0002,
"loss": 1.5593,
"step": 1560
},
{
"epoch": 4.54,
"learning_rate": 0.0002,
"loss": 1.5322,
"step": 1680
},
{
"epoch": 4.54,
"eval_loss": 1.5114836692810059,
"eval_runtime": 117.8482,
"eval_samples_per_second": 47.51,
"eval_steps_per_second": 0.747,
"step": 1680
},
{
"epoch": 4.86,
"learning_rate": 0.0002,
"loss": 1.5285,
"step": 1800
},
{
"epoch": 5.19,
"learning_rate": 0.0002,
"loss": 1.5359,
"step": 1920
},
{
"epoch": 5.19,
"eval_loss": 1.48625910282135,
"eval_runtime": 123.7493,
"eval_samples_per_second": 45.245,
"eval_steps_per_second": 0.711,
"step": 1920
},
{
"epoch": 5.51,
"learning_rate": 0.0002,
"loss": 1.5207,
"step": 2040
},
{
"epoch": 5.84,
"learning_rate": 0.0002,
"loss": 1.5079,
"step": 2160
},
{
"epoch": 5.84,
"eval_loss": 1.4822603464126587,
"eval_runtime": 119.2091,
"eval_samples_per_second": 46.968,
"eval_steps_per_second": 0.738,
"step": 2160
},
{
"epoch": 6.16,
"learning_rate": 0.0002,
"loss": 1.51,
"step": 2280
},
{
"epoch": 6.49,
"learning_rate": 0.0002,
"loss": 1.4909,
"step": 2400
},
{
"epoch": 6.49,
"eval_loss": 1.4646539688110352,
"eval_runtime": 122.8427,
"eval_samples_per_second": 45.579,
"eval_steps_per_second": 0.716,
"step": 2400
},
{
"epoch": 6.81,
"learning_rate": 0.0002,
"loss": 1.4869,
"step": 2520
},
{
"epoch": 7.13,
"learning_rate": 0.0002,
"loss": 1.4894,
"step": 2640
},
{
"epoch": 7.13,
"eval_loss": 1.4567737579345703,
"eval_runtime": 112.4698,
"eval_samples_per_second": 49.782,
"eval_steps_per_second": 0.782,
"step": 2640
},
{
"epoch": 7.46,
"learning_rate": 0.0002,
"loss": 1.4705,
"step": 2760
},
{
"epoch": 7.78,
"learning_rate": 0.0002,
"loss": 1.469,
"step": 2880
},
{
"epoch": 7.78,
"eval_loss": 1.447322130203247,
"eval_runtime": 124.434,
"eval_samples_per_second": 44.996,
"eval_steps_per_second": 0.707,
"step": 2880
},
{
"epoch": 8.11,
"learning_rate": 0.0002,
"loss": 1.4716,
"step": 3000
},
{
"epoch": 8.43,
"learning_rate": 0.0002,
"loss": 1.4525,
"step": 3120
},
{
"epoch": 8.43,
"eval_loss": 1.4480490684509277,
"eval_runtime": 120.9825,
"eval_samples_per_second": 46.279,
"eval_steps_per_second": 0.727,
"step": 3120
},
{
"epoch": 8.75,
"learning_rate": 0.0002,
"loss": 1.452,
"step": 3240
},
{
"epoch": 9.08,
"learning_rate": 0.0002,
"loss": 1.4552,
"step": 3360
},
{
"epoch": 9.08,
"eval_loss": 1.4297771453857422,
"eval_runtime": 119.4349,
"eval_samples_per_second": 46.879,
"eval_steps_per_second": 0.737,
"step": 3360
},
{
"epoch": 9.4,
"learning_rate": 0.0002,
"loss": 1.4369,
"step": 3480
},
{
"epoch": 9.73,
"learning_rate": 0.0002,
"loss": 1.4357,
"step": 3600
},
{
"epoch": 9.73,
"eval_loss": 1.4253787994384766,
"eval_runtime": 123.7286,
"eval_samples_per_second": 45.252,
"eval_steps_per_second": 0.711,
"step": 3600
},
{
"epoch": 10.05,
"learning_rate": 0.0002,
"loss": 1.4449,
"step": 3720
},
{
"epoch": 10.38,
"learning_rate": 0.0002,
"loss": 1.4245,
"step": 3840
},
{
"epoch": 10.38,
"eval_loss": 1.419893741607666,
"eval_runtime": 122.5962,
"eval_samples_per_second": 45.67,
"eval_steps_per_second": 0.718,
"step": 3840
},
{
"epoch": 10.7,
"learning_rate": 0.0002,
"loss": 1.4259,
"step": 3960
},
{
"epoch": 11.03,
"learning_rate": 0.0002,
"loss": 1.4317,
"step": 4080
},
{
"epoch": 11.03,
"eval_loss": 1.4151264429092407,
"eval_runtime": 120.6018,
"eval_samples_per_second": 46.426,
"eval_steps_per_second": 0.73,
"step": 4080
},
{
"epoch": 11.35,
"learning_rate": 0.0002,
"loss": 1.4133,
"step": 4200
},
{
"epoch": 11.67,
"learning_rate": 0.0002,
"loss": 1.4119,
"step": 4320
},
{
"epoch": 11.67,
"eval_loss": 1.4069455862045288,
"eval_runtime": 123.9031,
"eval_samples_per_second": 45.189,
"eval_steps_per_second": 0.71,
"step": 4320
},
{
"epoch": 12.0,
"learning_rate": 0.0002,
"loss": 1.4096,
"step": 4440
},
{
"epoch": 12.32,
"learning_rate": 0.0002,
"loss": 1.4086,
"step": 4560
},
{
"epoch": 12.32,
"eval_loss": 1.4099173545837402,
"eval_runtime": 121.1011,
"eval_samples_per_second": 46.234,
"eval_steps_per_second": 0.727,
"step": 4560
},
{
"epoch": 12.65,
"learning_rate": 0.0002,
"loss": 1.4031,
"step": 4680
},
{
"epoch": 12.97,
"learning_rate": 0.0002,
"loss": 1.401,
"step": 4800
},
{
"epoch": 12.97,
"eval_loss": 1.4046831130981445,
"eval_runtime": 121.8177,
"eval_samples_per_second": 45.962,
"eval_steps_per_second": 0.722,
"step": 4800
},
{
"epoch": 13.3,
"learning_rate": 0.0002,
"loss": 1.4031,
"step": 4920
},
{
"epoch": 13.62,
"learning_rate": 0.0002,
"loss": 1.394,
"step": 5040
},
{
"epoch": 13.62,
"eval_loss": 1.401537299156189,
"eval_runtime": 121.4356,
"eval_samples_per_second": 46.107,
"eval_steps_per_second": 0.725,
"step": 5040
},
{
"epoch": 13.94,
"learning_rate": 0.0002,
"loss": 1.3922,
"step": 5160
},
{
"epoch": 14.27,
"learning_rate": 0.0002,
"loss": 1.3945,
"step": 5280
},
{
"epoch": 14.27,
"eval_loss": 1.3918230533599854,
"eval_runtime": 119.2233,
"eval_samples_per_second": 46.962,
"eval_steps_per_second": 0.738,
"step": 5280
},
{
"epoch": 14.59,
"learning_rate": 0.0002,
"loss": 1.3836,
"step": 5400
},
{
"epoch": 14.92,
"learning_rate": 0.0002,
"loss": 1.3838,
"step": 5520
},
{
"epoch": 14.92,
"eval_loss": 1.385350227355957,
"eval_runtime": 113.4489,
"eval_samples_per_second": 49.353,
"eval_steps_per_second": 0.776,
"step": 5520
},
{
"epoch": 15.24,
"learning_rate": 0.0002,
"loss": 1.387,
"step": 5640
},
{
"epoch": 15.57,
"learning_rate": 0.0002,
"loss": 1.3722,
"step": 5760
},
{
"epoch": 15.57,
"eval_loss": 1.379088282585144,
"eval_runtime": 116.4932,
"eval_samples_per_second": 48.063,
"eval_steps_per_second": 0.755,
"step": 5760
},
{
"epoch": 15.89,
"learning_rate": 0.0002,
"loss": 1.3757,
"step": 5880
},
{
"epoch": 16.22,
"learning_rate": 0.0002,
"loss": 1.3775,
"step": 6000
},
{
"epoch": 16.22,
"eval_loss": 1.384007453918457,
"eval_runtime": 115.8099,
"eval_samples_per_second": 48.346,
"eval_steps_per_second": 0.76,
"step": 6000
},
{
"epoch": 16.54,
"learning_rate": 0.0002,
"loss": 1.3683,
"step": 6120
},
{
"epoch": 16.86,
"learning_rate": 0.0002,
"loss": 1.3675,
"step": 6240
},
{
"epoch": 16.86,
"eval_loss": 1.3760778903961182,
"eval_runtime": 113.2638,
"eval_samples_per_second": 49.433,
"eval_steps_per_second": 0.777,
"step": 6240
},
{
"epoch": 17.19,
"learning_rate": 0.0002,
"loss": 1.375,
"step": 6360
},
{
"epoch": 17.51,
"learning_rate": 0.0002,
"loss": 1.358,
"step": 6480
},
{
"epoch": 17.51,
"eval_loss": 1.3729970455169678,
"eval_runtime": 119.1962,
"eval_samples_per_second": 46.973,
"eval_steps_per_second": 0.738,
"step": 6480
},
{
"epoch": 17.84,
"learning_rate": 0.0002,
"loss": 1.3617,
"step": 6600
},
{
"epoch": 18.16,
"learning_rate": 0.0002,
"loss": 1.3679,
"step": 6720
},
{
"epoch": 18.16,
"eval_loss": 1.3826600313186646,
"eval_runtime": 118.9849,
"eval_samples_per_second": 47.056,
"eval_steps_per_second": 0.74,
"step": 6720
},
{
"epoch": 18.49,
"learning_rate": 0.0002,
"loss": 1.3592,
"step": 6840
},
{
"epoch": 18.81,
"learning_rate": 0.0002,
"loss": 1.3602,
"step": 6960
},
{
"epoch": 18.81,
"eval_loss": 1.3659363985061646,
"eval_runtime": 120.7081,
"eval_samples_per_second": 46.385,
"eval_steps_per_second": 0.729,
"step": 6960
},
{
"epoch": 19.13,
"learning_rate": 0.0002,
"loss": 1.3633,
"step": 7080
},
{
"epoch": 19.46,
"learning_rate": 0.0002,
"loss": 1.3522,
"step": 7200
},
{
"epoch": 19.46,
"eval_loss": 1.372406244277954,
"eval_runtime": 113.6178,
"eval_samples_per_second": 49.279,
"eval_steps_per_second": 0.775,
"step": 7200
},
{
"epoch": 19.78,
"learning_rate": 0.0002,
"loss": 1.345,
"step": 7320
},
{
"epoch": 20.11,
"learning_rate": 0.0002,
"loss": 1.3555,
"step": 7440
},
{
"epoch": 20.11,
"eval_loss": 1.368371844291687,
"eval_runtime": 118.9369,
"eval_samples_per_second": 47.075,
"eval_steps_per_second": 0.74,
"step": 7440
},
{
"epoch": 20.43,
"learning_rate": 0.0002,
"loss": 1.3396,
"step": 7560
},
{
"epoch": 20.75,
"learning_rate": 0.0002,
"loss": 1.3536,
"step": 7680
},
{
"epoch": 20.75,
"eval_loss": 1.3611598014831543,
"eval_runtime": 119.3386,
"eval_samples_per_second": 46.917,
"eval_steps_per_second": 0.737,
"step": 7680
},
{
"epoch": 21.08,
"learning_rate": 0.0002,
"loss": 1.3506,
"step": 7800
},
{
"epoch": 21.4,
"learning_rate": 0.0002,
"loss": 1.3347,
"step": 7920
},
{
"epoch": 21.4,
"eval_loss": 1.3598804473876953,
"eval_runtime": 114.0961,
"eval_samples_per_second": 49.073,
"eval_steps_per_second": 0.771,
"step": 7920
},
{
"epoch": 21.73,
"learning_rate": 0.0002,
"loss": 1.338,
"step": 8040
},
{
"epoch": 22.05,
"learning_rate": 0.0002,
"loss": 1.3463,
"step": 8160
},
{
"epoch": 22.05,
"eval_loss": 1.3614617586135864,
"eval_runtime": 121.7757,
"eval_samples_per_second": 45.978,
"eval_steps_per_second": 0.723,
"step": 8160
},
{
"epoch": 22.38,
"learning_rate": 0.0002,
"loss": 1.3305,
"step": 8280
},
{
"epoch": 22.7,
"learning_rate": 0.0002,
"loss": 1.3296,
"step": 8400
},
{
"epoch": 22.7,
"eval_loss": 1.359055519104004,
"eval_runtime": 113.3148,
"eval_samples_per_second": 49.411,
"eval_steps_per_second": 0.777,
"step": 8400
},
{
"epoch": 23.03,
"learning_rate": 0.0002,
"loss": 1.344,
"step": 8520
},
{
"epoch": 23.35,
"learning_rate": 0.0002,
"loss": 1.3201,
"step": 8640
},
{
"epoch": 23.35,
"eval_loss": 1.358960509300232,
"eval_runtime": 122.2886,
"eval_samples_per_second": 45.785,
"eval_steps_per_second": 0.72,
"step": 8640
},
{
"epoch": 23.67,
"learning_rate": 0.0002,
"loss": 1.3302,
"step": 8760
},
{
"epoch": 24.0,
"learning_rate": 0.0002,
"loss": 1.3292,
"step": 8880
},
{
"epoch": 24.0,
"eval_loss": 1.3509206771850586,
"eval_runtime": 99.6058,
"eval_samples_per_second": 56.212,
"eval_steps_per_second": 0.883,
"step": 8880
},
{
"epoch": 24.32,
"learning_rate": 0.0002,
"loss": 1.3294,
"step": 9000
},
{
"epoch": 24.65,
"learning_rate": 0.0002,
"loss": 1.3207,
"step": 9120
},
{
"epoch": 24.65,
"eval_loss": 1.357851505279541,
"eval_runtime": 105.9073,
"eval_samples_per_second": 52.867,
"eval_steps_per_second": 0.831,
"step": 9120
},
{
"epoch": 24.97,
"learning_rate": 0.0002,
"loss": 1.3215,
"step": 9240
},
{
"epoch": 25.3,
"learning_rate": 0.0002,
"loss": 1.3231,
"step": 9360
},
{
"epoch": 25.3,
"eval_loss": 1.3393853902816772,
"eval_runtime": 99.7219,
"eval_samples_per_second": 56.146,
"eval_steps_per_second": 0.882,
"step": 9360
},
{
"epoch": 25.62,
"learning_rate": 0.0002,
"loss": 1.3121,
"step": 9480
},
{
"epoch": 25.94,
"learning_rate": 0.0002,
"loss": 1.3176,
"step": 9600
},
{
"epoch": 25.94,
"eval_loss": 1.3441215753555298,
"eval_runtime": 101.3937,
"eval_samples_per_second": 55.22,
"eval_steps_per_second": 0.868,
"step": 9600
},
{
"epoch": 26.27,
"learning_rate": 0.0002,
"loss": 1.3188,
"step": 9720
},
{
"epoch": 26.59,
"learning_rate": 0.0002,
"loss": 1.3103,
"step": 9840
},
{
"epoch": 26.59,
"eval_loss": 1.3429008722305298,
"eval_runtime": 100.8116,
"eval_samples_per_second": 55.539,
"eval_steps_per_second": 0.873,
"step": 9840
},
{
"epoch": 26.92,
"learning_rate": 0.0002,
"loss": 1.313,
"step": 9960
},
{
"epoch": 27.24,
"learning_rate": 0.0002,
"loss": 1.3156,
"step": 10080
},
{
"epoch": 27.24,
"eval_loss": 1.3400343656539917,
"eval_runtime": 98.2948,
"eval_samples_per_second": 56.961,
"eval_steps_per_second": 0.895,
"step": 10080
},
{
"epoch": 27.57,
"learning_rate": 0.0002,
"loss": 1.3064,
"step": 10200
},
{
"epoch": 27.89,
"learning_rate": 0.0002,
"loss": 1.306,
"step": 10320
},
{
"epoch": 27.89,
"eval_loss": 1.339460015296936,
"eval_runtime": 97.8707,
"eval_samples_per_second": 57.208,
"eval_steps_per_second": 0.899,
"step": 10320
},
{
"epoch": 28.22,
"learning_rate": 0.0002,
"loss": 1.3093,
"step": 10440
},
{
"epoch": 28.54,
"learning_rate": 0.0002,
"loss": 1.3026,
"step": 10560
},
{
"epoch": 28.54,
"eval_loss": 1.3380861282348633,
"eval_runtime": 99.7827,
"eval_samples_per_second": 56.112,
"eval_steps_per_second": 0.882,
"step": 10560
},
{
"epoch": 28.86,
"learning_rate": 0.0002,
"loss": 1.3014,
"step": 10680
},
{
"epoch": 29.19,
"learning_rate": 0.0002,
"loss": 1.3093,
"step": 10800
},
{
"epoch": 29.19,
"eval_loss": 1.335351824760437,
"eval_runtime": 99.7514,
"eval_samples_per_second": 56.13,
"eval_steps_per_second": 0.882,
"step": 10800
},
{
"epoch": 29.51,
"learning_rate": 0.0002,
"loss": 1.2954,
"step": 10920
},
{
"epoch": 29.84,
"learning_rate": 0.0002,
"loss": 1.2982,
"step": 11040
},
{
"epoch": 29.84,
"eval_loss": 1.33037269115448,
"eval_runtime": 111.392,
"eval_samples_per_second": 50.264,
"eval_steps_per_second": 0.79,
"step": 11040
},
{
"epoch": 30.16,
"learning_rate": 0.0002,
"loss": 1.3032,
"step": 11160
},
{
"epoch": 30.49,
"learning_rate": 0.0002,
"loss": 1.2927,
"step": 11280
},
{
"epoch": 30.49,
"eval_loss": 1.3423055410385132,
"eval_runtime": 110.815,
"eval_samples_per_second": 50.526,
"eval_steps_per_second": 0.794,
"step": 11280
},
{
"epoch": 30.81,
"learning_rate": 0.0002,
"loss": 1.2968,
"step": 11400
},
{
"epoch": 31.13,
"learning_rate": 0.0002,
"loss": 1.3003,
"step": 11520
},
{
"epoch": 31.13,
"eval_loss": 1.3345474004745483,
"eval_runtime": 100.6956,
"eval_samples_per_second": 55.603,
"eval_steps_per_second": 0.874,
"step": 11520
},
{
"epoch": 31.46,
"learning_rate": 0.0002,
"loss": 1.2865,
"step": 11640
},
{
"epoch": 31.78,
"learning_rate": 0.0002,
"loss": 1.2928,
"step": 11760
},
{
"epoch": 31.78,
"eval_loss": 1.337437629699707,
"eval_runtime": 97.2235,
"eval_samples_per_second": 57.589,
"eval_steps_per_second": 0.905,
"step": 11760
},
{
"epoch": 32.11,
"learning_rate": 0.0002,
"loss": 1.2981,
"step": 11880
},
{
"epoch": 32.43,
"learning_rate": 0.0002,
"loss": 1.2847,
"step": 12000
},
{
"epoch": 32.43,
"eval_loss": 1.3236644268035889,
"eval_runtime": 97.4026,
"eval_samples_per_second": 57.483,
"eval_steps_per_second": 0.903,
"step": 12000
},
{
"epoch": 32.75,
"learning_rate": 0.0002,
"loss": 1.2871,
"step": 12120
},
{
"epoch": 33.08,
"learning_rate": 0.0002,
"loss": 1.2966,
"step": 12240
},
{
"epoch": 33.08,
"eval_loss": 1.332656741142273,
"eval_runtime": 97.3643,
"eval_samples_per_second": 57.506,
"eval_steps_per_second": 0.904,
"step": 12240
},
{
"epoch": 33.4,
"learning_rate": 0.0002,
"loss": 1.2789,
"step": 12360
},
{
"epoch": 33.73,
"learning_rate": 0.0002,
"loss": 1.2829,
"step": 12480
},
{
"epoch": 33.73,
"eval_loss": 1.3252918720245361,
"eval_runtime": 104.7279,
"eval_samples_per_second": 53.462,
"eval_steps_per_second": 0.84,
"step": 12480
},
{
"epoch": 34.05,
"learning_rate": 0.0002,
"loss": 1.2926,
"step": 12600
},
{
"epoch": 34.38,
"learning_rate": 0.0002,
"loss": 1.2756,
"step": 12720
},
{
"epoch": 34.38,
"eval_loss": 1.326663613319397,
"eval_runtime": 98.2526,
"eval_samples_per_second": 56.986,
"eval_steps_per_second": 0.896,
"step": 12720
},
{
"epoch": 34.7,
"learning_rate": 0.0002,
"loss": 1.2801,
"step": 12840
},
{
"epoch": 35.03,
"learning_rate": 0.0002,
"loss": 1.2919,
"step": 12960
},
{
"epoch": 35.03,
"eval_loss": 1.3183717727661133,
"eval_runtime": 99.1376,
"eval_samples_per_second": 56.477,
"eval_steps_per_second": 0.888,
"step": 12960
}
],
"max_steps": 14000,
"num_train_epochs": 38,
"total_flos": 1.7505797492048026e+18,
"trial_name": null,
"trial_params": null
}