{ "best_metric": null, "best_model_checkpoint": null, "epoch": 35.026963262554766, "global_step": 12960, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.32, "learning_rate": 8e-05, "loss": 1.9241, "step": 120 }, { "epoch": 0.65, "learning_rate": 0.00016, "loss": 1.8026, "step": 240 }, { "epoch": 0.65, "eval_loss": 1.7006735801696777, "eval_runtime": 120.5725, "eval_samples_per_second": 46.437, "eval_steps_per_second": 0.73, "step": 240 }, { "epoch": 0.97, "learning_rate": 0.0002, "loss": 1.7588, "step": 360 }, { "epoch": 1.3, "learning_rate": 0.0002, "loss": 1.7242, "step": 480 }, { "epoch": 1.3, "eval_loss": 1.6368365287780762, "eval_runtime": 123.5326, "eval_samples_per_second": 45.324, "eval_steps_per_second": 0.712, "step": 480 }, { "epoch": 1.62, "learning_rate": 0.0002, "loss": 1.6797, "step": 600 }, { "epoch": 1.94, "learning_rate": 0.0002, "loss": 1.6544, "step": 720 }, { "epoch": 1.94, "eval_loss": 1.589858889579773, "eval_runtime": 121.9204, "eval_samples_per_second": 45.923, "eval_steps_per_second": 0.722, "step": 720 }, { "epoch": 2.27, "learning_rate": 0.0002, "loss": 1.639, "step": 840 }, { "epoch": 2.59, "learning_rate": 0.0002, "loss": 1.6103, "step": 960 }, { "epoch": 2.59, "eval_loss": 1.559193730354309, "eval_runtime": 118.3836, "eval_samples_per_second": 47.295, "eval_steps_per_second": 0.743, "step": 960 }, { "epoch": 2.92, "learning_rate": 0.0002, "loss": 1.5982, "step": 1080 }, { "epoch": 3.24, "learning_rate": 0.0002, "loss": 1.5858, "step": 1200 }, { "epoch": 3.24, "eval_loss": 1.5362491607666016, "eval_runtime": 123.4422, "eval_samples_per_second": 45.357, "eval_steps_per_second": 0.713, "step": 1200 }, { "epoch": 3.57, "learning_rate": 0.0002, "loss": 1.5684, "step": 1320 }, { "epoch": 3.89, "learning_rate": 0.0002, "loss": 1.5566, "step": 1440 }, { "epoch": 3.89, "eval_loss": 1.51528799533844, "eval_runtime": 120.2858, "eval_samples_per_second": 46.547, "eval_steps_per_second": 0.732, "step": 1440 }, { "epoch": 4.22, "learning_rate": 0.0002, "loss": 1.5593, "step": 1560 }, { "epoch": 4.54, "learning_rate": 0.0002, "loss": 1.5322, "step": 1680 }, { "epoch": 4.54, "eval_loss": 1.5114836692810059, "eval_runtime": 117.8482, "eval_samples_per_second": 47.51, "eval_steps_per_second": 0.747, "step": 1680 }, { "epoch": 4.86, "learning_rate": 0.0002, "loss": 1.5285, "step": 1800 }, { "epoch": 5.19, "learning_rate": 0.0002, "loss": 1.5359, "step": 1920 }, { "epoch": 5.19, "eval_loss": 1.48625910282135, "eval_runtime": 123.7493, "eval_samples_per_second": 45.245, "eval_steps_per_second": 0.711, "step": 1920 }, { "epoch": 5.51, "learning_rate": 0.0002, "loss": 1.5207, "step": 2040 }, { "epoch": 5.84, "learning_rate": 0.0002, "loss": 1.5079, "step": 2160 }, { "epoch": 5.84, "eval_loss": 1.4822603464126587, "eval_runtime": 119.2091, "eval_samples_per_second": 46.968, "eval_steps_per_second": 0.738, "step": 2160 }, { "epoch": 6.16, "learning_rate": 0.0002, "loss": 1.51, "step": 2280 }, { "epoch": 6.49, "learning_rate": 0.0002, "loss": 1.4909, "step": 2400 }, { "epoch": 6.49, "eval_loss": 1.4646539688110352, "eval_runtime": 122.8427, "eval_samples_per_second": 45.579, "eval_steps_per_second": 0.716, "step": 2400 }, { "epoch": 6.81, "learning_rate": 0.0002, "loss": 1.4869, "step": 2520 }, { "epoch": 7.13, "learning_rate": 0.0002, "loss": 1.4894, "step": 2640 }, { "epoch": 7.13, "eval_loss": 1.4567737579345703, "eval_runtime": 112.4698, "eval_samples_per_second": 49.782, "eval_steps_per_second": 0.782, "step": 2640 }, { "epoch": 7.46, "learning_rate": 0.0002, "loss": 1.4705, "step": 2760 }, { "epoch": 7.78, "learning_rate": 0.0002, "loss": 1.469, "step": 2880 }, { "epoch": 7.78, "eval_loss": 1.447322130203247, "eval_runtime": 124.434, "eval_samples_per_second": 44.996, "eval_steps_per_second": 0.707, "step": 2880 }, { "epoch": 8.11, "learning_rate": 0.0002, "loss": 1.4716, "step": 3000 }, { "epoch": 8.43, "learning_rate": 0.0002, "loss": 1.4525, "step": 3120 }, { "epoch": 8.43, "eval_loss": 1.4480490684509277, "eval_runtime": 120.9825, "eval_samples_per_second": 46.279, "eval_steps_per_second": 0.727, "step": 3120 }, { "epoch": 8.75, "learning_rate": 0.0002, "loss": 1.452, "step": 3240 }, { "epoch": 9.08, "learning_rate": 0.0002, "loss": 1.4552, "step": 3360 }, { "epoch": 9.08, "eval_loss": 1.4297771453857422, "eval_runtime": 119.4349, "eval_samples_per_second": 46.879, "eval_steps_per_second": 0.737, "step": 3360 }, { "epoch": 9.4, "learning_rate": 0.0002, "loss": 1.4369, "step": 3480 }, { "epoch": 9.73, "learning_rate": 0.0002, "loss": 1.4357, "step": 3600 }, { "epoch": 9.73, "eval_loss": 1.4253787994384766, "eval_runtime": 123.7286, "eval_samples_per_second": 45.252, "eval_steps_per_second": 0.711, "step": 3600 }, { "epoch": 10.05, "learning_rate": 0.0002, "loss": 1.4449, "step": 3720 }, { "epoch": 10.38, "learning_rate": 0.0002, "loss": 1.4245, "step": 3840 }, { "epoch": 10.38, "eval_loss": 1.419893741607666, "eval_runtime": 122.5962, "eval_samples_per_second": 45.67, "eval_steps_per_second": 0.718, "step": 3840 }, { "epoch": 10.7, "learning_rate": 0.0002, "loss": 1.4259, "step": 3960 }, { "epoch": 11.03, "learning_rate": 0.0002, "loss": 1.4317, "step": 4080 }, { "epoch": 11.03, "eval_loss": 1.4151264429092407, "eval_runtime": 120.6018, "eval_samples_per_second": 46.426, "eval_steps_per_second": 0.73, "step": 4080 }, { "epoch": 11.35, "learning_rate": 0.0002, "loss": 1.4133, "step": 4200 }, { "epoch": 11.67, "learning_rate": 0.0002, "loss": 1.4119, "step": 4320 }, { "epoch": 11.67, "eval_loss": 1.4069455862045288, "eval_runtime": 123.9031, "eval_samples_per_second": 45.189, "eval_steps_per_second": 0.71, "step": 4320 }, { "epoch": 12.0, "learning_rate": 0.0002, "loss": 1.4096, "step": 4440 }, { "epoch": 12.32, "learning_rate": 0.0002, "loss": 1.4086, "step": 4560 }, { "epoch": 12.32, "eval_loss": 1.4099173545837402, "eval_runtime": 121.1011, "eval_samples_per_second": 46.234, "eval_steps_per_second": 0.727, "step": 4560 }, { "epoch": 12.65, "learning_rate": 0.0002, "loss": 1.4031, "step": 4680 }, { "epoch": 12.97, "learning_rate": 0.0002, "loss": 1.401, "step": 4800 }, { "epoch": 12.97, "eval_loss": 1.4046831130981445, "eval_runtime": 121.8177, "eval_samples_per_second": 45.962, "eval_steps_per_second": 0.722, "step": 4800 }, { "epoch": 13.3, "learning_rate": 0.0002, "loss": 1.4031, "step": 4920 }, { "epoch": 13.62, "learning_rate": 0.0002, "loss": 1.394, "step": 5040 }, { "epoch": 13.62, "eval_loss": 1.401537299156189, "eval_runtime": 121.4356, "eval_samples_per_second": 46.107, "eval_steps_per_second": 0.725, "step": 5040 }, { "epoch": 13.94, "learning_rate": 0.0002, "loss": 1.3922, "step": 5160 }, { "epoch": 14.27, "learning_rate": 0.0002, "loss": 1.3945, "step": 5280 }, { "epoch": 14.27, "eval_loss": 1.3918230533599854, "eval_runtime": 119.2233, "eval_samples_per_second": 46.962, "eval_steps_per_second": 0.738, "step": 5280 }, { "epoch": 14.59, "learning_rate": 0.0002, "loss": 1.3836, "step": 5400 }, { "epoch": 14.92, "learning_rate": 0.0002, "loss": 1.3838, "step": 5520 }, { "epoch": 14.92, "eval_loss": 1.385350227355957, "eval_runtime": 113.4489, "eval_samples_per_second": 49.353, "eval_steps_per_second": 0.776, "step": 5520 }, { "epoch": 15.24, "learning_rate": 0.0002, "loss": 1.387, "step": 5640 }, { "epoch": 15.57, "learning_rate": 0.0002, "loss": 1.3722, "step": 5760 }, { "epoch": 15.57, "eval_loss": 1.379088282585144, "eval_runtime": 116.4932, "eval_samples_per_second": 48.063, "eval_steps_per_second": 0.755, "step": 5760 }, { "epoch": 15.89, "learning_rate": 0.0002, "loss": 1.3757, "step": 5880 }, { "epoch": 16.22, "learning_rate": 0.0002, "loss": 1.3775, "step": 6000 }, { "epoch": 16.22, "eval_loss": 1.384007453918457, "eval_runtime": 115.8099, "eval_samples_per_second": 48.346, "eval_steps_per_second": 0.76, "step": 6000 }, { "epoch": 16.54, "learning_rate": 0.0002, "loss": 1.3683, "step": 6120 }, { "epoch": 16.86, "learning_rate": 0.0002, "loss": 1.3675, "step": 6240 }, { "epoch": 16.86, "eval_loss": 1.3760778903961182, "eval_runtime": 113.2638, "eval_samples_per_second": 49.433, "eval_steps_per_second": 0.777, "step": 6240 }, { "epoch": 17.19, "learning_rate": 0.0002, "loss": 1.375, "step": 6360 }, { "epoch": 17.51, "learning_rate": 0.0002, "loss": 1.358, "step": 6480 }, { "epoch": 17.51, "eval_loss": 1.3729970455169678, "eval_runtime": 119.1962, "eval_samples_per_second": 46.973, "eval_steps_per_second": 0.738, "step": 6480 }, { "epoch": 17.84, "learning_rate": 0.0002, "loss": 1.3617, "step": 6600 }, { "epoch": 18.16, "learning_rate": 0.0002, "loss": 1.3679, "step": 6720 }, { "epoch": 18.16, "eval_loss": 1.3826600313186646, "eval_runtime": 118.9849, "eval_samples_per_second": 47.056, "eval_steps_per_second": 0.74, "step": 6720 }, { "epoch": 18.49, "learning_rate": 0.0002, "loss": 1.3592, "step": 6840 }, { "epoch": 18.81, "learning_rate": 0.0002, "loss": 1.3602, "step": 6960 }, { "epoch": 18.81, "eval_loss": 1.3659363985061646, "eval_runtime": 120.7081, "eval_samples_per_second": 46.385, "eval_steps_per_second": 0.729, "step": 6960 }, { "epoch": 19.13, "learning_rate": 0.0002, "loss": 1.3633, "step": 7080 }, { "epoch": 19.46, "learning_rate": 0.0002, "loss": 1.3522, "step": 7200 }, { "epoch": 19.46, "eval_loss": 1.372406244277954, "eval_runtime": 113.6178, "eval_samples_per_second": 49.279, "eval_steps_per_second": 0.775, "step": 7200 }, { "epoch": 19.78, "learning_rate": 0.0002, "loss": 1.345, "step": 7320 }, { "epoch": 20.11, "learning_rate": 0.0002, "loss": 1.3555, "step": 7440 }, { "epoch": 20.11, "eval_loss": 1.368371844291687, "eval_runtime": 118.9369, "eval_samples_per_second": 47.075, "eval_steps_per_second": 0.74, "step": 7440 }, { "epoch": 20.43, "learning_rate": 0.0002, "loss": 1.3396, "step": 7560 }, { "epoch": 20.75, "learning_rate": 0.0002, "loss": 1.3536, "step": 7680 }, { "epoch": 20.75, "eval_loss": 1.3611598014831543, "eval_runtime": 119.3386, "eval_samples_per_second": 46.917, "eval_steps_per_second": 0.737, "step": 7680 }, { "epoch": 21.08, "learning_rate": 0.0002, "loss": 1.3506, "step": 7800 }, { "epoch": 21.4, "learning_rate": 0.0002, "loss": 1.3347, "step": 7920 }, { "epoch": 21.4, "eval_loss": 1.3598804473876953, "eval_runtime": 114.0961, "eval_samples_per_second": 49.073, "eval_steps_per_second": 0.771, "step": 7920 }, { "epoch": 21.73, "learning_rate": 0.0002, "loss": 1.338, "step": 8040 }, { "epoch": 22.05, "learning_rate": 0.0002, "loss": 1.3463, "step": 8160 }, { "epoch": 22.05, "eval_loss": 1.3614617586135864, "eval_runtime": 121.7757, "eval_samples_per_second": 45.978, "eval_steps_per_second": 0.723, "step": 8160 }, { "epoch": 22.38, "learning_rate": 0.0002, "loss": 1.3305, "step": 8280 }, { "epoch": 22.7, "learning_rate": 0.0002, "loss": 1.3296, "step": 8400 }, { "epoch": 22.7, "eval_loss": 1.359055519104004, "eval_runtime": 113.3148, "eval_samples_per_second": 49.411, "eval_steps_per_second": 0.777, "step": 8400 }, { "epoch": 23.03, "learning_rate": 0.0002, "loss": 1.344, "step": 8520 }, { "epoch": 23.35, "learning_rate": 0.0002, "loss": 1.3201, "step": 8640 }, { "epoch": 23.35, "eval_loss": 1.358960509300232, "eval_runtime": 122.2886, "eval_samples_per_second": 45.785, "eval_steps_per_second": 0.72, "step": 8640 }, { "epoch": 23.67, "learning_rate": 0.0002, "loss": 1.3302, "step": 8760 }, { "epoch": 24.0, "learning_rate": 0.0002, "loss": 1.3292, "step": 8880 }, { "epoch": 24.0, "eval_loss": 1.3509206771850586, "eval_runtime": 99.6058, "eval_samples_per_second": 56.212, "eval_steps_per_second": 0.883, "step": 8880 }, { "epoch": 24.32, "learning_rate": 0.0002, "loss": 1.3294, "step": 9000 }, { "epoch": 24.65, "learning_rate": 0.0002, "loss": 1.3207, "step": 9120 }, { "epoch": 24.65, "eval_loss": 1.357851505279541, "eval_runtime": 105.9073, "eval_samples_per_second": 52.867, "eval_steps_per_second": 0.831, "step": 9120 }, { "epoch": 24.97, "learning_rate": 0.0002, "loss": 1.3215, "step": 9240 }, { "epoch": 25.3, "learning_rate": 0.0002, "loss": 1.3231, "step": 9360 }, { "epoch": 25.3, "eval_loss": 1.3393853902816772, "eval_runtime": 99.7219, "eval_samples_per_second": 56.146, "eval_steps_per_second": 0.882, "step": 9360 }, { "epoch": 25.62, "learning_rate": 0.0002, "loss": 1.3121, "step": 9480 }, { "epoch": 25.94, "learning_rate": 0.0002, "loss": 1.3176, "step": 9600 }, { "epoch": 25.94, "eval_loss": 1.3441215753555298, "eval_runtime": 101.3937, "eval_samples_per_second": 55.22, "eval_steps_per_second": 0.868, "step": 9600 }, { "epoch": 26.27, "learning_rate": 0.0002, "loss": 1.3188, "step": 9720 }, { "epoch": 26.59, "learning_rate": 0.0002, "loss": 1.3103, "step": 9840 }, { "epoch": 26.59, "eval_loss": 1.3429008722305298, "eval_runtime": 100.8116, "eval_samples_per_second": 55.539, "eval_steps_per_second": 0.873, "step": 9840 }, { "epoch": 26.92, "learning_rate": 0.0002, "loss": 1.313, "step": 9960 }, { "epoch": 27.24, "learning_rate": 0.0002, "loss": 1.3156, "step": 10080 }, { "epoch": 27.24, "eval_loss": 1.3400343656539917, "eval_runtime": 98.2948, "eval_samples_per_second": 56.961, "eval_steps_per_second": 0.895, "step": 10080 }, { "epoch": 27.57, "learning_rate": 0.0002, "loss": 1.3064, "step": 10200 }, { "epoch": 27.89, "learning_rate": 0.0002, "loss": 1.306, "step": 10320 }, { "epoch": 27.89, "eval_loss": 1.339460015296936, "eval_runtime": 97.8707, "eval_samples_per_second": 57.208, "eval_steps_per_second": 0.899, "step": 10320 }, { "epoch": 28.22, "learning_rate": 0.0002, "loss": 1.3093, "step": 10440 }, { "epoch": 28.54, "learning_rate": 0.0002, "loss": 1.3026, "step": 10560 }, { "epoch": 28.54, "eval_loss": 1.3380861282348633, "eval_runtime": 99.7827, "eval_samples_per_second": 56.112, "eval_steps_per_second": 0.882, "step": 10560 }, { "epoch": 28.86, "learning_rate": 0.0002, "loss": 1.3014, "step": 10680 }, { "epoch": 29.19, "learning_rate": 0.0002, "loss": 1.3093, "step": 10800 }, { "epoch": 29.19, "eval_loss": 1.335351824760437, "eval_runtime": 99.7514, "eval_samples_per_second": 56.13, "eval_steps_per_second": 0.882, "step": 10800 }, { "epoch": 29.51, "learning_rate": 0.0002, "loss": 1.2954, "step": 10920 }, { "epoch": 29.84, "learning_rate": 0.0002, "loss": 1.2982, "step": 11040 }, { "epoch": 29.84, "eval_loss": 1.33037269115448, "eval_runtime": 111.392, "eval_samples_per_second": 50.264, "eval_steps_per_second": 0.79, "step": 11040 }, { "epoch": 30.16, "learning_rate": 0.0002, "loss": 1.3032, "step": 11160 }, { "epoch": 30.49, "learning_rate": 0.0002, "loss": 1.2927, "step": 11280 }, { "epoch": 30.49, "eval_loss": 1.3423055410385132, "eval_runtime": 110.815, "eval_samples_per_second": 50.526, "eval_steps_per_second": 0.794, "step": 11280 }, { "epoch": 30.81, "learning_rate": 0.0002, "loss": 1.2968, "step": 11400 }, { "epoch": 31.13, "learning_rate": 0.0002, "loss": 1.3003, "step": 11520 }, { "epoch": 31.13, "eval_loss": 1.3345474004745483, "eval_runtime": 100.6956, "eval_samples_per_second": 55.603, "eval_steps_per_second": 0.874, "step": 11520 }, { "epoch": 31.46, "learning_rate": 0.0002, "loss": 1.2865, "step": 11640 }, { "epoch": 31.78, "learning_rate": 0.0002, "loss": 1.2928, "step": 11760 }, { "epoch": 31.78, "eval_loss": 1.337437629699707, "eval_runtime": 97.2235, "eval_samples_per_second": 57.589, "eval_steps_per_second": 0.905, "step": 11760 }, { "epoch": 32.11, "learning_rate": 0.0002, "loss": 1.2981, "step": 11880 }, { "epoch": 32.43, "learning_rate": 0.0002, "loss": 1.2847, "step": 12000 }, { "epoch": 32.43, "eval_loss": 1.3236644268035889, "eval_runtime": 97.4026, "eval_samples_per_second": 57.483, "eval_steps_per_second": 0.903, "step": 12000 }, { "epoch": 32.75, "learning_rate": 0.0002, "loss": 1.2871, "step": 12120 }, { "epoch": 33.08, "learning_rate": 0.0002, "loss": 1.2966, "step": 12240 }, { "epoch": 33.08, "eval_loss": 1.332656741142273, "eval_runtime": 97.3643, "eval_samples_per_second": 57.506, "eval_steps_per_second": 0.904, "step": 12240 }, { "epoch": 33.4, "learning_rate": 0.0002, "loss": 1.2789, "step": 12360 }, { "epoch": 33.73, "learning_rate": 0.0002, "loss": 1.2829, "step": 12480 }, { "epoch": 33.73, "eval_loss": 1.3252918720245361, "eval_runtime": 104.7279, "eval_samples_per_second": 53.462, "eval_steps_per_second": 0.84, "step": 12480 }, { "epoch": 34.05, "learning_rate": 0.0002, "loss": 1.2926, "step": 12600 }, { "epoch": 34.38, "learning_rate": 0.0002, "loss": 1.2756, "step": 12720 }, { "epoch": 34.38, "eval_loss": 1.326663613319397, "eval_runtime": 98.2526, "eval_samples_per_second": 56.986, "eval_steps_per_second": 0.896, "step": 12720 }, { "epoch": 34.7, "learning_rate": 0.0002, "loss": 1.2801, "step": 12840 }, { "epoch": 35.03, "learning_rate": 0.0002, "loss": 1.2919, "step": 12960 }, { "epoch": 35.03, "eval_loss": 1.3183717727661133, "eval_runtime": 99.1376, "eval_samples_per_second": 56.477, "eval_steps_per_second": 0.888, "step": 12960 } ], "max_steps": 14000, "num_train_epochs": 38, "total_flos": 1.7505797492048026e+18, "trial_name": null, "trial_params": null }