{ "best_metric": 2.009188652038574, "best_model_checkpoint": "ckpts/sft_gemma-2b/checkpoint-1680", "epoch": 8.865435356200528, "eval_steps": 20, "global_step": 1680, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.10554089709762533, "grad_norm": 6.84375, "learning_rate": 4.000000000000001e-06, "loss": 2.4965, "step": 20 }, { "epoch": 0.10554089709762533, "eval_loss": 2.404269218444824, "eval_runtime": 8.1633, "eval_samples_per_second": 24.5, "eval_steps_per_second": 6.125, "step": 20 }, { "epoch": 0.21108179419525067, "grad_norm": 3.0, "learning_rate": 8.000000000000001e-06, "loss": 2.2807, "step": 40 }, { "epoch": 0.21108179419525067, "eval_loss": 2.1983509063720703, "eval_runtime": 7.9364, "eval_samples_per_second": 25.2, "eval_steps_per_second": 6.3, "step": 40 }, { "epoch": 0.316622691292876, "grad_norm": 3.0, "learning_rate": 1e-05, "loss": 2.1723, "step": 60 }, { "epoch": 0.316622691292876, "eval_loss": 2.157801628112793, "eval_runtime": 8.1488, "eval_samples_per_second": 24.543, "eval_steps_per_second": 6.136, "step": 60 }, { "epoch": 0.42216358839050133, "grad_norm": 2.84375, "learning_rate": 1e-05, "loss": 2.0888, "step": 80 }, { "epoch": 0.42216358839050133, "eval_loss": 2.1461212635040283, "eval_runtime": 8.0327, "eval_samples_per_second": 24.898, "eval_steps_per_second": 6.225, "step": 80 }, { "epoch": 0.5277044854881267, "grad_norm": 2.890625, "learning_rate": 1e-05, "loss": 2.1187, "step": 100 }, { "epoch": 0.5277044854881267, "eval_loss": 2.140232801437378, "eval_runtime": 8.0366, "eval_samples_per_second": 24.886, "eval_steps_per_second": 6.222, "step": 100 }, { "epoch": 0.633245382585752, "grad_norm": 2.9375, "learning_rate": 1e-05, "loss": 2.1293, "step": 120 }, { "epoch": 0.633245382585752, "eval_loss": 2.135404109954834, "eval_runtime": 8.1159, "eval_samples_per_second": 24.643, "eval_steps_per_second": 6.161, "step": 120 }, { "epoch": 0.7387862796833773, "grad_norm": 2.640625, "learning_rate": 1e-05, "loss": 2.1351, "step": 140 }, { "epoch": 0.7387862796833773, "eval_loss": 2.1313905715942383, "eval_runtime": 7.9204, "eval_samples_per_second": 25.251, "eval_steps_per_second": 6.313, "step": 140 }, { "epoch": 0.8443271767810027, "grad_norm": 2.8125, "learning_rate": 1e-05, "loss": 2.1204, "step": 160 }, { "epoch": 0.8443271767810027, "eval_loss": 2.1264946460723877, "eval_runtime": 8.1968, "eval_samples_per_second": 24.4, "eval_steps_per_second": 6.1, "step": 160 }, { "epoch": 0.9498680738786279, "grad_norm": 2.953125, "learning_rate": 1e-05, "loss": 2.0984, "step": 180 }, { "epoch": 0.9498680738786279, "eval_loss": 2.123286247253418, "eval_runtime": 8.0328, "eval_samples_per_second": 24.898, "eval_steps_per_second": 6.225, "step": 180 }, { "epoch": 1.0554089709762533, "grad_norm": 3.0, "learning_rate": 1e-05, "loss": 2.1008, "step": 200 }, { "epoch": 1.0554089709762533, "eval_loss": 2.1210150718688965, "eval_runtime": 7.8913, "eval_samples_per_second": 25.344, "eval_steps_per_second": 6.336, "step": 200 }, { "epoch": 1.1609498680738786, "grad_norm": 2.9375, "learning_rate": 1e-05, "loss": 2.0771, "step": 220 }, { "epoch": 1.1609498680738786, "eval_loss": 2.118210792541504, "eval_runtime": 8.2706, "eval_samples_per_second": 24.182, "eval_steps_per_second": 6.045, "step": 220 }, { "epoch": 1.266490765171504, "grad_norm": 2.71875, "learning_rate": 1e-05, "loss": 2.0659, "step": 240 }, { "epoch": 1.266490765171504, "eval_loss": 2.1160335540771484, "eval_runtime": 8.1186, "eval_samples_per_second": 24.635, "eval_steps_per_second": 6.159, "step": 240 }, { "epoch": 1.3720316622691293, "grad_norm": 3.390625, "learning_rate": 1e-05, "loss": 2.0616, "step": 260 }, { "epoch": 1.3720316622691293, "eval_loss": 2.113948106765747, "eval_runtime": 8.3043, "eval_samples_per_second": 24.084, "eval_steps_per_second": 6.021, "step": 260 }, { "epoch": 1.4775725593667546, "grad_norm": 2.875, "learning_rate": 1e-05, "loss": 2.1086, "step": 280 }, { "epoch": 1.4775725593667546, "eval_loss": 2.1105477809906006, "eval_runtime": 8.3509, "eval_samples_per_second": 23.95, "eval_steps_per_second": 5.987, "step": 280 }, { "epoch": 1.58311345646438, "grad_norm": 2.625, "learning_rate": 1e-05, "loss": 2.0473, "step": 300 }, { "epoch": 1.58311345646438, "eval_loss": 2.1075851917266846, "eval_runtime": 8.3203, "eval_samples_per_second": 24.037, "eval_steps_per_second": 6.009, "step": 300 }, { "epoch": 1.6886543535620053, "grad_norm": 2.71875, "learning_rate": 1e-05, "loss": 2.0455, "step": 320 }, { "epoch": 1.6886543535620053, "eval_loss": 2.1052379608154297, "eval_runtime": 8.4112, "eval_samples_per_second": 23.778, "eval_steps_per_second": 5.944, "step": 320 }, { "epoch": 1.7941952506596306, "grad_norm": 3.125, "learning_rate": 1e-05, "loss": 2.0664, "step": 340 }, { "epoch": 1.7941952506596306, "eval_loss": 2.102696180343628, "eval_runtime": 8.1772, "eval_samples_per_second": 24.458, "eval_steps_per_second": 6.115, "step": 340 }, { "epoch": 1.899736147757256, "grad_norm": 2.75, "learning_rate": 1e-05, "loss": 2.0559, "step": 360 }, { "epoch": 1.899736147757256, "eval_loss": 2.100424289703369, "eval_runtime": 8.2651, "eval_samples_per_second": 24.198, "eval_steps_per_second": 6.05, "step": 360 }, { "epoch": 2.005277044854881, "grad_norm": 3.125, "learning_rate": 1e-05, "loss": 2.0638, "step": 380 }, { "epoch": 2.005277044854881, "eval_loss": 2.0989837646484375, "eval_runtime": 8.3491, "eval_samples_per_second": 23.955, "eval_steps_per_second": 5.989, "step": 380 }, { "epoch": 2.1108179419525066, "grad_norm": 2.890625, "learning_rate": 1e-05, "loss": 2.0455, "step": 400 }, { "epoch": 2.1108179419525066, "eval_loss": 2.097106456756592, "eval_runtime": 8.261, "eval_samples_per_second": 24.21, "eval_steps_per_second": 6.053, "step": 400 }, { "epoch": 2.216358839050132, "grad_norm": 2.8125, "learning_rate": 1e-05, "loss": 2.0114, "step": 420 }, { "epoch": 2.216358839050132, "eval_loss": 2.095724582672119, "eval_runtime": 8.1933, "eval_samples_per_second": 24.41, "eval_steps_per_second": 6.103, "step": 420 }, { "epoch": 2.321899736147757, "grad_norm": 2.828125, "learning_rate": 1e-05, "loss": 2.0263, "step": 440 }, { "epoch": 2.321899736147757, "eval_loss": 2.0944039821624756, "eval_runtime": 8.3678, "eval_samples_per_second": 23.901, "eval_steps_per_second": 5.975, "step": 440 }, { "epoch": 2.4274406332453826, "grad_norm": 2.9375, "learning_rate": 1e-05, "loss": 2.0127, "step": 460 }, { "epoch": 2.4274406332453826, "eval_loss": 2.0919580459594727, "eval_runtime": 8.3065, "eval_samples_per_second": 24.078, "eval_steps_per_second": 6.019, "step": 460 }, { "epoch": 2.532981530343008, "grad_norm": 2.484375, "learning_rate": 1e-05, "loss": 1.9744, "step": 480 }, { "epoch": 2.532981530343008, "eval_loss": 2.087993860244751, "eval_runtime": 8.3136, "eval_samples_per_second": 24.057, "eval_steps_per_second": 6.014, "step": 480 }, { "epoch": 2.638522427440633, "grad_norm": 2.984375, "learning_rate": 1e-05, "loss": 2.0236, "step": 500 }, { "epoch": 2.638522427440633, "eval_loss": 2.086052656173706, "eval_runtime": 8.3652, "eval_samples_per_second": 23.908, "eval_steps_per_second": 5.977, "step": 500 }, { "epoch": 2.7440633245382586, "grad_norm": 2.671875, "learning_rate": 1e-05, "loss": 2.0146, "step": 520 }, { "epoch": 2.7440633245382586, "eval_loss": 2.08500075340271, "eval_runtime": 8.387, "eval_samples_per_second": 23.846, "eval_steps_per_second": 5.962, "step": 520 }, { "epoch": 2.849604221635884, "grad_norm": 2.734375, "learning_rate": 1e-05, "loss": 2.0086, "step": 540 }, { "epoch": 2.849604221635884, "eval_loss": 2.0832457542419434, "eval_runtime": 8.2202, "eval_samples_per_second": 24.33, "eval_steps_per_second": 6.083, "step": 540 }, { "epoch": 2.955145118733509, "grad_norm": 3.21875, "learning_rate": 1e-05, "loss": 2.0381, "step": 560 }, { "epoch": 2.955145118733509, "eval_loss": 2.0816333293914795, "eval_runtime": 8.1766, "eval_samples_per_second": 24.46, "eval_steps_per_second": 6.115, "step": 560 }, { "epoch": 3.0606860158311346, "grad_norm": 2.8125, "learning_rate": 1e-05, "loss": 1.9999, "step": 580 }, { "epoch": 3.0606860158311346, "eval_loss": 2.0814907550811768, "eval_runtime": 8.1023, "eval_samples_per_second": 24.684, "eval_steps_per_second": 6.171, "step": 580 }, { "epoch": 3.16622691292876, "grad_norm": 2.953125, "learning_rate": 1e-05, "loss": 1.9754, "step": 600 }, { "epoch": 3.16622691292876, "eval_loss": 2.0809905529022217, "eval_runtime": 8.2702, "eval_samples_per_second": 24.183, "eval_steps_per_second": 6.046, "step": 600 }, { "epoch": 3.271767810026385, "grad_norm": 3.09375, "learning_rate": 1e-05, "loss": 1.9742, "step": 620 }, { "epoch": 3.271767810026385, "eval_loss": 2.0800254344940186, "eval_runtime": 8.3829, "eval_samples_per_second": 23.858, "eval_steps_per_second": 5.965, "step": 620 }, { "epoch": 3.3773087071240107, "grad_norm": 3.375, "learning_rate": 1e-05, "loss": 1.9646, "step": 640 }, { "epoch": 3.3773087071240107, "eval_loss": 2.078634738922119, "eval_runtime": 8.1957, "eval_samples_per_second": 24.403, "eval_steps_per_second": 6.101, "step": 640 }, { "epoch": 3.4828496042216357, "grad_norm": 2.765625, "learning_rate": 1e-05, "loss": 1.9785, "step": 660 }, { "epoch": 3.4828496042216357, "eval_loss": 2.075782537460327, "eval_runtime": 8.2605, "eval_samples_per_second": 24.211, "eval_steps_per_second": 6.053, "step": 660 }, { "epoch": 3.588390501319261, "grad_norm": 4.21875, "learning_rate": 1e-05, "loss": 1.9755, "step": 680 }, { "epoch": 3.588390501319261, "eval_loss": 2.0737786293029785, "eval_runtime": 8.3746, "eval_samples_per_second": 23.882, "eval_steps_per_second": 5.97, "step": 680 }, { "epoch": 3.6939313984168867, "grad_norm": 2.765625, "learning_rate": 1e-05, "loss": 1.9667, "step": 700 }, { "epoch": 3.6939313984168867, "eval_loss": 2.0726418495178223, "eval_runtime": 8.2601, "eval_samples_per_second": 24.213, "eval_steps_per_second": 6.053, "step": 700 }, { "epoch": 3.7994722955145117, "grad_norm": 2.828125, "learning_rate": 1e-05, "loss": 1.9623, "step": 720 }, { "epoch": 3.7994722955145117, "eval_loss": 2.070995330810547, "eval_runtime": 8.3051, "eval_samples_per_second": 24.081, "eval_steps_per_second": 6.02, "step": 720 }, { "epoch": 3.905013192612137, "grad_norm": 3.03125, "learning_rate": 1e-05, "loss": 1.9702, "step": 740 }, { "epoch": 3.905013192612137, "eval_loss": 2.068690776824951, "eval_runtime": 8.3707, "eval_samples_per_second": 23.893, "eval_steps_per_second": 5.973, "step": 740 }, { "epoch": 4.010554089709762, "grad_norm": 2.96875, "learning_rate": 1e-05, "loss": 1.9795, "step": 760 }, { "epoch": 4.010554089709762, "eval_loss": 2.0664331912994385, "eval_runtime": 9.1952, "eval_samples_per_second": 21.75, "eval_steps_per_second": 5.438, "step": 760 }, { "epoch": 4.116094986807388, "grad_norm": 3.0, "learning_rate": 1e-05, "loss": 1.9469, "step": 780 }, { "epoch": 4.116094986807388, "eval_loss": 2.0662195682525635, "eval_runtime": 8.2606, "eval_samples_per_second": 24.211, "eval_steps_per_second": 6.053, "step": 780 }, { "epoch": 4.221635883905013, "grad_norm": 2.984375, "learning_rate": 1e-05, "loss": 1.9415, "step": 800 }, { "epoch": 4.221635883905013, "eval_loss": 2.0639867782592773, "eval_runtime": 8.227, "eval_samples_per_second": 24.31, "eval_steps_per_second": 6.078, "step": 800 }, { "epoch": 4.327176781002638, "grad_norm": 2.78125, "learning_rate": 1e-05, "loss": 1.9574, "step": 820 }, { "epoch": 4.327176781002638, "eval_loss": 2.062091588973999, "eval_runtime": 8.2262, "eval_samples_per_second": 24.313, "eval_steps_per_second": 6.078, "step": 820 }, { "epoch": 4.432717678100264, "grad_norm": 3.203125, "learning_rate": 1e-05, "loss": 1.9202, "step": 840 }, { "epoch": 4.432717678100264, "eval_loss": 2.0608723163604736, "eval_runtime": 8.2161, "eval_samples_per_second": 24.343, "eval_steps_per_second": 6.086, "step": 840 }, { "epoch": 4.538258575197889, "grad_norm": 2.96875, "learning_rate": 1e-05, "loss": 1.9302, "step": 860 }, { "epoch": 4.538258575197889, "eval_loss": 2.058367967605591, "eval_runtime": 8.2827, "eval_samples_per_second": 24.147, "eval_steps_per_second": 6.037, "step": 860 }, { "epoch": 4.643799472295514, "grad_norm": 3.109375, "learning_rate": 1e-05, "loss": 1.9112, "step": 880 }, { "epoch": 4.643799472295514, "eval_loss": 2.058866500854492, "eval_runtime": 8.2412, "eval_samples_per_second": 24.268, "eval_steps_per_second": 6.067, "step": 880 }, { "epoch": 4.74934036939314, "grad_norm": 2.890625, "learning_rate": 1e-05, "loss": 1.9127, "step": 900 }, { "epoch": 4.74934036939314, "eval_loss": 2.0560219287872314, "eval_runtime": 8.2213, "eval_samples_per_second": 24.327, "eval_steps_per_second": 6.082, "step": 900 }, { "epoch": 4.854881266490765, "grad_norm": 2.953125, "learning_rate": 1e-05, "loss": 1.899, "step": 920 }, { "epoch": 4.854881266490765, "eval_loss": 2.054474353790283, "eval_runtime": 8.0902, "eval_samples_per_second": 24.721, "eval_steps_per_second": 6.18, "step": 920 }, { "epoch": 4.96042216358839, "grad_norm": 2.90625, "learning_rate": 1e-05, "loss": 1.9248, "step": 940 }, { "epoch": 4.96042216358839, "eval_loss": 2.052360773086548, "eval_runtime": 8.1899, "eval_samples_per_second": 24.42, "eval_steps_per_second": 6.105, "step": 940 }, { "epoch": 5.065963060686016, "grad_norm": 2.90625, "learning_rate": 1e-05, "loss": 1.8878, "step": 960 }, { "epoch": 5.065963060686016, "eval_loss": 2.0526058673858643, "eval_runtime": 8.0873, "eval_samples_per_second": 24.73, "eval_steps_per_second": 6.183, "step": 960 }, { "epoch": 5.171503957783641, "grad_norm": 3.234375, "learning_rate": 1e-05, "loss": 1.8789, "step": 980 }, { "epoch": 5.171503957783641, "eval_loss": 2.0522069931030273, "eval_runtime": 8.0535, "eval_samples_per_second": 24.834, "eval_steps_per_second": 6.208, "step": 980 }, { "epoch": 5.277044854881266, "grad_norm": 3.015625, "learning_rate": 1e-05, "loss": 1.8908, "step": 1000 }, { "epoch": 5.277044854881266, "eval_loss": 2.051866054534912, "eval_runtime": 7.9907, "eval_samples_per_second": 25.029, "eval_steps_per_second": 6.257, "step": 1000 }, { "epoch": 5.382585751978892, "grad_norm": 3.046875, "learning_rate": 1e-05, "loss": 1.8944, "step": 1020 }, { "epoch": 5.382585751978892, "eval_loss": 2.0503861904144287, "eval_runtime": 7.927, "eval_samples_per_second": 25.23, "eval_steps_per_second": 6.308, "step": 1020 }, { "epoch": 5.488126649076517, "grad_norm": 4.15625, "learning_rate": 1e-05, "loss": 1.8867, "step": 1040 }, { "epoch": 5.488126649076517, "eval_loss": 2.0466654300689697, "eval_runtime": 8.3776, "eval_samples_per_second": 23.873, "eval_steps_per_second": 5.968, "step": 1040 }, { "epoch": 5.593667546174142, "grad_norm": 3.0, "learning_rate": 1e-05, "loss": 1.8764, "step": 1060 }, { "epoch": 5.593667546174142, "eval_loss": 2.0448718070983887, "eval_runtime": 8.2526, "eval_samples_per_second": 24.235, "eval_steps_per_second": 6.059, "step": 1060 }, { "epoch": 5.699208443271768, "grad_norm": 3.140625, "learning_rate": 1e-05, "loss": 1.9082, "step": 1080 }, { "epoch": 5.699208443271768, "eval_loss": 2.0424439907073975, "eval_runtime": 8.0973, "eval_samples_per_second": 24.7, "eval_steps_per_second": 6.175, "step": 1080 }, { "epoch": 5.804749340369393, "grad_norm": 3.296875, "learning_rate": 1e-05, "loss": 1.8782, "step": 1100 }, { "epoch": 5.804749340369393, "eval_loss": 2.0422487258911133, "eval_runtime": 8.3197, "eval_samples_per_second": 24.039, "eval_steps_per_second": 6.01, "step": 1100 }, { "epoch": 5.910290237467018, "grad_norm": 2.984375, "learning_rate": 1e-05, "loss": 1.8394, "step": 1120 }, { "epoch": 5.910290237467018, "eval_loss": 2.0410642623901367, "eval_runtime": 8.2994, "eval_samples_per_second": 24.098, "eval_steps_per_second": 6.025, "step": 1120 }, { "epoch": 6.015831134564644, "grad_norm": 4.1875, "learning_rate": 1e-05, "loss": 1.864, "step": 1140 }, { "epoch": 6.015831134564644, "eval_loss": 2.039353370666504, "eval_runtime": 8.2994, "eval_samples_per_second": 24.098, "eval_steps_per_second": 6.025, "step": 1140 }, { "epoch": 6.121372031662269, "grad_norm": 3.265625, "learning_rate": 1e-05, "loss": 1.8246, "step": 1160 }, { "epoch": 6.121372031662269, "eval_loss": 2.042710304260254, "eval_runtime": 8.5324, "eval_samples_per_second": 23.44, "eval_steps_per_second": 5.86, "step": 1160 }, { "epoch": 6.226912928759894, "grad_norm": 3.3125, "learning_rate": 1e-05, "loss": 1.8343, "step": 1180 }, { "epoch": 6.226912928759894, "eval_loss": 2.0403542518615723, "eval_runtime": 8.2652, "eval_samples_per_second": 24.198, "eval_steps_per_second": 6.049, "step": 1180 }, { "epoch": 6.33245382585752, "grad_norm": 3.984375, "learning_rate": 1e-05, "loss": 1.8541, "step": 1200 }, { "epoch": 6.33245382585752, "eval_loss": 2.03813099861145, "eval_runtime": 8.4385, "eval_samples_per_second": 23.701, "eval_steps_per_second": 5.925, "step": 1200 }, { "epoch": 6.437994722955145, "grad_norm": 3.203125, "learning_rate": 1e-05, "loss": 1.8182, "step": 1220 }, { "epoch": 6.437994722955145, "eval_loss": 2.038771629333496, "eval_runtime": 8.3561, "eval_samples_per_second": 23.934, "eval_steps_per_second": 5.984, "step": 1220 }, { "epoch": 6.54353562005277, "grad_norm": 3.125, "learning_rate": 1e-05, "loss": 1.8427, "step": 1240 }, { "epoch": 6.54353562005277, "eval_loss": 2.0339856147766113, "eval_runtime": 8.3288, "eval_samples_per_second": 24.013, "eval_steps_per_second": 6.003, "step": 1240 }, { "epoch": 6.649076517150396, "grad_norm": 3.15625, "learning_rate": 1e-05, "loss": 1.8289, "step": 1260 }, { "epoch": 6.649076517150396, "eval_loss": 2.035248041152954, "eval_runtime": 8.281, "eval_samples_per_second": 24.152, "eval_steps_per_second": 6.038, "step": 1260 }, { "epoch": 6.754617414248021, "grad_norm": 3.4375, "learning_rate": 1e-05, "loss": 1.8415, "step": 1280 }, { "epoch": 6.754617414248021, "eval_loss": 2.031825304031372, "eval_runtime": 8.3052, "eval_samples_per_second": 24.081, "eval_steps_per_second": 6.02, "step": 1280 }, { "epoch": 6.860158311345646, "grad_norm": 3.453125, "learning_rate": 1e-05, "loss": 1.8357, "step": 1300 }, { "epoch": 6.860158311345646, "eval_loss": 2.028428316116333, "eval_runtime": 8.3001, "eval_samples_per_second": 24.096, "eval_steps_per_second": 6.024, "step": 1300 }, { "epoch": 6.965699208443271, "grad_norm": 3.140625, "learning_rate": 1e-05, "loss": 1.8324, "step": 1320 }, { "epoch": 6.965699208443271, "eval_loss": 2.0289885997772217, "eval_runtime": 8.1618, "eval_samples_per_second": 24.504, "eval_steps_per_second": 6.126, "step": 1320 }, { "epoch": 7.071240105540897, "grad_norm": 3.5625, "learning_rate": 1e-05, "loss": 1.8069, "step": 1340 }, { "epoch": 7.071240105540897, "eval_loss": 2.0348060131073, "eval_runtime": 8.1951, "eval_samples_per_second": 24.405, "eval_steps_per_second": 6.101, "step": 1340 }, { "epoch": 7.176781002638522, "grad_norm": 3.375, "learning_rate": 1e-05, "loss": 1.8152, "step": 1360 }, { "epoch": 7.176781002638522, "eval_loss": 2.0321884155273438, "eval_runtime": 8.2785, "eval_samples_per_second": 24.159, "eval_steps_per_second": 6.04, "step": 1360 }, { "epoch": 7.282321899736147, "grad_norm": 3.234375, "learning_rate": 1e-05, "loss": 1.7871, "step": 1380 }, { "epoch": 7.282321899736147, "eval_loss": 2.0307512283325195, "eval_runtime": 8.0505, "eval_samples_per_second": 24.843, "eval_steps_per_second": 6.211, "step": 1380 }, { "epoch": 7.387862796833773, "grad_norm": 3.59375, "learning_rate": 1e-05, "loss": 1.7871, "step": 1400 }, { "epoch": 7.387862796833773, "eval_loss": 2.0273208618164062, "eval_runtime": 8.1896, "eval_samples_per_second": 24.421, "eval_steps_per_second": 6.105, "step": 1400 }, { "epoch": 7.493403693931398, "grad_norm": 3.328125, "learning_rate": 1e-05, "loss": 1.8076, "step": 1420 }, { "epoch": 7.493403693931398, "eval_loss": 2.0257158279418945, "eval_runtime": 7.9266, "eval_samples_per_second": 25.232, "eval_steps_per_second": 6.308, "step": 1420 }, { "epoch": 7.598944591029023, "grad_norm": 3.28125, "learning_rate": 1e-05, "loss": 1.7753, "step": 1440 }, { "epoch": 7.598944591029023, "eval_loss": 2.026719570159912, "eval_runtime": 7.8566, "eval_samples_per_second": 25.456, "eval_steps_per_second": 6.364, "step": 1440 }, { "epoch": 7.704485488126649, "grad_norm": 3.453125, "learning_rate": 1e-05, "loss": 1.761, "step": 1460 }, { "epoch": 7.704485488126649, "eval_loss": 2.022343397140503, "eval_runtime": 8.1505, "eval_samples_per_second": 24.538, "eval_steps_per_second": 6.135, "step": 1460 }, { "epoch": 7.810026385224274, "grad_norm": 3.234375, "learning_rate": 1e-05, "loss": 1.7837, "step": 1480 }, { "epoch": 7.810026385224274, "eval_loss": 2.0227696895599365, "eval_runtime": 7.9021, "eval_samples_per_second": 25.31, "eval_steps_per_second": 6.327, "step": 1480 }, { "epoch": 7.915567282321899, "grad_norm": 3.5625, "learning_rate": 1e-05, "loss": 1.7809, "step": 1500 }, { "epoch": 7.915567282321899, "eval_loss": 2.0224175453186035, "eval_runtime": 8.146, "eval_samples_per_second": 24.552, "eval_steps_per_second": 6.138, "step": 1500 }, { "epoch": 8.021108179419524, "grad_norm": 3.59375, "learning_rate": 1e-05, "loss": 1.779, "step": 1520 }, { "epoch": 8.021108179419524, "eval_loss": 2.0209100246429443, "eval_runtime": 8.392, "eval_samples_per_second": 23.832, "eval_steps_per_second": 5.958, "step": 1520 }, { "epoch": 8.12664907651715, "grad_norm": 3.109375, "learning_rate": 1e-05, "loss": 1.7353, "step": 1540 }, { "epoch": 8.12664907651715, "eval_loss": 2.0220282077789307, "eval_runtime": 8.6161, "eval_samples_per_second": 23.212, "eval_steps_per_second": 5.803, "step": 1540 }, { "epoch": 8.232189973614776, "grad_norm": 3.5625, "learning_rate": 1e-05, "loss": 1.7363, "step": 1560 }, { "epoch": 8.232189973614776, "eval_loss": 2.0166220664978027, "eval_runtime": 8.2719, "eval_samples_per_second": 24.178, "eval_steps_per_second": 6.045, "step": 1560 }, { "epoch": 8.3377308707124, "grad_norm": 3.71875, "learning_rate": 1e-05, "loss": 1.7511, "step": 1580 }, { "epoch": 8.3377308707124, "eval_loss": 2.01631236076355, "eval_runtime": 8.2537, "eval_samples_per_second": 24.232, "eval_steps_per_second": 6.058, "step": 1580 }, { "epoch": 8.443271767810026, "grad_norm": 3.828125, "learning_rate": 1e-05, "loss": 1.767, "step": 1600 }, { "epoch": 8.443271767810026, "eval_loss": 2.016242265701294, "eval_runtime": 8.1762, "eval_samples_per_second": 24.461, "eval_steps_per_second": 6.115, "step": 1600 }, { "epoch": 8.548812664907652, "grad_norm": 4.4375, "learning_rate": 1e-05, "loss": 1.6945, "step": 1620 }, { "epoch": 8.548812664907652, "eval_loss": 2.019789218902588, "eval_runtime": 8.4823, "eval_samples_per_second": 23.579, "eval_steps_per_second": 5.895, "step": 1620 }, { "epoch": 8.654353562005277, "grad_norm": 4.5625, "learning_rate": 1e-05, "loss": 1.7087, "step": 1640 }, { "epoch": 8.654353562005277, "eval_loss": 2.0212345123291016, "eval_runtime": 8.1335, "eval_samples_per_second": 24.59, "eval_steps_per_second": 6.147, "step": 1640 }, { "epoch": 8.759894459102902, "grad_norm": 3.921875, "learning_rate": 1e-05, "loss": 1.7702, "step": 1660 }, { "epoch": 8.759894459102902, "eval_loss": 2.0104410648345947, "eval_runtime": 8.0047, "eval_samples_per_second": 24.985, "eval_steps_per_second": 6.246, "step": 1660 }, { "epoch": 8.865435356200528, "grad_norm": 4.0625, "learning_rate": 1e-05, "loss": 1.7563, "step": 1680 }, { "epoch": 8.865435356200528, "eval_loss": 2.009188652038574, "eval_runtime": 8.1504, "eval_samples_per_second": 24.539, "eval_steps_per_second": 6.135, "step": 1680 } ], "logging_steps": 20, "max_steps": 9450, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 20, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.6272812961435648e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }