{ "best_metric": 0.3915382921695709, "best_model_checkpoint": "mikhail-panzo/fil-ceb_b64_le5_s8000/checkpoint-3000", "epoch": 254.90196078431373, "eval_steps": 500, "global_step": 6500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.9607843137254903, "grad_norm": 1.702310562133789, "learning_rate": 2.5000000000000004e-07, "loss": 0.4851, "step": 50 }, { "epoch": 3.9215686274509802, "grad_norm": 1.732956886291504, "learning_rate": 5.000000000000001e-07, "loss": 0.4815, "step": 100 }, { "epoch": 5.882352941176471, "grad_norm": 1.4467918872833252, "learning_rate": 7.5e-07, "loss": 0.474, "step": 150 }, { "epoch": 7.8431372549019605, "grad_norm": 1.181947112083435, "learning_rate": 1.0000000000000002e-06, "loss": 0.4699, "step": 200 }, { "epoch": 9.803921568627452, "grad_norm": 1.3492683172225952, "learning_rate": 1.25e-06, "loss": 0.4663, "step": 250 }, { "epoch": 11.764705882352942, "grad_norm": 1.3057738542556763, "learning_rate": 1.5e-06, "loss": 0.4675, "step": 300 }, { "epoch": 13.72549019607843, "grad_norm": 1.159374713897705, "learning_rate": 1.75e-06, "loss": 0.4563, "step": 350 }, { "epoch": 15.686274509803921, "grad_norm": 1.079891562461853, "learning_rate": 2.0000000000000003e-06, "loss": 0.4574, "step": 400 }, { "epoch": 17.647058823529413, "grad_norm": 1.121338963508606, "learning_rate": 2.25e-06, "loss": 0.4589, "step": 450 }, { "epoch": 19.607843137254903, "grad_norm": 1.3906892538070679, "learning_rate": 2.5e-06, "loss": 0.4592, "step": 500 }, { "epoch": 19.607843137254903, "eval_loss": 0.4200105369091034, "eval_runtime": 6.0873, "eval_samples_per_second": 29.57, "eval_steps_per_second": 3.778, "step": 500 }, { "epoch": 21.568627450980394, "grad_norm": 1.0391827821731567, "learning_rate": 2.7500000000000004e-06, "loss": 0.457, "step": 550 }, { "epoch": 23.529411764705884, "grad_norm": 1.0599409341812134, "learning_rate": 3e-06, "loss": 0.4463, "step": 600 }, { "epoch": 
25.49019607843137, "grad_norm": 1.0316087007522583, "learning_rate": 3.2500000000000002e-06, "loss": 0.4466, "step": 650 }, { "epoch": 27.45098039215686, "grad_norm": 0.7956145405769348, "learning_rate": 3.5e-06, "loss": 0.4425, "step": 700 }, { "epoch": 29.41176470588235, "grad_norm": 1.0623911619186401, "learning_rate": 3.7500000000000005e-06, "loss": 0.4422, "step": 750 }, { "epoch": 31.372549019607842, "grad_norm": 0.9455887675285339, "learning_rate": 4.000000000000001e-06, "loss": 0.4449, "step": 800 }, { "epoch": 33.333333333333336, "grad_norm": 0.955407977104187, "learning_rate": 4.25e-06, "loss": 0.4447, "step": 850 }, { "epoch": 35.294117647058826, "grad_norm": 0.9425307512283325, "learning_rate": 4.5e-06, "loss": 0.4417, "step": 900 }, { "epoch": 37.254901960784316, "grad_norm": 1.1426234245300293, "learning_rate": 4.75e-06, "loss": 0.4376, "step": 950 }, { "epoch": 39.21568627450981, "grad_norm": 1.1839220523834229, "learning_rate": 5e-06, "loss": 0.4389, "step": 1000 }, { "epoch": 39.21568627450981, "eval_loss": 0.4070351719856262, "eval_runtime": 6.2284, "eval_samples_per_second": 28.9, "eval_steps_per_second": 3.693, "step": 1000 }, { "epoch": 41.1764705882353, "grad_norm": 0.9445211887359619, "learning_rate": 5.2500000000000006e-06, "loss": 0.4317, "step": 1050 }, { "epoch": 43.13725490196079, "grad_norm": 0.8958839178085327, "learning_rate": 5.500000000000001e-06, "loss": 0.4329, "step": 1100 }, { "epoch": 45.09803921568628, "grad_norm": 0.9704959392547607, "learning_rate": 5.745000000000001e-06, "loss": 0.4412, "step": 1150 }, { "epoch": 47.05882352941177, "grad_norm": 0.8274173140525818, "learning_rate": 5.995000000000001e-06, "loss": 0.4287, "step": 1200 }, { "epoch": 49.01960784313726, "grad_norm": 1.8516689538955688, "learning_rate": 6.245000000000001e-06, "loss": 0.4323, "step": 1250 }, { "epoch": 50.98039215686274, "grad_norm": 1.067310094833374, "learning_rate": 6.4950000000000005e-06, "loss": 0.4313, "step": 1300 }, { "epoch": 
52.94117647058823, "grad_norm": 0.9546812772750854, "learning_rate": 6.745000000000001e-06, "loss": 0.4272, "step": 1350 }, { "epoch": 54.90196078431372, "grad_norm": 1.0992048978805542, "learning_rate": 6.995000000000001e-06, "loss": 0.4254, "step": 1400 }, { "epoch": 56.86274509803921, "grad_norm": 1.1789544820785522, "learning_rate": 7.245000000000001e-06, "loss": 0.4224, "step": 1450 }, { "epoch": 58.8235294117647, "grad_norm": 1.1544729471206665, "learning_rate": 7.495000000000001e-06, "loss": 0.429, "step": 1500 }, { "epoch": 58.8235294117647, "eval_loss": 0.3985774517059326, "eval_runtime": 6.2083, "eval_samples_per_second": 28.993, "eval_steps_per_second": 3.705, "step": 1500 }, { "epoch": 60.78431372549019, "grad_norm": 0.7678772807121277, "learning_rate": 7.745e-06, "loss": 0.4278, "step": 1550 }, { "epoch": 62.745098039215684, "grad_norm": 1.0999374389648438, "learning_rate": 7.995e-06, "loss": 0.4292, "step": 1600 }, { "epoch": 64.70588235294117, "grad_norm": 1.1139107942581177, "learning_rate": 8.245000000000002e-06, "loss": 0.4223, "step": 1650 }, { "epoch": 66.66666666666667, "grad_norm": 0.9838771224021912, "learning_rate": 8.495e-06, "loss": 0.428, "step": 1700 }, { "epoch": 68.62745098039215, "grad_norm": 1.2118204832077026, "learning_rate": 8.745000000000002e-06, "loss": 0.421, "step": 1750 }, { "epoch": 70.58823529411765, "grad_norm": 1.1182212829589844, "learning_rate": 8.995000000000001e-06, "loss": 0.4244, "step": 1800 }, { "epoch": 72.54901960784314, "grad_norm": 1.6195331811904907, "learning_rate": 9.245e-06, "loss": 0.4293, "step": 1850 }, { "epoch": 74.50980392156863, "grad_norm": 1.0526857376098633, "learning_rate": 9.495000000000001e-06, "loss": 0.4198, "step": 1900 }, { "epoch": 76.47058823529412, "grad_norm": 1.9449137449264526, "learning_rate": 9.745e-06, "loss": 0.4201, "step": 1950 }, { "epoch": 78.43137254901961, "grad_norm": 1.2872648239135742, "learning_rate": 9.995000000000002e-06, "loss": 0.4173, "step": 2000 }, { "epoch": 
78.43137254901961, "eval_loss": 0.3947462737560272, "eval_runtime": 5.956, "eval_samples_per_second": 30.222, "eval_steps_per_second": 3.862, "step": 2000 }, { "epoch": 80.3921568627451, "grad_norm": 0.8246454000473022, "learning_rate": 9.918333333333335e-06, "loss": 0.4159, "step": 2050 }, { "epoch": 82.3529411764706, "grad_norm": 1.8738404512405396, "learning_rate": 9.835000000000002e-06, "loss": 0.4183, "step": 2100 }, { "epoch": 84.31372549019608, "grad_norm": 1.0096226930618286, "learning_rate": 9.751666666666667e-06, "loss": 0.4136, "step": 2150 }, { "epoch": 86.27450980392157, "grad_norm": 1.6164231300354004, "learning_rate": 9.668333333333334e-06, "loss": 0.4186, "step": 2200 }, { "epoch": 88.23529411764706, "grad_norm": 1.2511614561080933, "learning_rate": 9.585e-06, "loss": 0.42, "step": 2250 }, { "epoch": 90.19607843137256, "grad_norm": 0.8566817045211792, "learning_rate": 9.501666666666667e-06, "loss": 0.4131, "step": 2300 }, { "epoch": 92.15686274509804, "grad_norm": 1.2247796058654785, "learning_rate": 9.418333333333334e-06, "loss": 0.4185, "step": 2350 }, { "epoch": 94.11764705882354, "grad_norm": 0.9032573699951172, "learning_rate": 9.335000000000001e-06, "loss": 0.4152, "step": 2400 }, { "epoch": 96.07843137254902, "grad_norm": 1.0673198699951172, "learning_rate": 9.251666666666668e-06, "loss": 0.4108, "step": 2450 }, { "epoch": 98.03921568627452, "grad_norm": 1.3430674076080322, "learning_rate": 9.168333333333333e-06, "loss": 0.4141, "step": 2500 }, { "epoch": 98.03921568627452, "eval_loss": 0.39529120922088623, "eval_runtime": 6.0557, "eval_samples_per_second": 29.724, "eval_steps_per_second": 3.798, "step": 2500 }, { "epoch": 100.0, "grad_norm": 1.1645582914352417, "learning_rate": 9.085e-06, "loss": 0.4142, "step": 2550 }, { "epoch": 101.96078431372548, "grad_norm": 0.8859590291976929, "learning_rate": 9.001666666666667e-06, "loss": 0.4072, "step": 2600 }, { "epoch": 103.92156862745098, "grad_norm": 1.001186490058899, "learning_rate": 
8.918333333333334e-06, "loss": 0.4078, "step": 2650 }, { "epoch": 105.88235294117646, "grad_norm": 0.882518470287323, "learning_rate": 8.835000000000001e-06, "loss": 0.4131, "step": 2700 }, { "epoch": 107.84313725490196, "grad_norm": 0.9024128317832947, "learning_rate": 8.751666666666668e-06, "loss": 0.4126, "step": 2750 }, { "epoch": 109.80392156862744, "grad_norm": 1.4872159957885742, "learning_rate": 8.668333333333335e-06, "loss": 0.4073, "step": 2800 }, { "epoch": 111.76470588235294, "grad_norm": 1.1212759017944336, "learning_rate": 8.585000000000002e-06, "loss": 0.4024, "step": 2850 }, { "epoch": 113.72549019607843, "grad_norm": 1.5177373886108398, "learning_rate": 8.501666666666667e-06, "loss": 0.4077, "step": 2900 }, { "epoch": 115.68627450980392, "grad_norm": 0.8353803753852844, "learning_rate": 8.418333333333334e-06, "loss": 0.4007, "step": 2950 }, { "epoch": 117.6470588235294, "grad_norm": 1.0011717081069946, "learning_rate": 8.335e-06, "loss": 0.4043, "step": 3000 }, { "epoch": 117.6470588235294, "eval_loss": 0.3915382921695709, "eval_runtime": 5.9976, "eval_samples_per_second": 30.012, "eval_steps_per_second": 3.835, "step": 3000 }, { "epoch": 119.6078431372549, "grad_norm": 1.08296799659729, "learning_rate": 8.251666666666668e-06, "loss": 0.4036, "step": 3050 }, { "epoch": 121.56862745098039, "grad_norm": 2.0275931358337402, "learning_rate": 8.168333333333334e-06, "loss": 0.4033, "step": 3100 }, { "epoch": 123.52941176470588, "grad_norm": 1.1004642248153687, "learning_rate": 8.085000000000001e-06, "loss": 0.4043, "step": 3150 }, { "epoch": 125.49019607843137, "grad_norm": 0.9387047290802002, "learning_rate": 8.001666666666668e-06, "loss": 0.4055, "step": 3200 }, { "epoch": 127.45098039215686, "grad_norm": 1.542277455329895, "learning_rate": 7.918333333333333e-06, "loss": 0.4118, "step": 3250 }, { "epoch": 129.41176470588235, "grad_norm": 0.848300576210022, "learning_rate": 7.835e-06, "loss": 0.4008, "step": 3300 }, { "epoch": 131.37254901960785, 
"grad_norm": 0.9656948447227478, "learning_rate": 7.751666666666667e-06, "loss": 0.4003, "step": 3350 }, { "epoch": 133.33333333333334, "grad_norm": 0.8315781950950623, "learning_rate": 7.668333333333334e-06, "loss": 0.4038, "step": 3400 }, { "epoch": 135.2941176470588, "grad_norm": 0.8850957155227661, "learning_rate": 7.585e-06, "loss": 0.4024, "step": 3450 }, { "epoch": 137.2549019607843, "grad_norm": 0.936983048915863, "learning_rate": 7.501666666666667e-06, "loss": 0.4009, "step": 3500 }, { "epoch": 137.2549019607843, "eval_loss": 0.3919764757156372, "eval_runtime": 6.0942, "eval_samples_per_second": 29.536, "eval_steps_per_second": 3.774, "step": 3500 }, { "epoch": 139.2156862745098, "grad_norm": 1.0259428024291992, "learning_rate": 7.418333333333334e-06, "loss": 0.3997, "step": 3550 }, { "epoch": 141.1764705882353, "grad_norm": 0.8332601189613342, "learning_rate": 7.335000000000001e-06, "loss": 0.3987, "step": 3600 }, { "epoch": 143.13725490196077, "grad_norm": 0.8447348475456238, "learning_rate": 7.251666666666667e-06, "loss": 0.4005, "step": 3650 }, { "epoch": 145.09803921568627, "grad_norm": 0.8829927444458008, "learning_rate": 7.168333333333334e-06, "loss": 0.4018, "step": 3700 }, { "epoch": 147.05882352941177, "grad_norm": 0.7714055180549622, "learning_rate": 7.085000000000001e-06, "loss": 0.3964, "step": 3750 }, { "epoch": 149.01960784313727, "grad_norm": 1.4081863164901733, "learning_rate": 7.001666666666668e-06, "loss": 0.3983, "step": 3800 }, { "epoch": 150.98039215686273, "grad_norm": 0.9548608660697937, "learning_rate": 6.918333333333334e-06, "loss": 0.3991, "step": 3850 }, { "epoch": 152.94117647058823, "grad_norm": 0.8904614448547363, "learning_rate": 6.835000000000001e-06, "loss": 0.397, "step": 3900 }, { "epoch": 154.90196078431373, "grad_norm": 1.3204950094223022, "learning_rate": 6.7516666666666675e-06, "loss": 0.3993, "step": 3950 }, { "epoch": 156.86274509803923, "grad_norm": 1.048753261566162, "learning_rate": 6.668333333333334e-06, 
"loss": 0.3965, "step": 4000 }, { "epoch": 156.86274509803923, "eval_loss": 0.39249905943870544, "eval_runtime": 5.9893, "eval_samples_per_second": 30.054, "eval_steps_per_second": 3.84, "step": 4000 }, { "epoch": 158.8235294117647, "grad_norm": 0.7640918493270874, "learning_rate": 6.5850000000000005e-06, "loss": 0.4041, "step": 4050 }, { "epoch": 160.7843137254902, "grad_norm": 0.9641494750976562, "learning_rate": 6.501666666666667e-06, "loss": 0.397, "step": 4100 }, { "epoch": 162.7450980392157, "grad_norm": 0.9476408362388611, "learning_rate": 6.418333333333334e-06, "loss": 0.3996, "step": 4150 }, { "epoch": 164.7058823529412, "grad_norm": 0.7405825853347778, "learning_rate": 6.335e-06, "loss": 0.3969, "step": 4200 }, { "epoch": 166.66666666666666, "grad_norm": 0.9017153382301331, "learning_rate": 6.251666666666667e-06, "loss": 0.3984, "step": 4250 }, { "epoch": 168.62745098039215, "grad_norm": 1.1099542379379272, "learning_rate": 6.168333333333334e-06, "loss": 0.4008, "step": 4300 }, { "epoch": 170.58823529411765, "grad_norm": null, "learning_rate": 6.086666666666667e-06, "loss": 0.4008, "step": 4350 }, { "epoch": 172.54901960784315, "grad_norm": 0.9821481108665466, "learning_rate": 6.003333333333334e-06, "loss": 0.3956, "step": 4400 }, { "epoch": 174.50980392156862, "grad_norm": 0.9521259069442749, "learning_rate": 5.92e-06, "loss": 0.3926, "step": 4450 }, { "epoch": 176.47058823529412, "grad_norm": 0.9021440148353577, "learning_rate": 5.836666666666667e-06, "loss": 0.3935, "step": 4500 }, { "epoch": 176.47058823529412, "eval_loss": 0.39356571435928345, "eval_runtime": 6.0395, "eval_samples_per_second": 29.804, "eval_steps_per_second": 3.808, "step": 4500 }, { "epoch": 178.4313725490196, "grad_norm": 1.2105145454406738, "learning_rate": 5.753333333333334e-06, "loss": 0.3973, "step": 4550 }, { "epoch": 180.3921568627451, "grad_norm": 0.764930784702301, "learning_rate": 5.67e-06, "loss": 0.401, "step": 4600 }, { "epoch": 182.35294117647058, "grad_norm": 
0.8594487905502319, "learning_rate": 5.586666666666667e-06, "loss": 0.404, "step": 4650 }, { "epoch": 184.31372549019608, "grad_norm": 0.8230810165405273, "learning_rate": 5.503333333333334e-06, "loss": 0.3904, "step": 4700 }, { "epoch": 186.27450980392157, "grad_norm": 1.0998687744140625, "learning_rate": 5.420000000000001e-06, "loss": 0.3934, "step": 4750 }, { "epoch": 188.23529411764707, "grad_norm": 0.8254913091659546, "learning_rate": 5.336666666666667e-06, "loss": 0.3913, "step": 4800 }, { "epoch": 190.19607843137254, "grad_norm": 0.7729244232177734, "learning_rate": 5.2533333333333336e-06, "loss": 0.3932, "step": 4850 }, { "epoch": 192.15686274509804, "grad_norm": 0.936481237411499, "learning_rate": 5.1700000000000005e-06, "loss": 0.393, "step": 4900 }, { "epoch": 194.11764705882354, "grad_norm": 0.9065796732902527, "learning_rate": 5.086666666666667e-06, "loss": 0.3967, "step": 4950 }, { "epoch": 196.07843137254903, "grad_norm": 0.8526683449745178, "learning_rate": 5.0033333333333334e-06, "loss": 0.3898, "step": 5000 }, { "epoch": 196.07843137254903, "eval_loss": 0.39234617352485657, "eval_runtime": 6.0934, "eval_samples_per_second": 29.54, "eval_steps_per_second": 3.775, "step": 5000 }, { "epoch": 198.0392156862745, "grad_norm": 0.7739047408103943, "learning_rate": 4.92e-06, "loss": 0.3958, "step": 5050 }, { "epoch": 200.0, "grad_norm": 0.9808892607688904, "learning_rate": 4.836666666666667e-06, "loss": 0.3909, "step": 5100 }, { "epoch": 201.9607843137255, "grad_norm": 0.6833762526512146, "learning_rate": 4.753333333333333e-06, "loss": 0.3909, "step": 5150 }, { "epoch": 203.92156862745097, "grad_norm": 0.9400734305381775, "learning_rate": 4.670000000000001e-06, "loss": 0.3897, "step": 5200 }, { "epoch": 205.88235294117646, "grad_norm": 0.7391972541809082, "learning_rate": 4.586666666666667e-06, "loss": 0.3954, "step": 5250 }, { "epoch": 207.84313725490196, "grad_norm": 0.8772739171981812, "learning_rate": 4.503333333333333e-06, "loss": 0.3925, "step": 5300 
}, { "epoch": 209.80392156862746, "grad_norm": 1.0046323537826538, "learning_rate": 4.42e-06, "loss": 0.3911, "step": 5350 }, { "epoch": 211.76470588235293, "grad_norm": 1.2856130599975586, "learning_rate": 4.336666666666667e-06, "loss": 0.3934, "step": 5400 }, { "epoch": 213.72549019607843, "grad_norm": 0.904976487159729, "learning_rate": 4.253333333333334e-06, "loss": 0.3916, "step": 5450 }, { "epoch": 215.68627450980392, "grad_norm": 0.8475141525268555, "learning_rate": 4.17e-06, "loss": 0.3966, "step": 5500 }, { "epoch": 215.68627450980392, "eval_loss": 0.3928934931755066, "eval_runtime": 6.3131, "eval_samples_per_second": 28.512, "eval_steps_per_second": 3.643, "step": 5500 }, { "epoch": 217.64705882352942, "grad_norm": 0.7401838302612305, "learning_rate": 4.086666666666667e-06, "loss": 0.3858, "step": 5550 }, { "epoch": 219.6078431372549, "grad_norm": 0.9024094939231873, "learning_rate": 4.003333333333334e-06, "loss": 0.3945, "step": 5600 }, { "epoch": 221.5686274509804, "grad_norm": 0.6779565811157227, "learning_rate": 3.920000000000001e-06, "loss": 0.3919, "step": 5650 }, { "epoch": 223.52941176470588, "grad_norm": 1.0921446084976196, "learning_rate": 3.836666666666667e-06, "loss": 0.39, "step": 5700 }, { "epoch": 225.49019607843138, "grad_norm": 0.8531523942947388, "learning_rate": 3.753333333333334e-06, "loss": 0.3893, "step": 5750 }, { "epoch": 227.45098039215685, "grad_norm": 0.9430971741676331, "learning_rate": 3.6700000000000004e-06, "loss": 0.3919, "step": 5800 }, { "epoch": 229.41176470588235, "grad_norm": 0.7794237732887268, "learning_rate": 3.5866666666666673e-06, "loss": 0.3895, "step": 5850 }, { "epoch": 231.37254901960785, "grad_norm": 0.766396701335907, "learning_rate": 3.5033333333333334e-06, "loss": 0.3893, "step": 5900 }, { "epoch": 233.33333333333334, "grad_norm": 0.7494246959686279, "learning_rate": 3.4200000000000007e-06, "loss": 0.3865, "step": 5950 }, { "epoch": 235.2941176470588, "grad_norm": 0.9798209071159363, "learning_rate": 
3.3366666666666668e-06, "loss": 0.3936, "step": 6000 }, { "epoch": 235.2941176470588, "eval_loss": 0.3921584188938141, "eval_runtime": 6.0843, "eval_samples_per_second": 29.584, "eval_steps_per_second": 3.78, "step": 6000 }, { "epoch": 237.2549019607843, "grad_norm": 0.8828366994857788, "learning_rate": 3.2533333333333332e-06, "loss": 0.3881, "step": 6050 }, { "epoch": 239.2156862745098, "grad_norm": 1.034189224243164, "learning_rate": 3.17e-06, "loss": 0.3905, "step": 6100 }, { "epoch": 241.1764705882353, "grad_norm": 0.7035565972328186, "learning_rate": 3.0866666666666666e-06, "loss": 0.3889, "step": 6150 }, { "epoch": 243.13725490196077, "grad_norm": 0.8675793409347534, "learning_rate": 3.0033333333333335e-06, "loss": 0.3895, "step": 6200 }, { "epoch": 245.09803921568627, "grad_norm": 0.731158435344696, "learning_rate": 2.92e-06, "loss": 0.3835, "step": 6250 }, { "epoch": 247.05882352941177, "grad_norm": 0.8929085731506348, "learning_rate": 2.836666666666667e-06, "loss": 0.3906, "step": 6300 }, { "epoch": 249.01960784313727, "grad_norm": 1.1059510707855225, "learning_rate": 2.7533333333333334e-06, "loss": 0.3905, "step": 6350 }, { "epoch": 250.98039215686273, "grad_norm": 0.9429101347923279, "learning_rate": 2.6700000000000003e-06, "loss": 0.3894, "step": 6400 }, { "epoch": 252.94117647058823, "grad_norm": 1.0544432401657104, "learning_rate": 2.5866666666666667e-06, "loss": 0.3937, "step": 6450 }, { "epoch": 254.90196078431373, "grad_norm": 0.8494179844856262, "learning_rate": 2.5033333333333336e-06, "loss": 0.3921, "step": 6500 }, { "epoch": 254.90196078431373, "eval_loss": 0.3925850987434387, "eval_runtime": 6.2722, "eval_samples_per_second": 28.698, "eval_steps_per_second": 3.667, "step": 6500 } ], "logging_steps": 50, "max_steps": 8000, "num_input_tokens_seen": 0, "num_train_epochs": 320, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, 
"should_training_stop": false }, "attributes": {} } }, "total_flos": 7.072681729994813e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }