{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "global_step": 513, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06, "learning_rate": 5.917808219178082e-06, "loss": 0.0322, "step": 10 }, { "epoch": 0.06, "eval_oasst_export_accuracy": 0.49051699074786653, "eval_oasst_export_loss": 3.484375, "eval_oasst_export_runtime": 6.0372, "eval_oasst_export_samples_per_second": 181.21, "eval_oasst_export_steps_per_second": 2.982, "step": 10 }, { "epoch": 0.12, "learning_rate": 5.800391389432485e-06, "loss": 0.0246, "step": 20 }, { "epoch": 0.12, "eval_oasst_export_accuracy": 0.489678997143981, "eval_oasst_export_loss": 3.779296875, "eval_oasst_export_runtime": 6.056, "eval_oasst_export_samples_per_second": 180.647, "eval_oasst_export_steps_per_second": 2.972, "step": 20 }, { "epoch": 0.18, "learning_rate": 5.682974559686888e-06, "loss": 0.0203, "step": 30 }, { "epoch": 0.18, "eval_oasst_export_accuracy": 0.4914062900826022, "eval_oasst_export_loss": 3.734375, "eval_oasst_export_runtime": 6.0349, "eval_oasst_export_samples_per_second": 181.279, "eval_oasst_export_steps_per_second": 2.983, "step": 30 }, { "epoch": 0.23, "learning_rate": 5.565557729941292e-06, "loss": 0.0187, "step": 40 }, { "epoch": 0.23, "eval_oasst_export_accuracy": 0.4879859080259265, "eval_oasst_export_loss": 3.7109375, "eval_oasst_export_runtime": 6.0607, "eval_oasst_export_samples_per_second": 180.508, "eval_oasst_export_steps_per_second": 2.97, "step": 40 }, { "epoch": 0.29, "learning_rate": 5.448140900195694e-06, "loss": 0.0162, "step": 50 }, { "epoch": 0.29, "eval_oasst_export_accuracy": 0.4880030099362099, "eval_oasst_export_loss": 3.783203125, "eval_oasst_export_runtime": 6.0304, "eval_oasst_export_samples_per_second": 181.415, "eval_oasst_export_steps_per_second": 2.985, "step": 50 }, { "epoch": 0.35, "learning_rate": 5.330724070450098e-06, "loss": 0.0159, "step": 60 }, { "epoch": 0.35, "eval_oasst_export_accuracy": 0.486292818907872, "eval_oasst_export_loss": 3.7890625, "eval_oasst_export_runtime": 6.0367, "eval_oasst_export_samples_per_second": 181.224, "eval_oasst_export_steps_per_second": 2.982, "step": 60 }, { "epoch": 0.41, "learning_rate": 5.213307240704501e-06, "loss": 0.0172, "step": 70 }, { "epoch": 0.41, "eval_oasst_export_accuracy": 0.48716501633232434, "eval_oasst_export_loss": 3.806640625, "eval_oasst_export_runtime": 6.0405, "eval_oasst_export_samples_per_second": 181.11, "eval_oasst_export_steps_per_second": 2.98, "step": 70 }, { "epoch": 0.47, "learning_rate": 5.095890410958904e-06, "loss": 0.0157, "step": 80 }, { "epoch": 0.47, "eval_oasst_export_accuracy": 0.48454842405896736, "eval_oasst_export_loss": 3.791015625, "eval_oasst_export_runtime": 6.0411, "eval_oasst_export_samples_per_second": 181.092, "eval_oasst_export_steps_per_second": 2.98, "step": 80 }, { "epoch": 0.53, "learning_rate": 4.978473581213307e-06, "loss": 0.0163, "step": 90 }, { "epoch": 0.53, "eval_oasst_export_accuracy": 0.49138918817231886, "eval_oasst_export_loss": 3.822265625, "eval_oasst_export_runtime": 5.6229, "eval_oasst_export_samples_per_second": 194.561, "eval_oasst_export_steps_per_second": 3.201, "step": 90 }, { "epoch": 0.58, "learning_rate": 4.86105675146771e-06, "loss": 0.0139, "step": 100 }, { "epoch": 0.58, "eval_oasst_export_accuracy": 0.4956817676534469, "eval_oasst_export_loss": 3.826171875, "eval_oasst_export_runtime": 6.0223, "eval_oasst_export_samples_per_second": 181.659, "eval_oasst_export_steps_per_second": 2.989, "step": 100 }, { "epoch": 0.64, "learning_rate": 4.7436399217221134e-06, "loss": 0.0151, "step": 110 }, { "epoch": 0.64, "eval_oasst_export_accuracy": 0.4845655259692508, "eval_oasst_export_loss": 3.904296875, "eval_oasst_export_runtime": 6.0257, "eval_oasst_export_samples_per_second": 181.554, "eval_oasst_export_steps_per_second": 2.987, "step": 110 }, { "epoch": 0.7, "learning_rate": 4.626223091976516e-06, "loss": 0.0139, "step": 120 }, { "epoch": 0.7, "eval_oasst_export_accuracy": 0.49049988883758316, "eval_oasst_export_loss": 3.958984375, "eval_oasst_export_runtime": 6.0136, "eval_oasst_export_samples_per_second": 181.92, "eval_oasst_export_steps_per_second": 2.993, "step": 120 }, { "epoch": 0.76, "learning_rate": 4.50880626223092e-06, "loss": 0.0133, "step": 130 }, { "epoch": 0.76, "eval_oasst_export_accuracy": 0.4803413541292562, "eval_oasst_export_loss": 3.845703125, "eval_oasst_export_runtime": 6.0221, "eval_oasst_export_samples_per_second": 181.663, "eval_oasst_export_steps_per_second": 2.989, "step": 130 }, { "epoch": 0.82, "learning_rate": 4.3913894324853226e-06, "loss": 0.0142, "step": 140 }, { "epoch": 0.82, "eval_oasst_export_accuracy": 0.4777076599456159, "eval_oasst_export_loss": 3.828125, "eval_oasst_export_runtime": 6.0368, "eval_oasst_export_samples_per_second": 181.223, "eval_oasst_export_steps_per_second": 2.982, "step": 140 }, { "epoch": 0.88, "learning_rate": 4.273972602739726e-06, "loss": 0.0141, "step": 150 }, { "epoch": 0.88, "eval_oasst_export_accuracy": 0.47088399774254786, "eval_oasst_export_loss": 3.998046875, "eval_oasst_export_runtime": 6.0502, "eval_oasst_export_samples_per_second": 180.82, "eval_oasst_export_steps_per_second": 2.975, "step": 150 }, { "epoch": 0.94, "learning_rate": 4.156555772994129e-06, "loss": 0.0157, "step": 160 }, { "epoch": 0.94, "eval_oasst_export_accuracy": 0.4726796983223026, "eval_oasst_export_loss": 3.8828125, "eval_oasst_export_runtime": 5.995, "eval_oasst_export_samples_per_second": 182.487, "eval_oasst_export_steps_per_second": 3.003, "step": 160 }, { "epoch": 0.99, "learning_rate": 4.0391389432485325e-06, "loss": 0.0139, "step": 170 }, { "epoch": 0.99, "eval_oasst_export_accuracy": 0.4744240931712072, "eval_oasst_export_loss": 3.9609375, "eval_oasst_export_runtime": 6.0696, "eval_oasst_export_samples_per_second": 180.244, "eval_oasst_export_steps_per_second": 2.966, "step": 170 }, { "epoch": 1.05, "learning_rate": 3.921722113502935e-06, "loss": 0.0125, "step": 180 }, { "epoch": 1.05, "eval_oasst_export_accuracy": 0.48210285088844423, "eval_oasst_export_loss": 4.03125, "eval_oasst_export_runtime": 5.616, "eval_oasst_export_samples_per_second": 194.8, "eval_oasst_export_steps_per_second": 3.205, "step": 180 }, { "epoch": 1.11, "learning_rate": 3.8043052837573385e-06, "loss": 0.017, "step": 190 }, { "epoch": 1.11, "eval_oasst_export_accuracy": 0.4897303028748311, "eval_oasst_export_loss": 3.970703125, "eval_oasst_export_runtime": 6.03, "eval_oasst_export_samples_per_second": 181.426, "eval_oasst_export_steps_per_second": 2.985, "step": 190 }, { "epoch": 1.17, "learning_rate": 3.6868884540117416e-06, "loss": 0.0137, "step": 200 }, { "epoch": 1.17, "eval_oasst_export_accuracy": 0.4794862586150873, "eval_oasst_export_loss": 3.986328125, "eval_oasst_export_runtime": 6.0192, "eval_oasst_export_samples_per_second": 181.751, "eval_oasst_export_steps_per_second": 2.99, "step": 200 }, { "epoch": 1.23, "learning_rate": 3.569471624266145e-06, "loss": 0.0133, "step": 210 }, { "epoch": 1.23, "eval_oasst_export_accuracy": 0.47693807398286386, "eval_oasst_export_loss": 4.140625, "eval_oasst_export_runtime": 6.0148, "eval_oasst_export_samples_per_second": 181.886, "eval_oasst_export_steps_per_second": 2.993, "step": 210 }, { "epoch": 1.29, "learning_rate": 3.452054794520548e-06, "loss": 0.0133, "step": 220 }, { "epoch": 1.29, "eval_oasst_export_accuracy": 0.48713081251175755, "eval_oasst_export_loss": 4.1484375, "eval_oasst_export_runtime": 6.0347, "eval_oasst_export_samples_per_second": 181.285, "eval_oasst_export_steps_per_second": 2.983, "step": 220 }, { "epoch": 1.35, "learning_rate": 3.334637964774951e-06, "loss": 0.0154, "step": 230 }, { "epoch": 1.35, "eval_oasst_export_accuracy": 0.4752620867750928, "eval_oasst_export_loss": 4.09375, "eval_oasst_export_runtime": 5.6373, "eval_oasst_export_samples_per_second": 194.065, "eval_oasst_export_steps_per_second": 3.193, "step": 230 }, { "epoch": 1.4, "learning_rate": 3.217221135029354e-06, "loss": 0.0139, "step": 240 }, { "epoch": 1.4, "eval_oasst_export_accuracy": 0.4760658765584116, "eval_oasst_export_loss": 4.09765625, "eval_oasst_export_runtime": 6.055, "eval_oasst_export_samples_per_second": 180.676, "eval_oasst_export_steps_per_second": 2.973, "step": 240 }, { "epoch": 1.46, "learning_rate": 3.099804305283757e-06, "loss": 0.0136, "step": 250 }, { "epoch": 1.46, "eval_oasst_export_accuracy": 0.47102081302481486, "eval_oasst_export_loss": 4.20703125, "eval_oasst_export_runtime": 6.0408, "eval_oasst_export_samples_per_second": 181.103, "eval_oasst_export_steps_per_second": 2.98, "step": 250 }, { "epoch": 1.52, "learning_rate": 2.9823874755381603e-06, "loss": 0.0129, "step": 260 }, { "epoch": 1.52, "eval_oasst_export_accuracy": 0.4641458450908967, "eval_oasst_export_loss": 4.2578125, "eval_oasst_export_runtime": 6.0349, "eval_oasst_export_samples_per_second": 181.279, "eval_oasst_export_steps_per_second": 2.983, "step": 260 }, { "epoch": 1.58, "learning_rate": 2.8649706457925635e-06, "loss": 0.0128, "step": 270 }, { "epoch": 1.58, "eval_oasst_export_accuracy": 0.4649838386947822, "eval_oasst_export_loss": 4.24609375, "eval_oasst_export_runtime": 6.0355, "eval_oasst_export_samples_per_second": 181.26, "eval_oasst_export_steps_per_second": 2.982, "step": 270 }, { "epoch": 1.64, "learning_rate": 2.7475538160469666e-06, "loss": 0.0133, "step": 280 }, { "epoch": 1.64, "eval_oasst_export_accuracy": 0.46412874318061326, "eval_oasst_export_loss": 4.2421875, "eval_oasst_export_runtime": 6.0367, "eval_oasst_export_samples_per_second": 181.224, "eval_oasst_export_steps_per_second": 2.982, "step": 280 }, { "epoch": 1.7, "learning_rate": 2.63013698630137e-06, "loss": 0.013, "step": 290 }, { "epoch": 1.7, "eval_oasst_export_accuracy": 0.47182460280813365, "eval_oasst_export_loss": 4.28125, "eval_oasst_export_runtime": 6.0187, "eval_oasst_export_samples_per_second": 181.768, "eval_oasst_export_steps_per_second": 2.991, "step": 290 }, { "epoch": 1.75, "learning_rate": 2.512720156555773e-06, "loss": 0.0133, "step": 300 }, { "epoch": 1.75, "eval_oasst_export_accuracy": 0.47093530347339796, "eval_oasst_export_loss": 4.30078125, "eval_oasst_export_runtime": 6.0452, "eval_oasst_export_samples_per_second": 180.969, "eval_oasst_export_steps_per_second": 2.978, "step": 300 }, { "epoch": 1.81, "learning_rate": 2.395303326810176e-06, "loss": 0.0117, "step": 310 }, { "epoch": 1.81, "eval_oasst_export_accuracy": 0.4726454945017358, "eval_oasst_export_loss": 4.41796875, "eval_oasst_export_runtime": 6.0234, "eval_oasst_export_samples_per_second": 181.624, "eval_oasst_export_steps_per_second": 2.988, "step": 310 }, { "epoch": 1.87, "learning_rate": 2.2778864970645793e-06, "loss": 0.0121, "step": 320 }, { "epoch": 1.87, "eval_oasst_export_accuracy": 0.47096950729396475, "eval_oasst_export_loss": 4.4375, "eval_oasst_export_runtime": 6.0349, "eval_oasst_export_samples_per_second": 181.279, "eval_oasst_export_steps_per_second": 2.983, "step": 320 }, { "epoch": 1.93, "learning_rate": 2.1604696673189825e-06, "loss": 0.0121, "step": 330 }, { "epoch": 1.93, "eval_oasst_export_accuracy": 0.480375557949823, "eval_oasst_export_loss": 4.421875, "eval_oasst_export_runtime": 6.0242, "eval_oasst_export_samples_per_second": 181.602, "eval_oasst_export_steps_per_second": 2.988, "step": 330 }, { "epoch": 1.99, "learning_rate": 2.0430528375733853e-06, "loss": 0.0128, "step": 340 }, { "epoch": 1.99, "eval_oasst_export_accuracy": 0.4811964496434252, "eval_oasst_export_loss": 4.40625, "eval_oasst_export_runtime": 6.0198, "eval_oasst_export_samples_per_second": 181.734, "eval_oasst_export_steps_per_second": 2.99, "step": 340 }, { "epoch": 2.05, "learning_rate": 1.9256360078277885e-06, "loss": 0.0126, "step": 350 }, { "epoch": 2.05, "eval_oasst_export_accuracy": 0.4785969592803516, "eval_oasst_export_loss": 4.34375, "eval_oasst_export_runtime": 5.9923, "eval_oasst_export_samples_per_second": 182.567, "eval_oasst_export_steps_per_second": 3.004, "step": 350 }, { "epoch": 2.11, "learning_rate": 1.8082191780821916e-06, "loss": 0.0142, "step": 360 }, { "epoch": 2.11, "eval_oasst_export_accuracy": 0.47438988935064047, "eval_oasst_export_loss": 4.3359375, "eval_oasst_export_runtime": 6.0319, "eval_oasst_export_samples_per_second": 181.368, "eval_oasst_export_steps_per_second": 2.984, "step": 360 }, { "epoch": 2.16, "learning_rate": 1.6908023483365948e-06, "loss": 0.0119, "step": 370 }, { "epoch": 2.16, "eval_oasst_export_accuracy": 0.47102081302481486, "eval_oasst_export_loss": 4.359375, "eval_oasst_export_runtime": 6.0592, "eval_oasst_export_samples_per_second": 180.553, "eval_oasst_export_steps_per_second": 2.971, "step": 370 }, { "epoch": 2.22, "learning_rate": 1.573385518590998e-06, "loss": 0.0122, "step": 380 }, { "epoch": 2.22, "eval_oasst_export_accuracy": 0.46927641817591026, "eval_oasst_export_loss": 4.37890625, "eval_oasst_export_runtime": 6.0252, "eval_oasst_export_samples_per_second": 181.571, "eval_oasst_export_steps_per_second": 2.987, "step": 380 }, { "epoch": 2.28, "learning_rate": 1.4559686888454012e-06, "loss": 0.0115, "step": 390 }, { "epoch": 2.28, "eval_oasst_export_accuracy": 0.471841704718417, "eval_oasst_export_loss": 4.40625, "eval_oasst_export_runtime": 6.0266, "eval_oasst_export_samples_per_second": 181.528, "eval_oasst_export_steps_per_second": 2.987, "step": 390 }, { "epoch": 2.34, "learning_rate": 1.3385518590998044e-06, "loss": 0.0122, "step": 400 }, { "epoch": 2.34, "eval_oasst_export_accuracy": 0.4727823097840029, "eval_oasst_export_loss": 4.4453125, "eval_oasst_export_runtime": 6.0313, "eval_oasst_export_samples_per_second": 181.387, "eval_oasst_export_steps_per_second": 2.984, "step": 400 }, { "epoch": 2.4, "learning_rate": 1.2211350293542073e-06, "loss": 0.0128, "step": 410 }, { "epoch": 2.4, "eval_oasst_export_accuracy": 0.4727310040531527, "eval_oasst_export_loss": 4.48828125, "eval_oasst_export_runtime": 6.0409, "eval_oasst_export_samples_per_second": 181.1, "eval_oasst_export_steps_per_second": 2.98, "step": 410 }, { "epoch": 2.46, "learning_rate": 1.1037181996086105e-06, "loss": 0.0114, "step": 420 }, { "epoch": 2.46, "eval_oasst_export_accuracy": 0.4734834881056214, "eval_oasst_export_loss": 4.48828125, "eval_oasst_export_runtime": 6.046, "eval_oasst_export_samples_per_second": 180.947, "eval_oasst_export_steps_per_second": 2.977, "step": 420 }, { "epoch": 2.51, "learning_rate": 9.863013698630137e-07, "loss": 0.011, "step": 430 }, { "epoch": 2.51, "eval_oasst_export_accuracy": 0.47777606758674945, "eval_oasst_export_loss": 4.4765625, "eval_oasst_export_runtime": 6.0324, "eval_oasst_export_samples_per_second": 181.355, "eval_oasst_export_steps_per_second": 2.984, "step": 430 }, { "epoch": 2.57, "learning_rate": 8.688845401174168e-07, "loss": 0.0129, "step": 440 }, { "epoch": 2.57, "eval_oasst_export_accuracy": 0.4744240931712072, "eval_oasst_export_loss": 4.45703125, "eval_oasst_export_runtime": 6.0468, "eval_oasst_export_samples_per_second": 180.921, "eval_oasst_export_steps_per_second": 2.977, "step": 440 }, { "epoch": 2.63, "learning_rate": 7.514677103718199e-07, "loss": 0.0133, "step": 450 }, { "epoch": 2.63, "eval_oasst_export_accuracy": 0.4726796983223026, "eval_oasst_export_loss": 4.45703125, "eval_oasst_export_runtime": 6.0371, "eval_oasst_export_samples_per_second": 181.213, "eval_oasst_export_steps_per_second": 2.982, "step": 450 }, { "epoch": 2.69, "learning_rate": 6.340508806262231e-07, "loss": 0.0131, "step": 460 }, { "epoch": 2.69, "eval_oasst_export_accuracy": 0.47782737331759956, "eval_oasst_export_loss": 4.48046875, "eval_oasst_export_runtime": 6.0367, "eval_oasst_export_samples_per_second": 181.226, "eval_oasst_export_steps_per_second": 2.982, "step": 460 }, { "epoch": 2.75, "learning_rate": 5.166340508806262e-07, "loss": 0.0111, "step": 470 }, { "epoch": 2.75, "eval_oasst_export_accuracy": 0.4752620867750928, "eval_oasst_export_loss": 4.484375, "eval_oasst_export_runtime": 6.0217, "eval_oasst_export_samples_per_second": 181.676, "eval_oasst_export_steps_per_second": 2.989, "step": 470 }, { "epoch": 2.81, "learning_rate": 3.9921722113502936e-07, "loss": 0.012, "step": 480 }, { "epoch": 2.81, "eval_oasst_export_accuracy": 0.47774186376618266, "eval_oasst_export_loss": 4.4765625, "eval_oasst_export_runtime": 6.018, "eval_oasst_export_samples_per_second": 181.788, "eval_oasst_export_steps_per_second": 2.991, "step": 480 }, { "epoch": 2.87, "learning_rate": 2.818003913894325e-07, "loss": 0.0111, "step": 490 }, { "epoch": 2.87, "eval_oasst_export_accuracy": 0.47351769192618814, "eval_oasst_export_loss": 4.484375, "eval_oasst_export_runtime": 6.0271, "eval_oasst_export_samples_per_second": 181.513, "eval_oasst_export_steps_per_second": 2.986, "step": 490 }, { "epoch": 2.92, "learning_rate": 1.6438356164383561e-07, "loss": 0.0125, "step": 500 }, { "epoch": 2.92, "eval_oasst_export_accuracy": 0.47608297846869496, "eval_oasst_export_loss": 4.484375, "eval_oasst_export_runtime": 6.026, "eval_oasst_export_samples_per_second": 181.546, "eval_oasst_export_steps_per_second": 2.987, "step": 500 }, { "epoch": 2.98, "learning_rate": 4.6966731898238746e-08, "loss": 0.0124, "step": 510 }, { "epoch": 2.98, "eval_oasst_export_accuracy": 0.475227882954526, "eval_oasst_export_loss": 4.4765625, "eval_oasst_export_runtime": 6.0338, "eval_oasst_export_samples_per_second": 181.313, "eval_oasst_export_steps_per_second": 2.983, "step": 510 } ], "max_steps": 513, "num_train_epochs": 3, "total_flos": 3.6185460841979576e+18, "trial_name": null, "trial_params": null }