{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 22.986753463745117, "learning_rate": 0.00198, "loss": 1.7003, "step": 10 }, { "epoch": 0.0, "grad_norm": 8.073963165283203, "learning_rate": 0.00196, "loss": 1.9785, "step": 20 }, { "epoch": 0.01, "grad_norm": 7.6197099685668945, "learning_rate": 0.0019399999999999999, "loss": 1.9353, "step": 30 }, { "epoch": 0.01, "grad_norm": 6.2731146812438965, "learning_rate": 0.00192, "loss": 1.5962, "step": 40 }, { "epoch": 0.01, "grad_norm": 5.425559043884277, "learning_rate": 0.0019, "loss": 1.389, "step": 50 }, { "epoch": 0.01, "grad_norm": 4.872774600982666, "learning_rate": 0.00188, "loss": 1.4156, "step": 60 }, { "epoch": 0.01, "grad_norm": 5.199490070343018, "learning_rate": 0.00186, "loss": 1.6583, "step": 70 }, { "epoch": 0.02, "grad_norm": 4.413191318511963, "learning_rate": 0.00184, "loss": 1.4334, "step": 80 }, { "epoch": 0.02, "grad_norm": 5.90674352645874, "learning_rate": 0.00182, "loss": 1.6046, "step": 90 }, { "epoch": 0.02, "grad_norm": 6.410930633544922, "learning_rate": 0.0018000000000000002, "loss": 1.5504, "step": 100 }, { "epoch": 0.02, "grad_norm": 3.529223680496216, "learning_rate": 0.0017800000000000001, "loss": 1.6463, "step": 110 }, { "epoch": 0.02, "grad_norm": 4.781284332275391, "learning_rate": 0.00176, "loss": 1.6136, "step": 120 }, { "epoch": 0.03, "grad_norm": 5.6382951736450195, "learning_rate": 0.00174, "loss": 1.5105, "step": 130 }, { "epoch": 0.03, "grad_norm": 4.392839431762695, "learning_rate": 0.00172, "loss": 1.6061, "step": 140 }, { "epoch": 0.03, "grad_norm": 3.9011926651000977, "learning_rate": 0.0017, "loss": 1.6188, "step": 150 }, { "epoch": 0.03, "grad_norm": 4.002920627593994, "learning_rate": 0.00168, "loss": 1.4177, "step": 160 }, { "epoch": 0.03, "grad_norm": 4.34838342666626, "learning_rate": 0.00166, "loss": 1.5689, "step": 170 }, { "epoch": 0.04, "grad_norm": 8.142854690551758, "learning_rate": 0.00164, "loss": 1.5804, "step": 180 }, { "epoch": 0.04, "grad_norm": 5.837989330291748, "learning_rate": 0.0016200000000000001, "loss": 1.5981, "step": 190 }, { "epoch": 0.04, "grad_norm": 2.33852219581604, "learning_rate": 0.0016, "loss": 1.4347, "step": 200 }, { "epoch": 0.04, "grad_norm": 3.069826602935791, "learning_rate": 0.00158, "loss": 1.4809, "step": 210 }, { "epoch": 0.04, "grad_norm": 2.71095609664917, "learning_rate": 0.0015600000000000002, "loss": 1.388, "step": 220 }, { "epoch": 0.05, "grad_norm": 4.450407981872559, "learning_rate": 0.0015400000000000001, "loss": 1.603, "step": 230 }, { "epoch": 0.05, "grad_norm": 4.475738048553467, "learning_rate": 0.00152, "loss": 1.5731, "step": 240 }, { "epoch": 0.05, "grad_norm": 3.051819086074829, "learning_rate": 0.0015, "loss": 1.5133, "step": 250 }, { "epoch": 0.05, "grad_norm": 5.014269828796387, "learning_rate": 0.00148, "loss": 1.5458, "step": 260 }, { "epoch": 0.05, "grad_norm": 2.558957815170288, "learning_rate": 0.00146, "loss": 1.4918, "step": 270 }, { "epoch": 0.06, "grad_norm": 4.6234660148620605, "learning_rate": 0.0014399999999999999, "loss": 1.5247, "step": 280 }, { "epoch": 0.06, "grad_norm": 2.9923095703125, "learning_rate": 0.00142, "loss": 1.6671, "step": 290 }, { "epoch": 0.06, "grad_norm": 7.883978366851807, "learning_rate": 0.0014, "loss": 1.5732, "step": 300 }, { "epoch": 0.06, "grad_norm": 3.3218066692352295, "learning_rate": 0.00138, 
"loss": 1.6297, "step": 310 }, { "epoch": 0.06, "grad_norm": 9.045559883117676, "learning_rate": 0.00136, "loss": 1.6581, "step": 320 }, { "epoch": 0.07, "grad_norm": 2.832301139831543, "learning_rate": 0.00134, "loss": 1.6966, "step": 330 }, { "epoch": 0.07, "grad_norm": 3.6719107627868652, "learning_rate": 0.00132, "loss": 1.5904, "step": 340 }, { "epoch": 0.07, "grad_norm": 5.4335455894470215, "learning_rate": 0.0013000000000000002, "loss": 1.6643, "step": 350 }, { "epoch": 0.07, "grad_norm": 3.2848339080810547, "learning_rate": 0.00128, "loss": 1.4174, "step": 360 }, { "epoch": 0.07, "grad_norm": 2.8206841945648193, "learning_rate": 0.00126, "loss": 1.7362, "step": 370 }, { "epoch": 0.08, "grad_norm": 3.389599084854126, "learning_rate": 0.00124, "loss": 1.6058, "step": 380 }, { "epoch": 0.08, "grad_norm": 4.887266159057617, "learning_rate": 0.00122, "loss": 1.4604, "step": 390 }, { "epoch": 0.08, "grad_norm": 2.9653384685516357, "learning_rate": 0.0012, "loss": 1.5152, "step": 400 }, { "epoch": 0.08, "grad_norm": 2.5362136363983154, "learning_rate": 0.00118, "loss": 1.469, "step": 410 }, { "epoch": 0.08, "grad_norm": 2.7318670749664307, "learning_rate": 0.00116, "loss": 1.4136, "step": 420 }, { "epoch": 0.09, "grad_norm": 3.6364078521728516, "learning_rate": 0.00114, "loss": 1.6937, "step": 430 }, { "epoch": 0.09, "grad_norm": 1.9428081512451172, "learning_rate": 0.0011200000000000001, "loss": 1.4825, "step": 440 }, { "epoch": 0.09, "grad_norm": 2.1813700199127197, "learning_rate": 0.0011, "loss": 1.4593, "step": 450 }, { "epoch": 0.09, "grad_norm": 4.612652778625488, "learning_rate": 0.00108, "loss": 1.389, "step": 460 }, { "epoch": 0.09, "grad_norm": 2.5145719051361084, "learning_rate": 0.0010600000000000002, "loss": 1.3896, "step": 470 }, { "epoch": 0.1, "grad_norm": 2.4980382919311523, "learning_rate": 0.0010400000000000001, "loss": 1.3725, "step": 480 }, { "epoch": 0.1, "grad_norm": 2.6995227336883545, "learning_rate": 0.00102, "loss": 1.4769, "step": 490 }, { "epoch": 0.1, "grad_norm": 2.1483154296875, "learning_rate": 0.001, "loss": 1.5983, "step": 500 } ], "logging_steps": 10, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 4.0788232684018176e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }