{ "best_metric": 1.3743919134140015, "best_model_checkpoint": "./llama3/21-04-24-Weni-WeniGPT-Agents-Llama3-1.0.8-SFT_Experiment with SFT and Llama3 and updates in requirements-2_max_steps-669_batch_2_2024-04-21_ppid_2917/checkpoint-360", "epoch": 1.610738255033557, "eval_steps": 30, "global_step": 360, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0447427293064877, "grad_norm": 0.3199349641799927, "learning_rate": 0.0001, "loss": 1.9361, "step": 10 }, { "epoch": 0.0894854586129754, "grad_norm": 0.9833651781082153, "learning_rate": 0.0002, "loss": 1.8397, "step": 20 }, { "epoch": 0.1342281879194631, "grad_norm": 1.0959680080413818, "learning_rate": 0.00019990511519199923, "loss": 1.7877, "step": 30 }, { "epoch": 0.1342281879194631, "eval_loss": 1.7081660032272339, "eval_runtime": 10.9642, "eval_samples_per_second": 4.195, "eval_steps_per_second": 4.195, "step": 30 }, { "epoch": 0.1789709172259508, "grad_norm": 1.2613288164138794, "learning_rate": 0.00019957734949126304, "loss": 1.7523, "step": 40 }, { "epoch": 0.22371364653243847, "grad_norm": 0.7975640892982483, "learning_rate": 0.0001990162991697884, "loss": 1.516, "step": 50 }, { "epoch": 0.2684563758389262, "grad_norm": 0.71113121509552, "learning_rate": 0.0001982232786270059, "loss": 1.3675, "step": 60 }, { "epoch": 0.2684563758389262, "eval_loss": 1.4538882970809937, "eval_runtime": 10.9625, "eval_samples_per_second": 4.196, "eval_steps_per_second": 4.196, "step": 60 }, { "epoch": 0.3131991051454139, "grad_norm": 0.8126013875007629, "learning_rate": 0.00019720014571008158, "loss": 1.4173, "step": 70 }, { "epoch": 0.3579418344519016, "grad_norm": 0.6399104595184326, "learning_rate": 0.00019594929736144976, "loss": 1.5249, "step": 80 }, { "epoch": 0.40268456375838924, "grad_norm": 0.7685297131538391, "learning_rate": 0.00019447366400338116, "loss": 1.3782, "step": 90 }, { "epoch": 0.40268456375838924, "eval_loss": 1.4235661029815674, "eval_runtime": 10.9616, "eval_samples_per_second": 4.196, "eval_steps_per_second": 4.196, "step": 90 }, { "epoch": 0.44742729306487694, "grad_norm": 0.7407508492469788, "learning_rate": 0.00019277670267274258, "loss": 1.3854, "step": 100 }, { "epoch": 0.49217002237136465, "grad_norm": 0.5381172895431519, "learning_rate": 0.0001908623889220311, "loss": 1.3216, "step": 110 }, { "epoch": 0.5369127516778524, "grad_norm": 0.8507822155952454, "learning_rate": 0.00018873520750565718, "loss": 1.3884, "step": 120 }, { "epoch": 0.5369127516778524, "eval_loss": 1.3937652111053467, "eval_runtime": 11.3392, "eval_samples_per_second": 4.057, "eval_steps_per_second": 4.057, "step": 120 }, { "epoch": 0.5816554809843401, "grad_norm": 0.7832561135292053, "learning_rate": 0.00018640014187329578, "loss": 1.3304, "step": 130 }, { "epoch": 0.6263982102908278, "grad_norm": 0.7797898054122925, "learning_rate": 0.00018386266249492057, "loss": 1.4979, "step": 140 }, { "epoch": 0.6711409395973155, "grad_norm": 0.8133324384689331, "learning_rate": 0.00018112871404487202, "loss": 1.3448, "step": 150 }, { "epoch": 0.6711409395973155, "eval_loss": 1.3898664712905884, "eval_runtime": 10.9657, "eval_samples_per_second": 4.195, "eval_steps_per_second": 4.195, "step": 150 }, { "epoch": 0.7158836689038032, "grad_norm": 0.5568630695343018, "learning_rate": 0.00017820470147498455, "loss": 1.3767, "step": 160 }, { "epoch": 0.7606263982102909, "grad_norm": 0.6193764209747314, "learning_rate": 0.00017509747500939928, "loss": 1.3777, "step": 170 }, { "epoch": 0.8053691275167785, "grad_norm": 0.8501898646354675, "learning_rate": 0.00017181431409621644, "loss": 1.3357, "step": 180 }, { "epoch": 0.8053691275167785, "eval_loss": 1.3869752883911133, "eval_runtime": 10.9836, "eval_samples_per_second": 4.188, "eval_steps_per_second": 4.188, "step": 180 }, { "epoch": 0.8501118568232662, "grad_norm": 0.6570385098457336, "learning_rate": 0.00016836291035358375, "loss": 1.3534, "step": 190 }, { "epoch": 0.8948545861297539, "grad_norm": 0.8461380004882812, "learning_rate": 0.0001647513495501749, "loss": 1.3976, "step": 200 }, { "epoch": 0.9395973154362416, "grad_norm": 0.9425082802772522, "learning_rate": 0.000160988092662272, "loss": 1.2788, "step": 210 }, { "epoch": 0.9395973154362416, "eval_loss": 1.3739150762557983, "eval_runtime": 10.967, "eval_samples_per_second": 4.194, "eval_steps_per_second": 4.194, "step": 210 }, { "epoch": 0.9843400447427293, "grad_norm": 1.4384957551956177, "learning_rate": 0.0001570819560518322, "loss": 1.3569, "step": 220 }, { "epoch": 1.029082774049217, "grad_norm": 0.665977418422699, "learning_rate": 0.00015304209081197425, "loss": 1.3895, "step": 230 }, { "epoch": 1.0738255033557047, "grad_norm": 0.736944317817688, "learning_rate": 0.0001488779613282751, "loss": 1.2396, "step": 240 }, { "epoch": 1.0738255033557047, "eval_loss": 1.3777718544006348, "eval_runtime": 10.9845, "eval_samples_per_second": 4.188, "eval_steps_per_second": 4.188, "step": 240 }, { "epoch": 1.1185682326621924, "grad_norm": 0.5746013522148132, "learning_rate": 0.00014459932310610093, "loss": 1.3425, "step": 250 }, { "epoch": 1.1633109619686801, "grad_norm": 0.8226341009140015, "learning_rate": 0.00014021619991591794, "loss": 1.2856, "step": 260 }, { "epoch": 1.2080536912751678, "grad_norm": 1.051847219467163, "learning_rate": 0.00013573886031012584, "loss": 1.2949, "step": 270 }, { "epoch": 1.2080536912751678, "eval_loss": 1.3808656930923462, "eval_runtime": 10.966, "eval_samples_per_second": 4.195, "eval_steps_per_second": 4.195, "step": 270 }, { "epoch": 1.2527964205816555, "grad_norm": 0.6205160021781921, "learning_rate": 0.00013117779356642872, "loss": 1.2785, "step": 280 }, { "epoch": 1.2975391498881432, "grad_norm": 1.1361136436462402, "learning_rate": 0.00012654368511410245, "loss": 1.2825, "step": 290 }, { "epoch": 1.342281879194631, "grad_norm": 0.8586457967758179, "learning_rate": 0.00012184739150072821, "loss": 1.337, "step": 300 }, { "epoch": 1.342281879194631, "eval_loss": 1.3791558742523193, "eval_runtime": 12.241, "eval_samples_per_second": 3.758, "eval_steps_per_second": 3.758, "step": 300 }, { "epoch": 1.3870246085011186, "grad_norm": 1.0672543048858643, "learning_rate": 0.00011709991495803915, "loss": 1.1912, "step": 310 }, { "epoch": 1.4317673378076063, "grad_norm": 0.8796883821487427, "learning_rate": 0.0001123123776264656, "loss": 1.1901, "step": 320 }, { "epoch": 1.476510067114094, "grad_norm": 0.9798341989517212, "learning_rate": 0.00010749599549876472, "loss": 1.3266, "step": 330 }, { "epoch": 1.476510067114094, "eval_loss": 1.3774683475494385, "eval_runtime": 10.9641, "eval_samples_per_second": 4.196, "eval_steps_per_second": 4.196, "step": 330 }, { "epoch": 1.5212527964205815, "grad_norm": 1.070554494857788, "learning_rate": 0.00010266205214377748, "loss": 1.255, "step": 340 }, { "epoch": 1.5659955257270695, "grad_norm": 0.8746442794799805, "learning_rate": 9.782187227187231e-05, "loss": 1.1981, "step": 350 }, { "epoch": 1.610738255033557, "grad_norm": 1.1643601655960083, "learning_rate": 9.298679520400412e-05, "loss": 1.2735, "step": 360 }, { "epoch": 1.610738255033557, "eval_loss": 1.3743919134140015, "eval_runtime": 10.9721, "eval_samples_per_second": 4.192, "eval_steps_per_second": 4.192, "step": 360 } ], "logging_steps": 10, "max_steps": 669, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 90, "total_flos": 7.286971354177536e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }