{ "best_metric": 0.4079853296279907, "best_model_checkpoint": "./mistral/23-02-24-Weni-ZeroShot-3.3.3-Mistral-7b-Multilanguage-1-epoch-3.2.0_Zeroshot-2_max_steps-201_batch_128_2024-02-23_ppid_2273/checkpoint-180", "epoch": 0.8921933085501859, "eval_steps": 20, "global_step": 180, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1, "grad_norm": 1.0583807229995728, "learning_rate": 0.0002, "loss": 1.5561, "step": 20 }, { "epoch": 0.1, "eval_loss": 0.8398289084434509, "eval_runtime": 365.9464, "eval_samples_per_second": 7.834, "eval_steps_per_second": 0.246, "step": 20 }, { "epoch": 0.2, "grad_norm": 0.2255832850933075, "learning_rate": 0.0001940350472628637, "loss": 0.594, "step": 40 }, { "epoch": 0.2, "eval_loss": 0.5098645687103271, "eval_runtime": 365.932, "eval_samples_per_second": 7.835, "eval_steps_per_second": 0.246, "step": 40 }, { "epoch": 0.3, "grad_norm": 0.188425675034523, "learning_rate": 0.0001779507116848976, "loss": 0.4849, "step": 60 }, { "epoch": 0.3, "eval_loss": 0.46610626578330994, "eval_runtime": 365.7636, "eval_samples_per_second": 7.838, "eval_steps_per_second": 0.246, "step": 60 }, { "epoch": 0.4, "grad_norm": 0.24421139061450958, "learning_rate": 0.00015199063052725745, "loss": 0.4516, "step": 80 }, { "epoch": 0.4, "eval_loss": 0.4327794909477234, "eval_runtime": 366.1258, "eval_samples_per_second": 7.831, "eval_steps_per_second": 0.246, "step": 80 }, { "epoch": 0.5, "grad_norm": 0.1821870654821396, "learning_rate": 0.00011982811629223709, "loss": 0.4237, "step": 100 }, { "epoch": 0.5, "eval_loss": 0.41923952102661133, "eval_runtime": 366.5093, "eval_samples_per_second": 7.822, "eval_steps_per_second": 0.246, "step": 100 }, { "epoch": 0.59, "grad_norm": 0.16211040318012238, "learning_rate": 8.530012652622397e-05, "loss": 0.4235, "step": 120 }, { "epoch": 0.59, "eval_loss": 0.41423356533050537, "eval_runtime": 366.3967, "eval_samples_per_second": 7.825, "eval_steps_per_second": 0.246, "step": 120 }, { "epoch": 0.69, "grad_norm": 0.16519276797771454, "learning_rate": 5.2525817770470084e-05, "loss": 0.4128, "step": 140 }, { "epoch": 0.69, "eval_loss": 0.41103067994117737, "eval_runtime": 366.4036, "eval_samples_per_second": 7.825, "eval_steps_per_second": 0.246, "step": 140 }, { "epoch": 0.79, "grad_norm": 0.15887108445167542, "learning_rate": 2.5415134079383006e-05, "loss": 0.4152, "step": 160 }, { "epoch": 0.79, "eval_loss": 0.4090025722980499, "eval_runtime": 366.5464, "eval_samples_per_second": 7.822, "eval_steps_per_second": 0.246, "step": 160 }, { "epoch": 0.89, "grad_norm": 0.17298047244548798, "learning_rate": 7.202354390738608e-06, "loss": 0.4092, "step": 180 }, { "epoch": 0.89, "eval_loss": 0.4079853296279907, "eval_runtime": 366.7715, "eval_samples_per_second": 7.817, "eval_steps_per_second": 0.245, "step": 180 } ], "logging_steps": 20, "max_steps": 201, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, "total_flos": 8.112849538860974e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }