diff --git a/checkpoint-100/README.md b/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..69eced3e4fa0cffb8a4f749d70d2150b3b6038f0 --- /dev/null +++ b/checkpoint-100/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: NousResearch/Llama-2-13b-hf +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.7.1 \ No newline at end of file diff --git a/checkpoint-100/adapter_config.json b/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..089f20ebe8d421867e642bbf0b75c66dc24a7f5f --- /dev/null +++ b/checkpoint-100/adapter_config.json @@ -0,0 +1,31 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "NousResearch/Llama-2-13b-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "up_proj", + "k_proj", + "gate_proj", + "down_proj", + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-100/adapter_model.safetensors b/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..64e3cd63f208e194605296f1d969da7500e7453b --- /dev/null +++ b/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:818ad0a09302d01af4bee7c4a8ac756c0feb509535ce36961baf72c9357ef5a9 +size 2002857080 diff --git a/checkpoint-100/optimizer.pt b/checkpoint-100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b3bb3ceb2cb004f3819438900cdce5b9a336e58a --- /dev/null +++ b/checkpoint-100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45a521219e0dea101909f4601f465589ec1b358b47402a630069822388c42ad4 +size 1004004436 diff --git a/checkpoint-100/rng_state.pth b/checkpoint-100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1f8291cd6ce87668b786a72f3e93d072fbe54902 --- /dev/null +++ b/checkpoint-100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4c917636c7a58af68a29056522a757e9f9b99005b776641aa157c536967817d +size 14244 diff --git a/checkpoint-100/scheduler.pt b/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed673670b03c1024d5a09cd94b320e3f065ca995 --- /dev/null +++ b/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26a2852d2c1758fce5ccddfd0a0b1b7abc74840e40494a3f17bf7ca05b28653f +size 1064 diff --git a/checkpoint-100/trainer_state.json b/checkpoint-100/trainer_state.json new file mode 100644 index 
0000000000000000000000000000000000000000..74a7a24c1b5897065660ac9198fe6a18806f72b7 --- /dev/null +++ b/checkpoint-100/trainer_state.json @@ -0,0 +1,621 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.31189083820662766, + "eval_steps": 500, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5e-05, + "loss": 1.0506, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 0.0001, + "loss": 0.9988, + "step": 2 + }, + { + "epoch": 0.01, + "learning_rate": 0.00015000000000000001, + "loss": 0.9783, + "step": 3 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002, + "loss": 0.9849, + "step": 4 + }, + { + "epoch": 0.02, + "learning_rate": 0.00025, + "loss": 1.0159, + "step": 5 + }, + { + "epoch": 0.02, + "learning_rate": 0.00030000000000000003, + "loss": 0.9847, + "step": 6 + }, + { + "epoch": 0.02, + "learning_rate": 0.00034999999999999994, + "loss": 0.9101, + "step": 7 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 0.9445, + "step": 8 + }, + { + "epoch": 0.03, + "learning_rate": 0.00045, + "loss": 0.8578, + "step": 9 + }, + { + "epoch": 0.03, + "learning_rate": 0.0005, + "loss": 0.9356, + "step": 10 + }, + { + "epoch": 0.03, + "learning_rate": 0.0005499999999999999, + "loss": 0.8395, + "step": 11 + }, + { + "epoch": 0.04, + "learning_rate": 0.0006000000000000001, + "loss": 0.9002, + "step": 12 + }, + { + "epoch": 0.04, + "learning_rate": 0.00065, + "loss": 0.8955, + "step": 13 + }, + { + "epoch": 0.04, + "learning_rate": 0.0006499959204043461, + "loss": 0.902, + "step": 14 + }, + { + "epoch": 0.05, + "learning_rate": 0.0006499836817198032, + "loss": 0.8578, + "step": 15 + }, + { + "epoch": 0.05, + "learning_rate": 0.0006499632842536263, + "loss": 0.9005, + "step": 16 + }, + { + "epoch": 0.05, + "learning_rate": 0.0006499347285178979, + "loss": 0.8539, + "step": 17 + }, + { + "epoch": 0.06, + "learning_rate": 0.0006498980152295153, + "loss": 0.8595, + "step": 18 + }, + { + "epoch": 0.06, + "learning_rate": 0.0006498531453101735, + "loss": 0.8845, + "step": 19 + }, + { + "epoch": 0.06, + "learning_rate": 0.0006498001198863406, + "loss": 0.8924, + "step": 20 + }, + { + "epoch": 0.07, + "learning_rate": 0.000649738940289231, + "loss": 0.8365, + "step": 21 + }, + { + "epoch": 0.07, + "learning_rate": 0.0006496696080547707, + "loss": 0.8462, + "step": 22 + }, + { + "epoch": 0.07, + "learning_rate": 0.0006495921249235596, + "loss": 0.8528, + "step": 23 + }, + { + "epoch": 0.07, + "learning_rate": 0.0006495064928408277, + "loss": 0.8159, + "step": 24 + }, + { + "epoch": 0.08, + "learning_rate": 0.0006494127139563859, + "loss": 0.8245, + "step": 25 + }, + { + "epoch": 0.08, + "learning_rate": 0.000649310790624572, + "loss": 0.8081, + "step": 26 + }, + { + "epoch": 0.08, + "learning_rate": 0.0006492007254041924, + "loss": 0.8535, + "step": 27 + }, + { + "epoch": 0.09, + "learning_rate": 0.0006490825210584566, + "loss": 0.8162, + "step": 28 + }, + { + "epoch": 0.09, + "learning_rate": 0.0006489561805549089, + "loss": 0.8456, + "step": 29 + }, + { + "epoch": 0.09, + "learning_rate": 0.0006488217070653535, + "loss": 0.7799, + "step": 30 + }, + { + "epoch": 0.1, + "learning_rate": 0.0006486791039657748, + "loss": 0.8088, + "step": 31 + }, + { + "epoch": 0.1, + "learning_rate": 0.0006485283748362524, + "loss": 0.8683, + "step": 32 + }, + { + "epoch": 0.1, + "learning_rate": 0.0006483695234608723, + "loss": 0.8871, + "step": 33 + }, + { + 
"epoch": 0.11, + "learning_rate": 0.0006482025538276304, + "loss": 0.7711, + "step": 34 + }, + { + "epoch": 0.11, + "learning_rate": 0.0006480274701283335, + "loss": 0.7621, + "step": 35 + }, + { + "epoch": 0.11, + "learning_rate": 0.0006478442767584937, + "loss": 0.8243, + "step": 36 + }, + { + "epoch": 0.12, + "learning_rate": 0.0006476529783172177, + "loss": 0.8257, + "step": 37 + }, + { + "epoch": 0.12, + "learning_rate": 0.0006474535796070919, + "loss": 0.8141, + "step": 38 + }, + { + "epoch": 0.12, + "learning_rate": 0.0006472460856340619, + "loss": 0.8109, + "step": 39 + }, + { + "epoch": 0.12, + "learning_rate": 0.000647030501607306, + "loss": 0.7873, + "step": 40 + }, + { + "epoch": 0.13, + "learning_rate": 0.000646806832939105, + "loss": 0.7386, + "step": 41 + }, + { + "epoch": 0.13, + "learning_rate": 0.0006465750852447068, + "loss": 0.8636, + "step": 42 + }, + { + "epoch": 0.13, + "learning_rate": 0.0006463352643421846, + "loss": 0.7357, + "step": 43 + }, + { + "epoch": 0.14, + "learning_rate": 0.0006460873762522906, + "loss": 0.8142, + "step": 44 + }, + { + "epoch": 0.14, + "learning_rate": 0.0006458314271983063, + "loss": 0.7275, + "step": 45 + }, + { + "epoch": 0.14, + "learning_rate": 0.0006455674236058847, + "loss": 0.8029, + "step": 46 + }, + { + "epoch": 0.15, + "learning_rate": 0.00064529537210289, + "loss": 0.7901, + "step": 47 + }, + { + "epoch": 0.15, + "learning_rate": 0.0006450152795192307, + "loss": 0.7788, + "step": 48 + }, + { + "epoch": 0.15, + "learning_rate": 0.0006447271528866881, + "loss": 0.7621, + "step": 49 + }, + { + "epoch": 0.16, + "learning_rate": 0.0006444309994387402, + "loss": 0.7537, + "step": 50 + }, + { + "epoch": 0.16, + "learning_rate": 0.0006441268266103796, + "loss": 0.7917, + "step": 51 + }, + { + "epoch": 0.16, + "learning_rate": 0.0006438146420379274, + "loss": 0.8451, + "step": 52 + }, + { + "epoch": 0.17, + "learning_rate": 0.0006434944535588411, + "loss": 0.8369, + "step": 53 + }, + { + "epoch": 0.17, + "learning_rate": 0.0006431662692115173, + "loss": 0.7637, + "step": 54 + }, + { + "epoch": 0.17, + "learning_rate": 0.0006428300972350914, + "loss": 0.8365, + "step": 55 + }, + { + "epoch": 0.17, + "learning_rate": 0.0006424859460692295, + "loss": 0.7633, + "step": 56 + }, + { + "epoch": 0.18, + "learning_rate": 0.0006421338243539165, + "loss": 0.7718, + "step": 57 + }, + { + "epoch": 0.18, + "learning_rate": 0.0006417737409292403, + "loss": 0.7672, + "step": 58 + }, + { + "epoch": 0.18, + "learning_rate": 0.0006414057048351684, + "loss": 0.8107, + "step": 59 + }, + { + "epoch": 0.19, + "learning_rate": 0.0006410297253113221, + "loss": 0.7979, + "step": 60 + }, + { + "epoch": 0.19, + "learning_rate": 0.0006406458117967443, + "loss": 0.7634, + "step": 61 + }, + { + "epoch": 0.19, + "learning_rate": 0.0006402539739296618, + "loss": 0.7504, + "step": 62 + }, + { + "epoch": 0.2, + "learning_rate": 0.0006398542215472443, + "loss": 0.8082, + "step": 63 + }, + { + "epoch": 0.2, + "learning_rate": 0.0006394465646853571, + "loss": 0.8355, + "step": 64 + }, + { + "epoch": 0.2, + "learning_rate": 0.0006390310135783086, + "loss": 0.7458, + "step": 65 + }, + { + "epoch": 0.21, + "learning_rate": 0.0006386075786585944, + "loss": 0.7525, + "step": 66 + }, + { + "epoch": 0.21, + "learning_rate": 0.0006381762705566343, + "loss": 0.7464, + "step": 67 + }, + { + "epoch": 0.21, + "learning_rate": 0.0006377371001005063, + "loss": 0.78, + "step": 68 + }, + { + "epoch": 0.22, + "learning_rate": 0.0006372900783156745, + "loss": 0.7752, + "step": 69 + }, + { + 
"epoch": 0.22, + "learning_rate": 0.0006368352164247117, + "loss": 0.7299, + "step": 70 + }, + { + "epoch": 0.22, + "learning_rate": 0.0006363725258470184, + "loss": 0.7722, + "step": 71 + }, + { + "epoch": 0.22, + "learning_rate": 0.0006359020181985365, + "loss": 0.8236, + "step": 72 + }, + { + "epoch": 0.23, + "learning_rate": 0.0006354237052914561, + "loss": 0.7589, + "step": 73 + }, + { + "epoch": 0.23, + "learning_rate": 0.0006349375991339202, + "loss": 0.7948, + "step": 74 + }, + { + "epoch": 0.23, + "learning_rate": 0.0006344437119297233, + "loss": 0.7528, + "step": 75 + }, + { + "epoch": 0.24, + "learning_rate": 0.0006339420560780045, + "loss": 0.7842, + "step": 76 + }, + { + "epoch": 0.24, + "learning_rate": 0.0006334326441729361, + "loss": 0.7541, + "step": 77 + }, + { + "epoch": 0.24, + "learning_rate": 0.000632915489003408, + "loss": 0.7425, + "step": 78 + }, + { + "epoch": 0.25, + "learning_rate": 0.0006323906035527062, + "loss": 0.8168, + "step": 79 + }, + { + "epoch": 0.25, + "learning_rate": 0.0006318580009981871, + "loss": 0.8074, + "step": 80 + }, + { + "epoch": 0.25, + "learning_rate": 0.0006313176947109465, + "loss": 0.7679, + "step": 81 + }, + { + "epoch": 0.26, + "learning_rate": 0.0006307696982554838, + "loss": 0.7465, + "step": 82 + }, + { + "epoch": 0.26, + "learning_rate": 0.0006302140253893622, + "loss": 0.7073, + "step": 83 + }, + { + "epoch": 0.26, + "learning_rate": 0.0006296506900628619, + "loss": 0.7687, + "step": 84 + }, + { + "epoch": 0.27, + "learning_rate": 0.0006290797064186315, + "loss": 0.7578, + "step": 85 + }, + { + "epoch": 0.27, + "learning_rate": 0.0006285010887913319, + "loss": 0.7494, + "step": 86 + }, + { + "epoch": 0.27, + "learning_rate": 0.0006279148517072765, + "loss": 0.7326, + "step": 87 + }, + { + "epoch": 0.27, + "learning_rate": 0.000627321009884067, + "loss": 0.7603, + "step": 88 + }, + { + "epoch": 0.28, + "learning_rate": 0.0006267195782302236, + "loss": 0.8141, + "step": 89 + }, + { + "epoch": 0.28, + "learning_rate": 0.0006261105718448105, + "loss": 0.7542, + "step": 90 + }, + { + "epoch": 0.28, + "learning_rate": 0.0006254940060170575, + "loss": 0.7597, + "step": 91 + }, + { + "epoch": 0.29, + "learning_rate": 0.0006248698962259753, + "loss": 0.7332, + "step": 92 + }, + { + "epoch": 0.29, + "learning_rate": 0.0006242382581399676, + "loss": 0.7031, + "step": 93 + }, + { + "epoch": 0.29, + "learning_rate": 0.0006235991076164375, + "loss": 0.7258, + "step": 94 + }, + { + "epoch": 0.3, + "learning_rate": 0.0006229524607013892, + "loss": 0.7634, + "step": 95 + }, + { + "epoch": 0.3, + "learning_rate": 0.0006222983336290254, + "loss": 0.765, + "step": 96 + }, + { + "epoch": 0.3, + "learning_rate": 0.0006216367428213398, + "loss": 0.7246, + "step": 97 + }, + { + "epoch": 0.31, + "learning_rate": 0.0006209677048877046, + "loss": 0.7115, + "step": 98 + }, + { + "epoch": 0.31, + "learning_rate": 0.0006202912366244535, + "loss": 0.6748, + "step": 99 + }, + { + "epoch": 0.31, + "learning_rate": 0.0006196073550144604, + "loss": 0.6995, + "step": 100 + } + ], + "logging_steps": 1, + "max_steps": 640, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "total_flos": 2.593333107572736e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-100/training_args.bin b/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d220d7d4abfdc29d8393f3089d80acf2e32cf7d7 --- /dev/null +++ b/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:d688198fe54ccac5c0a98d11fb9e7224690ace94f7e483ba1d16db91cf33a5c4 +size 4664 diff --git a/checkpoint-150/README.md b/checkpoint-150/README.md new file mode 100644 index 0000000000000000000000000000000000000000..69eced3e4fa0cffb8a4f749d70d2150b3b6038f0 --- /dev/null +++ b/checkpoint-150/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: NousResearch/Llama-2-13b-hf +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.7.1 \ No newline at end of file diff --git a/checkpoint-150/adapter_config.json b/checkpoint-150/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..089f20ebe8d421867e642bbf0b75c66dc24a7f5f --- /dev/null +++ b/checkpoint-150/adapter_config.json @@ -0,0 +1,31 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "NousResearch/Llama-2-13b-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "up_proj", + "k_proj", + "gate_proj", + "down_proj", + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-150/adapter_model.safetensors b/checkpoint-150/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8eea9a4a16ff3731af832d129a4246e3c4266d51 --- /dev/null +++ b/checkpoint-150/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f30b0e7ca44dc587b2f2c4be5469e60ccdb54c6fc9005580666c6ba2dd99561b +size 2002857080 diff --git a/checkpoint-150/optimizer.pt b/checkpoint-150/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c45917e6e5ed173eff819f82d32278a35f1c1ac --- /dev/null +++ b/checkpoint-150/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5abde634bb65c717c1ce646e6405f01b8adb3bda70c2ebb1fb3de6bc7a88be02 +size 1004004436 diff --git a/checkpoint-150/rng_state.pth b/checkpoint-150/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1f8291cd6ce87668b786a72f3e93d072fbe54902 --- /dev/null +++ b/checkpoint-150/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4c917636c7a58af68a29056522a757e9f9b99005b776641aa157c536967817d +size 14244 diff --git a/checkpoint-150/scheduler.pt b/checkpoint-150/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..16e07de5761e3904cccee84a9c178cb0715a8359 --- /dev/null +++ b/checkpoint-150/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b56f89411c5f31f78bae170c9165fda470012498c331d2e04c2e63fe37732a9c +size 1064 diff --git a/checkpoint-150/trainer_state.json b/checkpoint-150/trainer_state.json new file mode 100644 index 
0000000000000000000000000000000000000000..1a4e6a327c19a15d41089f83785c4259d8c315c7 --- /dev/null +++ b/checkpoint-150/trainer_state.json @@ -0,0 +1,921 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4678362573099415, + "eval_steps": 500, + "global_step": 150, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5e-05, + "loss": 1.0506, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 0.0001, + "loss": 0.9988, + "step": 2 + }, + { + "epoch": 0.01, + "learning_rate": 0.00015000000000000001, + "loss": 0.9783, + "step": 3 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002, + "loss": 0.9849, + "step": 4 + }, + { + "epoch": 0.02, + "learning_rate": 0.00025, + "loss": 1.0159, + "step": 5 + }, + { + "epoch": 0.02, + "learning_rate": 0.00030000000000000003, + "loss": 0.9847, + "step": 6 + }, + { + "epoch": 0.02, + "learning_rate": 0.00034999999999999994, + "loss": 0.9101, + "step": 7 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 0.9445, + "step": 8 + }, + { + "epoch": 0.03, + "learning_rate": 0.00045, + "loss": 0.8578, + "step": 9 + }, + { + "epoch": 0.03, + "learning_rate": 0.0005, + "loss": 0.9356, + "step": 10 + }, + { + "epoch": 0.03, + "learning_rate": 0.0005499999999999999, + "loss": 0.8395, + "step": 11 + }, + { + "epoch": 0.04, + "learning_rate": 0.0006000000000000001, + "loss": 0.9002, + "step": 12 + }, + { + "epoch": 0.04, + "learning_rate": 0.00065, + "loss": 0.8955, + "step": 13 + }, + { + "epoch": 0.04, + "learning_rate": 0.0006499959204043461, + "loss": 0.902, + "step": 14 + }, + { + "epoch": 0.05, + "learning_rate": 0.0006499836817198032, + "loss": 0.8578, + "step": 15 + }, + { + "epoch": 0.05, + "learning_rate": 0.0006499632842536263, + "loss": 0.9005, + "step": 16 + }, + { + "epoch": 0.05, + "learning_rate": 0.0006499347285178979, + "loss": 0.8539, + "step": 17 + }, + { + "epoch": 0.06, + "learning_rate": 0.0006498980152295153, + "loss": 0.8595, + "step": 18 + }, + { + "epoch": 0.06, + "learning_rate": 0.0006498531453101735, + "loss": 0.8845, + "step": 19 + }, + { + "epoch": 0.06, + "learning_rate": 0.0006498001198863406, + "loss": 0.8924, + "step": 20 + }, + { + "epoch": 0.07, + "learning_rate": 0.000649738940289231, + "loss": 0.8365, + "step": 21 + }, + { + "epoch": 0.07, + "learning_rate": 0.0006496696080547707, + "loss": 0.8462, + "step": 22 + }, + { + "epoch": 0.07, + "learning_rate": 0.0006495921249235596, + "loss": 0.8528, + "step": 23 + }, + { + "epoch": 0.07, + "learning_rate": 0.0006495064928408277, + "loss": 0.8159, + "step": 24 + }, + { + "epoch": 0.08, + "learning_rate": 0.0006494127139563859, + "loss": 0.8245, + "step": 25 + }, + { + "epoch": 0.08, + "learning_rate": 0.000649310790624572, + "loss": 0.8081, + "step": 26 + }, + { + "epoch": 0.08, + "learning_rate": 0.0006492007254041924, + "loss": 0.8535, + "step": 27 + }, + { + "epoch": 0.09, + "learning_rate": 0.0006490825210584566, + "loss": 0.8162, + "step": 28 + }, + { + "epoch": 0.09, + "learning_rate": 0.0006489561805549089, + "loss": 0.8456, + "step": 29 + }, + { + "epoch": 0.09, + "learning_rate": 0.0006488217070653535, + "loss": 0.7799, + "step": 30 + }, + { + "epoch": 0.1, + "learning_rate": 0.0006486791039657748, + "loss": 0.8088, + "step": 31 + }, + { + "epoch": 0.1, + "learning_rate": 0.0006485283748362524, + "loss": 0.8683, + "step": 32 + }, + { + "epoch": 0.1, + "learning_rate": 0.0006483695234608723, + "loss": 0.8871, + "step": 33 + }, + { + 
"epoch": 0.11, + "learning_rate": 0.0006482025538276304, + "loss": 0.7711, + "step": 34 + }, + { + "epoch": 0.11, + "learning_rate": 0.0006480274701283335, + "loss": 0.7621, + "step": 35 + }, + { + "epoch": 0.11, + "learning_rate": 0.0006478442767584937, + "loss": 0.8243, + "step": 36 + }, + { + "epoch": 0.12, + "learning_rate": 0.0006476529783172177, + "loss": 0.8257, + "step": 37 + }, + { + "epoch": 0.12, + "learning_rate": 0.0006474535796070919, + "loss": 0.8141, + "step": 38 + }, + { + "epoch": 0.12, + "learning_rate": 0.0006472460856340619, + "loss": 0.8109, + "step": 39 + }, + { + "epoch": 0.12, + "learning_rate": 0.000647030501607306, + "loss": 0.7873, + "step": 40 + }, + { + "epoch": 0.13, + "learning_rate": 0.000646806832939105, + "loss": 0.7386, + "step": 41 + }, + { + "epoch": 0.13, + "learning_rate": 0.0006465750852447068, + "loss": 0.8636, + "step": 42 + }, + { + "epoch": 0.13, + "learning_rate": 0.0006463352643421846, + "loss": 0.7357, + "step": 43 + }, + { + "epoch": 0.14, + "learning_rate": 0.0006460873762522906, + "loss": 0.8142, + "step": 44 + }, + { + "epoch": 0.14, + "learning_rate": 0.0006458314271983063, + "loss": 0.7275, + "step": 45 + }, + { + "epoch": 0.14, + "learning_rate": 0.0006455674236058847, + "loss": 0.8029, + "step": 46 + }, + { + "epoch": 0.15, + "learning_rate": 0.00064529537210289, + "loss": 0.7901, + "step": 47 + }, + { + "epoch": 0.15, + "learning_rate": 0.0006450152795192307, + "loss": 0.7788, + "step": 48 + }, + { + "epoch": 0.15, + "learning_rate": 0.0006447271528866881, + "loss": 0.7621, + "step": 49 + }, + { + "epoch": 0.16, + "learning_rate": 0.0006444309994387402, + "loss": 0.7537, + "step": 50 + }, + { + "epoch": 0.16, + "learning_rate": 0.0006441268266103796, + "loss": 0.7917, + "step": 51 + }, + { + "epoch": 0.16, + "learning_rate": 0.0006438146420379274, + "loss": 0.8451, + "step": 52 + }, + { + "epoch": 0.17, + "learning_rate": 0.0006434944535588411, + "loss": 0.8369, + "step": 53 + }, + { + "epoch": 0.17, + "learning_rate": 0.0006431662692115173, + "loss": 0.7637, + "step": 54 + }, + { + "epoch": 0.17, + "learning_rate": 0.0006428300972350914, + "loss": 0.8365, + "step": 55 + }, + { + "epoch": 0.17, + "learning_rate": 0.0006424859460692295, + "loss": 0.7633, + "step": 56 + }, + { + "epoch": 0.18, + "learning_rate": 0.0006421338243539165, + "loss": 0.7718, + "step": 57 + }, + { + "epoch": 0.18, + "learning_rate": 0.0006417737409292403, + "loss": 0.7672, + "step": 58 + }, + { + "epoch": 0.18, + "learning_rate": 0.0006414057048351684, + "loss": 0.8107, + "step": 59 + }, + { + "epoch": 0.19, + "learning_rate": 0.0006410297253113221, + "loss": 0.7979, + "step": 60 + }, + { + "epoch": 0.19, + "learning_rate": 0.0006406458117967443, + "loss": 0.7634, + "step": 61 + }, + { + "epoch": 0.19, + "learning_rate": 0.0006402539739296618, + "loss": 0.7504, + "step": 62 + }, + { + "epoch": 0.2, + "learning_rate": 0.0006398542215472443, + "loss": 0.8082, + "step": 63 + }, + { + "epoch": 0.2, + "learning_rate": 0.0006394465646853571, + "loss": 0.8355, + "step": 64 + }, + { + "epoch": 0.2, + "learning_rate": 0.0006390310135783086, + "loss": 0.7458, + "step": 65 + }, + { + "epoch": 0.21, + "learning_rate": 0.0006386075786585944, + "loss": 0.7525, + "step": 66 + }, + { + "epoch": 0.21, + "learning_rate": 0.0006381762705566343, + "loss": 0.7464, + "step": 67 + }, + { + "epoch": 0.21, + "learning_rate": 0.0006377371001005063, + "loss": 0.78, + "step": 68 + }, + { + "epoch": 0.22, + "learning_rate": 0.0006372900783156745, + "loss": 0.7752, + "step": 69 + }, + { + 
"epoch": 0.22, + "learning_rate": 0.0006368352164247117, + "loss": 0.7299, + "step": 70 + }, + { + "epoch": 0.22, + "learning_rate": 0.0006363725258470184, + "loss": 0.7722, + "step": 71 + }, + { + "epoch": 0.22, + "learning_rate": 0.0006359020181985365, + "loss": 0.8236, + "step": 72 + }, + { + "epoch": 0.23, + "learning_rate": 0.0006354237052914561, + "loss": 0.7589, + "step": 73 + }, + { + "epoch": 0.23, + "learning_rate": 0.0006349375991339202, + "loss": 0.7948, + "step": 74 + }, + { + "epoch": 0.23, + "learning_rate": 0.0006344437119297233, + "loss": 0.7528, + "step": 75 + }, + { + "epoch": 0.24, + "learning_rate": 0.0006339420560780045, + "loss": 0.7842, + "step": 76 + }, + { + "epoch": 0.24, + "learning_rate": 0.0006334326441729361, + "loss": 0.7541, + "step": 77 + }, + { + "epoch": 0.24, + "learning_rate": 0.000632915489003408, + "loss": 0.7425, + "step": 78 + }, + { + "epoch": 0.25, + "learning_rate": 0.0006323906035527062, + "loss": 0.8168, + "step": 79 + }, + { + "epoch": 0.25, + "learning_rate": 0.0006318580009981871, + "loss": 0.8074, + "step": 80 + }, + { + "epoch": 0.25, + "learning_rate": 0.0006313176947109465, + "loss": 0.7679, + "step": 81 + }, + { + "epoch": 0.26, + "learning_rate": 0.0006307696982554838, + "loss": 0.7465, + "step": 82 + }, + { + "epoch": 0.26, + "learning_rate": 0.0006302140253893622, + "loss": 0.7073, + "step": 83 + }, + { + "epoch": 0.26, + "learning_rate": 0.0006296506900628619, + "loss": 0.7687, + "step": 84 + }, + { + "epoch": 0.27, + "learning_rate": 0.0006290797064186315, + "loss": 0.7578, + "step": 85 + }, + { + "epoch": 0.27, + "learning_rate": 0.0006285010887913319, + "loss": 0.7494, + "step": 86 + }, + { + "epoch": 0.27, + "learning_rate": 0.0006279148517072765, + "loss": 0.7326, + "step": 87 + }, + { + "epoch": 0.27, + "learning_rate": 0.000627321009884067, + "loss": 0.7603, + "step": 88 + }, + { + "epoch": 0.28, + "learning_rate": 0.0006267195782302236, + "loss": 0.8141, + "step": 89 + }, + { + "epoch": 0.28, + "learning_rate": 0.0006261105718448105, + "loss": 0.7542, + "step": 90 + }, + { + "epoch": 0.28, + "learning_rate": 0.0006254940060170575, + "loss": 0.7597, + "step": 91 + }, + { + "epoch": 0.29, + "learning_rate": 0.0006248698962259753, + "loss": 0.7332, + "step": 92 + }, + { + "epoch": 0.29, + "learning_rate": 0.0006242382581399676, + "loss": 0.7031, + "step": 93 + }, + { + "epoch": 0.29, + "learning_rate": 0.0006235991076164375, + "loss": 0.7258, + "step": 94 + }, + { + "epoch": 0.3, + "learning_rate": 0.0006229524607013892, + "loss": 0.7634, + "step": 95 + }, + { + "epoch": 0.3, + "learning_rate": 0.0006222983336290254, + "loss": 0.765, + "step": 96 + }, + { + "epoch": 0.3, + "learning_rate": 0.0006216367428213398, + "loss": 0.7246, + "step": 97 + }, + { + "epoch": 0.31, + "learning_rate": 0.0006209677048877046, + "loss": 0.7115, + "step": 98 + }, + { + "epoch": 0.31, + "learning_rate": 0.0006202912366244535, + "loss": 0.6748, + "step": 99 + }, + { + "epoch": 0.31, + "learning_rate": 0.0006196073550144604, + "loss": 0.6995, + "step": 100 + }, + { + "epoch": 0.32, + "learning_rate": 0.0006189160772267127, + "loss": 0.7764, + "step": 101 + }, + { + "epoch": 0.32, + "learning_rate": 0.00061821742061588, + "loss": 0.8628, + "step": 102 + }, + { + "epoch": 0.32, + "learning_rate": 0.0006175114027218794, + "loss": 0.7266, + "step": 103 + }, + { + "epoch": 0.32, + "learning_rate": 0.0006167980412694342, + "loss": 0.7557, + "step": 104 + }, + { + "epoch": 0.33, + "learning_rate": 0.0006160773541676288, + "loss": 0.7518, + "step": 105 + 
}, + { + "epoch": 0.33, + "learning_rate": 0.0006153493595094602, + "loss": 0.7589, + "step": 106 + }, + { + "epoch": 0.33, + "learning_rate": 0.000614614075571383, + "loss": 0.7506, + "step": 107 + }, + { + "epoch": 0.34, + "learning_rate": 0.0006138715208128501, + "loss": 0.6617, + "step": 108 + }, + { + "epoch": 0.34, + "learning_rate": 0.0006131217138758505, + "loss": 0.7396, + "step": 109 + }, + { + "epoch": 0.34, + "learning_rate": 0.0006123646735844401, + "loss": 0.7666, + "step": 110 + }, + { + "epoch": 0.35, + "learning_rate": 0.00061160041894427, + "loss": 0.7555, + "step": 111 + }, + { + "epoch": 0.35, + "learning_rate": 0.0006108289691421089, + "loss": 0.7301, + "step": 112 + }, + { + "epoch": 0.35, + "learning_rate": 0.0006100503435453614, + "loss": 0.7364, + "step": 113 + }, + { + "epoch": 0.36, + "learning_rate": 0.0006092645617015822, + "loss": 0.7461, + "step": 114 + }, + { + "epoch": 0.36, + "learning_rate": 0.0006084716433379844, + "loss": 0.8086, + "step": 115 + }, + { + "epoch": 0.36, + "learning_rate": 0.0006076716083609456, + "loss": 0.7577, + "step": 116 + }, + { + "epoch": 0.36, + "learning_rate": 0.0006068644768555068, + "loss": 0.7094, + "step": 117 + }, + { + "epoch": 0.37, + "learning_rate": 0.0006060502690848696, + "loss": 0.726, + "step": 118 + }, + { + "epoch": 0.37, + "learning_rate": 0.0006052290054898859, + "loss": 0.7243, + "step": 119 + }, + { + "epoch": 0.37, + "learning_rate": 0.0006044007066885458, + "loss": 0.7119, + "step": 120 + }, + { + "epoch": 0.38, + "learning_rate": 0.0006035653934754598, + "loss": 0.7049, + "step": 121 + }, + { + "epoch": 0.38, + "learning_rate": 0.0006027230868213366, + "loss": 0.7424, + "step": 122 + }, + { + "epoch": 0.38, + "learning_rate": 0.0006018738078724563, + "loss": 0.7271, + "step": 123 + }, + { + "epoch": 0.39, + "learning_rate": 0.0006010175779501405, + "loss": 0.7996, + "step": 124 + }, + { + "epoch": 0.39, + "learning_rate": 0.0006001544185502158, + "loss": 0.7468, + "step": 125 + }, + { + "epoch": 0.39, + "learning_rate": 0.0005992843513424754, + "loss": 0.7513, + "step": 126 + }, + { + "epoch": 0.4, + "learning_rate": 0.0005984073981701338, + "loss": 0.7461, + "step": 127 + }, + { + "epoch": 0.4, + "learning_rate": 0.0005975235810492794, + "loss": 0.6821, + "step": 128 + }, + { + "epoch": 0.4, + "learning_rate": 0.0005966329221683215, + "loss": 0.7314, + "step": 129 + }, + { + "epoch": 0.41, + "learning_rate": 0.0005957354438874327, + "loss": 0.714, + "step": 130 + }, + { + "epoch": 0.41, + "learning_rate": 0.0005948311687379884, + "loss": 0.7339, + "step": 131 + }, + { + "epoch": 0.41, + "learning_rate": 0.000593920119422001, + "loss": 0.7021, + "step": 132 + }, + { + "epoch": 0.41, + "learning_rate": 0.0005930023188115492, + "loss": 0.7228, + "step": 133 + }, + { + "epoch": 0.42, + "learning_rate": 0.0005920777899482046, + "loss": 0.7107, + "step": 134 + }, + { + "epoch": 0.42, + "learning_rate": 0.0005911465560424532, + "loss": 0.659, + "step": 135 + }, + { + "epoch": 0.42, + "learning_rate": 0.0005902086404731118, + "loss": 0.7028, + "step": 136 + }, + { + "epoch": 0.43, + "learning_rate": 0.0005892640667867423, + "loss": 0.7275, + "step": 137 + }, + { + "epoch": 0.43, + "learning_rate": 0.00058831285869706, + "loss": 0.6889, + "step": 138 + }, + { + "epoch": 0.43, + "learning_rate": 0.0005873550400843378, + "loss": 0.7891, + "step": 139 + }, + { + "epoch": 0.44, + "learning_rate": 0.0005863906349948074, + "loss": 0.7904, + "step": 140 + }, + { + "epoch": 0.44, + "learning_rate": 0.0005854196676400555, 
+ "loss": 0.6674, + "step": 141 + }, + { + "epoch": 0.44, + "learning_rate": 0.0005844421623964157, + "loss": 0.7352, + "step": 142 + }, + { + "epoch": 0.45, + "learning_rate": 0.0005834581438043563, + "loss": 0.6965, + "step": 143 + }, + { + "epoch": 0.45, + "learning_rate": 0.000582467636567865, + "loss": 0.7238, + "step": 144 + }, + { + "epoch": 0.45, + "learning_rate": 0.0005814706655538279, + "loss": 0.7064, + "step": 145 + }, + { + "epoch": 0.46, + "learning_rate": 0.0005804672557914059, + "loss": 0.6984, + "step": 146 + }, + { + "epoch": 0.46, + "learning_rate": 0.0005794574324714057, + "loss": 0.7594, + "step": 147 + }, + { + "epoch": 0.46, + "learning_rate": 0.0005784412209456479, + "loss": 0.6884, + "step": 148 + }, + { + "epoch": 0.46, + "learning_rate": 0.00057741864672633, + "loss": 0.7141, + "step": 149 + }, + { + "epoch": 0.47, + "learning_rate": 0.0005763897354853866, + "loss": 0.705, + "step": 150 + } + ], + "logging_steps": 1, + "max_steps": 640, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "total_flos": 3.889192088892211e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-150/training_args.bin b/checkpoint-150/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d220d7d4abfdc29d8393f3089d80acf2e32cf7d7 --- /dev/null +++ b/checkpoint-150/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d688198fe54ccac5c0a98d11fb9e7224690ace94f7e483ba1d16db91cf33a5c4 +size 4664 diff --git a/checkpoint-200/README.md b/checkpoint-200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..69eced3e4fa0cffb8a4f749d70d2150b3b6038f0 --- /dev/null +++ b/checkpoint-200/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: NousResearch/Llama-2-13b-hf +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.7.1 \ No newline at end of file diff --git a/checkpoint-200/adapter_config.json b/checkpoint-200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..089f20ebe8d421867e642bbf0b75c66dc24a7f5f --- /dev/null +++ b/checkpoint-200/adapter_config.json @@ -0,0 +1,31 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "NousResearch/Llama-2-13b-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "up_proj", + "k_proj", + "gate_proj", + "down_proj", + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-200/adapter_model.safetensors b/checkpoint-200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..55082174004dce573b05e5805a786bccc9a711c1 --- /dev/null +++ b/checkpoint-200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b26badf95db20e2caeb27a77972d8c0cb9c4f40b9b2553064f8c3060fe4fdfd9 +size 2002857080 diff --git a/checkpoint-200/optimizer.pt b/checkpoint-200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..95b181115536275c1f3f882a201e53376de13a63 --- /dev/null +++ b/checkpoint-200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:be3a4233a403d0327ee88407231c206d9ccb18dcaee59e577eeb310c10819925 +size 1004004436 diff --git a/checkpoint-200/rng_state.pth b/checkpoint-200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1f8291cd6ce87668b786a72f3e93d072fbe54902 --- /dev/null +++ b/checkpoint-200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4c917636c7a58af68a29056522a757e9f9b99005b776641aa157c536967817d +size 14244 diff --git a/checkpoint-200/scheduler.pt b/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d03722612c10723a6e3f3523a3162f5e08eb6b9 --- /dev/null +++ b/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b65d73ea334cd90562f43b83c049540453fcb1263915d214b8d4b51fda32e89a +size 1064 diff --git a/checkpoint-200/trainer_state.json b/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5bca92aba40b490876685e64aaff73fdb1583cd0 --- /dev/null +++ b/checkpoint-200/trainer_state.json @@ -0,0 +1,1221 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6237816764132553, + "eval_steps": 500, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5e-05, + "loss": 1.0506, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 0.0001, + "loss": 0.9988, + "step": 2 + }, + { + "epoch": 0.01, + "learning_rate": 0.00015000000000000001, + "loss": 0.9783, + "step": 3 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002, + "loss": 0.9849, + "step": 4 + }, + { + "epoch": 0.02, + "learning_rate": 0.00025, + "loss": 1.0159, + "step": 5 + }, + { + "epoch": 0.02, + "learning_rate": 0.00030000000000000003, + "loss": 0.9847, + "step": 6 + }, + { + "epoch": 0.02, + "learning_rate": 0.00034999999999999994, + "loss": 0.9101, + "step": 7 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 0.9445, + "step": 8 + }, + { + "epoch": 0.03, + "learning_rate": 0.00045, + "loss": 0.8578, + "step": 9 + }, + { + "epoch": 0.03, + "learning_rate": 0.0005, + "loss": 0.9356, + "step": 10 + }, + { + "epoch": 0.03, + "learning_rate": 0.0005499999999999999, + "loss": 0.8395, + "step": 11 + }, + { + "epoch": 0.04, + "learning_rate": 0.0006000000000000001, + "loss": 0.9002, + "step": 12 + }, + { + "epoch": 0.04, + "learning_rate": 0.00065, + "loss": 0.8955, + "step": 13 + }, + { + "epoch": 0.04, + "learning_rate": 0.0006499959204043461, + "loss": 0.902, + "step": 14 + }, + { + "epoch": 0.05, + "learning_rate": 0.0006499836817198032, + "loss": 0.8578, + "step": 15 + }, + { + "epoch": 0.05, + "learning_rate": 0.0006499632842536263, + "loss": 0.9005, + "step": 16 + }, + { + "epoch": 0.05, + "learning_rate": 0.0006499347285178979, + "loss": 0.8539, + "step": 17 + }, + { + "epoch": 0.06, + "learning_rate": 0.0006498980152295153, + "loss": 0.8595, + "step": 18 + }, + { + "epoch": 0.06, + "learning_rate": 0.0006498531453101735, + "loss": 0.8845, + "step": 19 + }, + { + "epoch": 0.06, + "learning_rate": 0.0006498001198863406, + "loss": 0.8924, + "step": 20 + }, + { + "epoch": 0.07, + "learning_rate": 0.000649738940289231, + "loss": 0.8365, + "step": 21 + }, + { + "epoch": 0.07, + "learning_rate": 0.0006496696080547707, + "loss": 0.8462, + "step": 22 + }, + { + "epoch": 0.07, + "learning_rate": 0.0006495921249235596, + "loss": 0.8528, + "step": 23 + }, + { + "epoch": 0.07, + "learning_rate": 
0.0006495064928408277, + "loss": 0.8159, + "step": 24 + }, + { + "epoch": 0.08, + "learning_rate": 0.0006494127139563859, + "loss": 0.8245, + "step": 25 + }, + { + "epoch": 0.08, + "learning_rate": 0.000649310790624572, + "loss": 0.8081, + "step": 26 + }, + { + "epoch": 0.08, + "learning_rate": 0.0006492007254041924, + "loss": 0.8535, + "step": 27 + }, + { + "epoch": 0.09, + "learning_rate": 0.0006490825210584566, + "loss": 0.8162, + "step": 28 + }, + { + "epoch": 0.09, + "learning_rate": 0.0006489561805549089, + "loss": 0.8456, + "step": 29 + }, + { + "epoch": 0.09, + "learning_rate": 0.0006488217070653535, + "loss": 0.7799, + "step": 30 + }, + { + "epoch": 0.1, + "learning_rate": 0.0006486791039657748, + "loss": 0.8088, + "step": 31 + }, + { + "epoch": 0.1, + "learning_rate": 0.0006485283748362524, + "loss": 0.8683, + "step": 32 + }, + { + "epoch": 0.1, + "learning_rate": 0.0006483695234608723, + "loss": 0.8871, + "step": 33 + }, + { + "epoch": 0.11, + "learning_rate": 0.0006482025538276304, + "loss": 0.7711, + "step": 34 + }, + { + "epoch": 0.11, + "learning_rate": 0.0006480274701283335, + "loss": 0.7621, + "step": 35 + }, + { + "epoch": 0.11, + "learning_rate": 0.0006478442767584937, + "loss": 0.8243, + "step": 36 + }, + { + "epoch": 0.12, + "learning_rate": 0.0006476529783172177, + "loss": 0.8257, + "step": 37 + }, + { + "epoch": 0.12, + "learning_rate": 0.0006474535796070919, + "loss": 0.8141, + "step": 38 + }, + { + "epoch": 0.12, + "learning_rate": 0.0006472460856340619, + "loss": 0.8109, + "step": 39 + }, + { + "epoch": 0.12, + "learning_rate": 0.000647030501607306, + "loss": 0.7873, + "step": 40 + }, + { + "epoch": 0.13, + "learning_rate": 0.000646806832939105, + "loss": 0.7386, + "step": 41 + }, + { + "epoch": 0.13, + "learning_rate": 0.0006465750852447068, + "loss": 0.8636, + "step": 42 + }, + { + "epoch": 0.13, + "learning_rate": 0.0006463352643421846, + "loss": 0.7357, + "step": 43 + }, + { + "epoch": 0.14, + "learning_rate": 0.0006460873762522906, + "loss": 0.8142, + "step": 44 + }, + { + "epoch": 0.14, + "learning_rate": 0.0006458314271983063, + "loss": 0.7275, + "step": 45 + }, + { + "epoch": 0.14, + "learning_rate": 0.0006455674236058847, + "loss": 0.8029, + "step": 46 + }, + { + "epoch": 0.15, + "learning_rate": 0.00064529537210289, + "loss": 0.7901, + "step": 47 + }, + { + "epoch": 0.15, + "learning_rate": 0.0006450152795192307, + "loss": 0.7788, + "step": 48 + }, + { + "epoch": 0.15, + "learning_rate": 0.0006447271528866881, + "loss": 0.7621, + "step": 49 + }, + { + "epoch": 0.16, + "learning_rate": 0.0006444309994387402, + "loss": 0.7537, + "step": 50 + }, + { + "epoch": 0.16, + "learning_rate": 0.0006441268266103796, + "loss": 0.7917, + "step": 51 + }, + { + "epoch": 0.16, + "learning_rate": 0.0006438146420379274, + "loss": 0.8451, + "step": 52 + }, + { + "epoch": 0.17, + "learning_rate": 0.0006434944535588411, + "loss": 0.8369, + "step": 53 + }, + { + "epoch": 0.17, + "learning_rate": 0.0006431662692115173, + "loss": 0.7637, + "step": 54 + }, + { + "epoch": 0.17, + "learning_rate": 0.0006428300972350914, + "loss": 0.8365, + "step": 55 + }, + { + "epoch": 0.17, + "learning_rate": 0.0006424859460692295, + "loss": 0.7633, + "step": 56 + }, + { + "epoch": 0.18, + "learning_rate": 0.0006421338243539165, + "loss": 0.7718, + "step": 57 + }, + { + "epoch": 0.18, + "learning_rate": 0.0006417737409292403, + "loss": 0.7672, + "step": 58 + }, + { + "epoch": 0.18, + "learning_rate": 0.0006414057048351684, + "loss": 0.8107, + "step": 59 + }, + { + "epoch": 0.19, + 
"learning_rate": 0.0006410297253113221, + "loss": 0.7979, + "step": 60 + }, + { + "epoch": 0.19, + "learning_rate": 0.0006406458117967443, + "loss": 0.7634, + "step": 61 + }, + { + "epoch": 0.19, + "learning_rate": 0.0006402539739296618, + "loss": 0.7504, + "step": 62 + }, + { + "epoch": 0.2, + "learning_rate": 0.0006398542215472443, + "loss": 0.8082, + "step": 63 + }, + { + "epoch": 0.2, + "learning_rate": 0.0006394465646853571, + "loss": 0.8355, + "step": 64 + }, + { + "epoch": 0.2, + "learning_rate": 0.0006390310135783086, + "loss": 0.7458, + "step": 65 + }, + { + "epoch": 0.21, + "learning_rate": 0.0006386075786585944, + "loss": 0.7525, + "step": 66 + }, + { + "epoch": 0.21, + "learning_rate": 0.0006381762705566343, + "loss": 0.7464, + "step": 67 + }, + { + "epoch": 0.21, + "learning_rate": 0.0006377371001005063, + "loss": 0.78, + "step": 68 + }, + { + "epoch": 0.22, + "learning_rate": 0.0006372900783156745, + "loss": 0.7752, + "step": 69 + }, + { + "epoch": 0.22, + "learning_rate": 0.0006368352164247117, + "loss": 0.7299, + "step": 70 + }, + { + "epoch": 0.22, + "learning_rate": 0.0006363725258470184, + "loss": 0.7722, + "step": 71 + }, + { + "epoch": 0.22, + "learning_rate": 0.0006359020181985365, + "loss": 0.8236, + "step": 72 + }, + { + "epoch": 0.23, + "learning_rate": 0.0006354237052914561, + "loss": 0.7589, + "step": 73 + }, + { + "epoch": 0.23, + "learning_rate": 0.0006349375991339202, + "loss": 0.7948, + "step": 74 + }, + { + "epoch": 0.23, + "learning_rate": 0.0006344437119297233, + "loss": 0.7528, + "step": 75 + }, + { + "epoch": 0.24, + "learning_rate": 0.0006339420560780045, + "loss": 0.7842, + "step": 76 + }, + { + "epoch": 0.24, + "learning_rate": 0.0006334326441729361, + "loss": 0.7541, + "step": 77 + }, + { + "epoch": 0.24, + "learning_rate": 0.000632915489003408, + "loss": 0.7425, + "step": 78 + }, + { + "epoch": 0.25, + "learning_rate": 0.0006323906035527062, + "loss": 0.8168, + "step": 79 + }, + { + "epoch": 0.25, + "learning_rate": 0.0006318580009981871, + "loss": 0.8074, + "step": 80 + }, + { + "epoch": 0.25, + "learning_rate": 0.0006313176947109465, + "loss": 0.7679, + "step": 81 + }, + { + "epoch": 0.26, + "learning_rate": 0.0006307696982554838, + "loss": 0.7465, + "step": 82 + }, + { + "epoch": 0.26, + "learning_rate": 0.0006302140253893622, + "loss": 0.7073, + "step": 83 + }, + { + "epoch": 0.26, + "learning_rate": 0.0006296506900628619, + "loss": 0.7687, + "step": 84 + }, + { + "epoch": 0.27, + "learning_rate": 0.0006290797064186315, + "loss": 0.7578, + "step": 85 + }, + { + "epoch": 0.27, + "learning_rate": 0.0006285010887913319, + "loss": 0.7494, + "step": 86 + }, + { + "epoch": 0.27, + "learning_rate": 0.0006279148517072765, + "loss": 0.7326, + "step": 87 + }, + { + "epoch": 0.27, + "learning_rate": 0.000627321009884067, + "loss": 0.7603, + "step": 88 + }, + { + "epoch": 0.28, + "learning_rate": 0.0006267195782302236, + "loss": 0.8141, + "step": 89 + }, + { + "epoch": 0.28, + "learning_rate": 0.0006261105718448105, + "loss": 0.7542, + "step": 90 + }, + { + "epoch": 0.28, + "learning_rate": 0.0006254940060170575, + "loss": 0.7597, + "step": 91 + }, + { + "epoch": 0.29, + "learning_rate": 0.0006248698962259753, + "loss": 0.7332, + "step": 92 + }, + { + "epoch": 0.29, + "learning_rate": 0.0006242382581399676, + "loss": 0.7031, + "step": 93 + }, + { + "epoch": 0.29, + "learning_rate": 0.0006235991076164375, + "loss": 0.7258, + "step": 94 + }, + { + "epoch": 0.3, + "learning_rate": 0.0006229524607013892, + "loss": 0.7634, + "step": 95 + }, + { + "epoch": 0.3, + 
"learning_rate": 0.0006222983336290254, + "loss": 0.765, + "step": 96 + }, + { + "epoch": 0.3, + "learning_rate": 0.0006216367428213398, + "loss": 0.7246, + "step": 97 + }, + { + "epoch": 0.31, + "learning_rate": 0.0006209677048877046, + "loss": 0.7115, + "step": 98 + }, + { + "epoch": 0.31, + "learning_rate": 0.0006202912366244535, + "loss": 0.6748, + "step": 99 + }, + { + "epoch": 0.31, + "learning_rate": 0.0006196073550144604, + "loss": 0.6995, + "step": 100 + }, + { + "epoch": 0.32, + "learning_rate": 0.0006189160772267127, + "loss": 0.7764, + "step": 101 + }, + { + "epoch": 0.32, + "learning_rate": 0.00061821742061588, + "loss": 0.8628, + "step": 102 + }, + { + "epoch": 0.32, + "learning_rate": 0.0006175114027218794, + "loss": 0.7266, + "step": 103 + }, + { + "epoch": 0.32, + "learning_rate": 0.0006167980412694342, + "loss": 0.7557, + "step": 104 + }, + { + "epoch": 0.33, + "learning_rate": 0.0006160773541676288, + "loss": 0.7518, + "step": 105 + }, + { + "epoch": 0.33, + "learning_rate": 0.0006153493595094602, + "loss": 0.7589, + "step": 106 + }, + { + "epoch": 0.33, + "learning_rate": 0.000614614075571383, + "loss": 0.7506, + "step": 107 + }, + { + "epoch": 0.34, + "learning_rate": 0.0006138715208128501, + "loss": 0.6617, + "step": 108 + }, + { + "epoch": 0.34, + "learning_rate": 0.0006131217138758505, + "loss": 0.7396, + "step": 109 + }, + { + "epoch": 0.34, + "learning_rate": 0.0006123646735844401, + "loss": 0.7666, + "step": 110 + }, + { + "epoch": 0.35, + "learning_rate": 0.00061160041894427, + "loss": 0.7555, + "step": 111 + }, + { + "epoch": 0.35, + "learning_rate": 0.0006108289691421089, + "loss": 0.7301, + "step": 112 + }, + { + "epoch": 0.35, + "learning_rate": 0.0006100503435453614, + "loss": 0.7364, + "step": 113 + }, + { + "epoch": 0.36, + "learning_rate": 0.0006092645617015822, + "loss": 0.7461, + "step": 114 + }, + { + "epoch": 0.36, + "learning_rate": 0.0006084716433379844, + "loss": 0.8086, + "step": 115 + }, + { + "epoch": 0.36, + "learning_rate": 0.0006076716083609456, + "loss": 0.7577, + "step": 116 + }, + { + "epoch": 0.36, + "learning_rate": 0.0006068644768555068, + "loss": 0.7094, + "step": 117 + }, + { + "epoch": 0.37, + "learning_rate": 0.0006060502690848696, + "loss": 0.726, + "step": 118 + }, + { + "epoch": 0.37, + "learning_rate": 0.0006052290054898859, + "loss": 0.7243, + "step": 119 + }, + { + "epoch": 0.37, + "learning_rate": 0.0006044007066885458, + "loss": 0.7119, + "step": 120 + }, + { + "epoch": 0.38, + "learning_rate": 0.0006035653934754598, + "loss": 0.7049, + "step": 121 + }, + { + "epoch": 0.38, + "learning_rate": 0.0006027230868213366, + "loss": 0.7424, + "step": 122 + }, + { + "epoch": 0.38, + "learning_rate": 0.0006018738078724563, + "loss": 0.7271, + "step": 123 + }, + { + "epoch": 0.39, + "learning_rate": 0.0006010175779501405, + "loss": 0.7996, + "step": 124 + }, + { + "epoch": 0.39, + "learning_rate": 0.0006001544185502158, + "loss": 0.7468, + "step": 125 + }, + { + "epoch": 0.39, + "learning_rate": 0.0005992843513424754, + "loss": 0.7513, + "step": 126 + }, + { + "epoch": 0.4, + "learning_rate": 0.0005984073981701338, + "loss": 0.7461, + "step": 127 + }, + { + "epoch": 0.4, + "learning_rate": 0.0005975235810492794, + "loss": 0.6821, + "step": 128 + }, + { + "epoch": 0.4, + "learning_rate": 0.0005966329221683215, + "loss": 0.7314, + "step": 129 + }, + { + "epoch": 0.41, + "learning_rate": 0.0005957354438874327, + "loss": 0.714, + "step": 130 + }, + { + "epoch": 0.41, + "learning_rate": 0.0005948311687379884, + "loss": 0.7339, + "step": 
131 + }, + { + "epoch": 0.41, + "learning_rate": 0.000593920119422001, + "loss": 0.7021, + "step": 132 + }, + { + "epoch": 0.41, + "learning_rate": 0.0005930023188115492, + "loss": 0.7228, + "step": 133 + }, + { + "epoch": 0.42, + "learning_rate": 0.0005920777899482046, + "loss": 0.7107, + "step": 134 + }, + { + "epoch": 0.42, + "learning_rate": 0.0005911465560424532, + "loss": 0.659, + "step": 135 + }, + { + "epoch": 0.42, + "learning_rate": 0.0005902086404731118, + "loss": 0.7028, + "step": 136 + }, + { + "epoch": 0.43, + "learning_rate": 0.0005892640667867423, + "loss": 0.7275, + "step": 137 + }, + { + "epoch": 0.43, + "learning_rate": 0.00058831285869706, + "loss": 0.6889, + "step": 138 + }, + { + "epoch": 0.43, + "learning_rate": 0.0005873550400843378, + "loss": 0.7891, + "step": 139 + }, + { + "epoch": 0.44, + "learning_rate": 0.0005863906349948074, + "loss": 0.7904, + "step": 140 + }, + { + "epoch": 0.44, + "learning_rate": 0.0005854196676400555, + "loss": 0.6674, + "step": 141 + }, + { + "epoch": 0.44, + "learning_rate": 0.0005844421623964157, + "loss": 0.7352, + "step": 142 + }, + { + "epoch": 0.45, + "learning_rate": 0.0005834581438043563, + "loss": 0.6965, + "step": 143 + }, + { + "epoch": 0.45, + "learning_rate": 0.000582467636567865, + "loss": 0.7238, + "step": 144 + }, + { + "epoch": 0.45, + "learning_rate": 0.0005814706655538279, + "loss": 0.7064, + "step": 145 + }, + { + "epoch": 0.46, + "learning_rate": 0.0005804672557914059, + "loss": 0.6984, + "step": 146 + }, + { + "epoch": 0.46, + "learning_rate": 0.0005794574324714057, + "loss": 0.7594, + "step": 147 + }, + { + "epoch": 0.46, + "learning_rate": 0.0005784412209456479, + "loss": 0.6884, + "step": 148 + }, + { + "epoch": 0.46, + "learning_rate": 0.00057741864672633, + "loss": 0.7141, + "step": 149 + }, + { + "epoch": 0.47, + "learning_rate": 0.0005763897354853866, + "loss": 0.705, + "step": 150 + }, + { + "epoch": 0.47, + "learning_rate": 0.0005753545130538441, + "loss": 0.7613, + "step": 151 + }, + { + "epoch": 0.47, + "learning_rate": 0.0005743130054211732, + "loss": 0.736, + "step": 152 + }, + { + "epoch": 0.48, + "learning_rate": 0.0005732652387346351, + "loss": 0.6814, + "step": 153 + }, + { + "epoch": 0.48, + "learning_rate": 0.0005722112392986265, + "loss": 0.7002, + "step": 154 + }, + { + "epoch": 0.48, + "learning_rate": 0.0005711510335740182, + "loss": 0.7023, + "step": 155 + }, + { + "epoch": 0.49, + "learning_rate": 0.0005700846481774913, + "loss": 0.7617, + "step": 156 + }, + { + "epoch": 0.49, + "learning_rate": 0.0005690121098808687, + "loss": 0.7079, + "step": 157 + }, + { + "epoch": 0.49, + "learning_rate": 0.0005679334456104429, + "loss": 0.7614, + "step": 158 + }, + { + "epoch": 0.5, + "learning_rate": 0.000566848682446301, + "loss": 0.6786, + "step": 159 + }, + { + "epoch": 0.5, + "learning_rate": 0.0005657578476216432, + "loss": 0.6773, + "step": 160 + }, + { + "epoch": 0.5, + "learning_rate": 0.0005646609685221003, + "loss": 0.7085, + "step": 161 + }, + { + "epoch": 0.51, + "learning_rate": 0.0005635580726850462, + "loss": 0.7167, + "step": 162 + }, + { + "epoch": 0.51, + "learning_rate": 0.0005624491877989055, + "loss": 0.7192, + "step": 163 + }, + { + "epoch": 0.51, + "learning_rate": 0.0005613343417024599, + "loss": 0.6761, + "step": 164 + }, + { + "epoch": 0.51, + "learning_rate": 0.0005602135623841478, + "loss": 0.7508, + "step": 165 + }, + { + "epoch": 0.52, + "learning_rate": 0.0005590868779813627, + "loss": 0.6978, + "step": 166 + }, + { + "epoch": 0.52, + "learning_rate": 
0.0005579543167797467, + "loss": 0.7459, + "step": 167 + }, + { + "epoch": 0.52, + "learning_rate": 0.0005568159072124794, + "loss": 0.7438, + "step": 168 + }, + { + "epoch": 0.53, + "learning_rate": 0.0005556716778595654, + "loss": 0.7073, + "step": 169 + }, + { + "epoch": 0.53, + "learning_rate": 0.0005545216574471164, + "loss": 0.6385, + "step": 170 + }, + { + "epoch": 0.53, + "learning_rate": 0.0005533658748466291, + "loss": 0.6993, + "step": 171 + }, + { + "epoch": 0.54, + "learning_rate": 0.0005522043590742615, + "loss": 0.7258, + "step": 172 + }, + { + "epoch": 0.54, + "learning_rate": 0.0005510371392901041, + "loss": 0.7405, + "step": 173 + }, + { + "epoch": 0.54, + "learning_rate": 0.0005498642447974479, + "loss": 0.7525, + "step": 174 + }, + { + "epoch": 0.55, + "learning_rate": 0.0005486857050420481, + "loss": 0.6639, + "step": 175 + }, + { + "epoch": 0.55, + "learning_rate": 0.0005475015496113861, + "loss": 0.7415, + "step": 176 + }, + { + "epoch": 0.55, + "learning_rate": 0.0005463118082339253, + "loss": 0.7816, + "step": 177 + }, + { + "epoch": 0.56, + "learning_rate": 0.0005451165107783659, + "loss": 0.711, + "step": 178 + }, + { + "epoch": 0.56, + "learning_rate": 0.0005439156872528941, + "loss": 0.7138, + "step": 179 + }, + { + "epoch": 0.56, + "learning_rate": 0.0005427093678044299, + "loss": 0.7069, + "step": 180 + }, + { + "epoch": 0.56, + "learning_rate": 0.0005414975827178688, + "loss": 0.7553, + "step": 181 + }, + { + "epoch": 0.57, + "learning_rate": 0.000540280362415323, + "loss": 0.7045, + "step": 182 + }, + { + "epoch": 0.57, + "learning_rate": 0.0005390577374553561, + "loss": 0.7011, + "step": 183 + }, + { + "epoch": 0.57, + "learning_rate": 0.0005378297385322177, + "loss": 0.7441, + "step": 184 + }, + { + "epoch": 0.58, + "learning_rate": 0.0005365963964750707, + "loss": 0.6797, + "step": 185 + }, + { + "epoch": 0.58, + "learning_rate": 0.0005353577422472196, + "loss": 0.6901, + "step": 186 + }, + { + "epoch": 0.58, + "learning_rate": 0.0005341138069453313, + "loss": 0.7136, + "step": 187 + }, + { + "epoch": 0.59, + "learning_rate": 0.0005328646217986553, + "loss": 0.7459, + "step": 188 + }, + { + "epoch": 0.59, + "learning_rate": 0.0005316102181682396, + "loss": 0.7064, + "step": 189 + }, + { + "epoch": 0.59, + "learning_rate": 0.0005303506275461433, + "loss": 0.6705, + "step": 190 + }, + { + "epoch": 0.6, + "learning_rate": 0.0005290858815546459, + "loss": 0.7008, + "step": 191 + }, + { + "epoch": 0.6, + "learning_rate": 0.0005278160119454536, + "loss": 0.7538, + "step": 192 + }, + { + "epoch": 0.6, + "learning_rate": 0.0005265410505989021, + "loss": 0.7726, + "step": 193 + }, + { + "epoch": 0.61, + "learning_rate": 0.000525261029523156, + "loss": 0.7532, + "step": 194 + }, + { + "epoch": 0.61, + "learning_rate": 0.0005239759808534055, + "loss": 0.6978, + "step": 195 + }, + { + "epoch": 0.61, + "learning_rate": 0.0005226859368510599, + "loss": 0.7182, + "step": 196 + }, + { + "epoch": 0.61, + "learning_rate": 0.0005213909299029368, + "loss": 0.6776, + "step": 197 + }, + { + "epoch": 0.62, + "learning_rate": 0.0005200909925204501, + "loss": 0.7447, + "step": 198 + }, + { + "epoch": 0.62, + "learning_rate": 0.0005187861573387928, + "loss": 0.7298, + "step": 199 + }, + { + "epoch": 0.62, + "learning_rate": 0.0005174764571161185, + "loss": 0.6833, + "step": 200 + } + ], + "logging_steps": 1, + "max_steps": 640, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "total_flos": 5.1873840573382656e+17, + "train_batch_size": 4, + 
"trial_name": null, + "trial_params": null +} diff --git a/checkpoint-200/training_args.bin b/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d220d7d4abfdc29d8393f3089d80acf2e32cf7d7 --- /dev/null +++ b/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d688198fe54ccac5c0a98d11fb9e7224690ace94f7e483ba1d16db91cf33a5c4 +size 4664 diff --git a/checkpoint-250/README.md b/checkpoint-250/README.md new file mode 100644 index 0000000000000000000000000000000000000000..69eced3e4fa0cffb8a4f749d70d2150b3b6038f0 --- /dev/null +++ b/checkpoint-250/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: NousResearch/Llama-2-13b-hf +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.7.1 \ No newline at end of file diff --git a/checkpoint-250/adapter_config.json b/checkpoint-250/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..089f20ebe8d421867e642bbf0b75c66dc24a7f5f --- /dev/null +++ b/checkpoint-250/adapter_config.json @@ -0,0 +1,31 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "NousResearch/Llama-2-13b-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "up_proj", + "k_proj", + "gate_proj", + "down_proj", + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-250/adapter_model.safetensors b/checkpoint-250/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1c482dd19c5cdc73a9addb0130ef8678db887db0 --- /dev/null +++ b/checkpoint-250/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b266f98b7032e85090b649c4016a474f977a2b7e4bd41b131d49b149a6f8ebf +size 2002857080 diff --git a/checkpoint-250/optimizer.pt b/checkpoint-250/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..deb536c6fe6bf66890178ac821d08ac7ce3d991b --- /dev/null +++ b/checkpoint-250/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:746320d3c6d753667e711f1a8c927057b1d73e27b50378852b05288fcfd12a82 +size 1004004436 diff --git a/checkpoint-250/rng_state.pth b/checkpoint-250/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1f8291cd6ce87668b786a72f3e93d072fbe54902 --- /dev/null +++ b/checkpoint-250/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4c917636c7a58af68a29056522a757e9f9b99005b776641aa157c536967817d +size 14244 diff --git a/checkpoint-250/scheduler.pt b/checkpoint-250/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c966e257259d703284c406639b8258b04816de76 --- /dev/null +++ b/checkpoint-250/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c43bba66eac938f38b29c075c8808a28bd8f08375d129a8f697b229d8c469183 +size 1064 diff --git a/checkpoint-250/trainer_state.json b/checkpoint-250/trainer_state.json new file mode 100644 index 
0000000000000000000000000000000000000000..2e26838b3088b0f02c054aba18cd749b7ee57516 --- /dev/null +++ b/checkpoint-250/trainer_state.json @@ -0,0 +1,1521 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.7797270955165692, + "eval_steps": 500, + "global_step": 250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5e-05, + "loss": 1.0506, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 0.0001, + "loss": 0.9988, + "step": 2 + }, + { + "epoch": 0.01, + "learning_rate": 0.00015000000000000001, + "loss": 0.9783, + "step": 3 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002, + "loss": 0.9849, + "step": 4 + }, + { + "epoch": 0.02, + "learning_rate": 0.00025, + "loss": 1.0159, + "step": 5 + }, + { + "epoch": 0.02, + "learning_rate": 0.00030000000000000003, + "loss": 0.9847, + "step": 6 + }, + { + "epoch": 0.02, + "learning_rate": 0.00034999999999999994, + "loss": 0.9101, + "step": 7 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 0.9445, + "step": 8 + }, + { + "epoch": 0.03, + "learning_rate": 0.00045, + "loss": 0.8578, + "step": 9 + }, + { + "epoch": 0.03, + "learning_rate": 0.0005, + "loss": 0.9356, + "step": 10 + }, + { + "epoch": 0.03, + "learning_rate": 0.0005499999999999999, + "loss": 0.8395, + "step": 11 + }, + { + "epoch": 0.04, + "learning_rate": 0.0006000000000000001, + "loss": 0.9002, + "step": 12 + }, + { + "epoch": 0.04, + "learning_rate": 0.00065, + "loss": 0.8955, + "step": 13 + }, + { + "epoch": 0.04, + "learning_rate": 0.0006499959204043461, + "loss": 0.902, + "step": 14 + }, + { + "epoch": 0.05, + "learning_rate": 0.0006499836817198032, + "loss": 0.8578, + "step": 15 + }, + { + "epoch": 0.05, + "learning_rate": 0.0006499632842536263, + "loss": 0.9005, + "step": 16 + }, + { + "epoch": 0.05, + "learning_rate": 0.0006499347285178979, + "loss": 0.8539, + "step": 17 + }, + { + "epoch": 0.06, + "learning_rate": 0.0006498980152295153, + "loss": 0.8595, + "step": 18 + }, + { + "epoch": 0.06, + "learning_rate": 0.0006498531453101735, + "loss": 0.8845, + "step": 19 + }, + { + "epoch": 0.06, + "learning_rate": 0.0006498001198863406, + "loss": 0.8924, + "step": 20 + }, + { + "epoch": 0.07, + "learning_rate": 0.000649738940289231, + "loss": 0.8365, + "step": 21 + }, + { + "epoch": 0.07, + "learning_rate": 0.0006496696080547707, + "loss": 0.8462, + "step": 22 + }, + { + "epoch": 0.07, + "learning_rate": 0.0006495921249235596, + "loss": 0.8528, + "step": 23 + }, + { + "epoch": 0.07, + "learning_rate": 0.0006495064928408277, + "loss": 0.8159, + "step": 24 + }, + { + "epoch": 0.08, + "learning_rate": 0.0006494127139563859, + "loss": 0.8245, + "step": 25 + }, + { + "epoch": 0.08, + "learning_rate": 0.000649310790624572, + "loss": 0.8081, + "step": 26 + }, + { + "epoch": 0.08, + "learning_rate": 0.0006492007254041924, + "loss": 0.8535, + "step": 27 + }, + { + "epoch": 0.09, + "learning_rate": 0.0006490825210584566, + "loss": 0.8162, + "step": 28 + }, + { + "epoch": 0.09, + "learning_rate": 0.0006489561805549089, + "loss": 0.8456, + "step": 29 + }, + { + "epoch": 0.09, + "learning_rate": 0.0006488217070653535, + "loss": 0.7799, + "step": 30 + }, + { + "epoch": 0.1, + "learning_rate": 0.0006486791039657748, + "loss": 0.8088, + "step": 31 + }, + { + "epoch": 0.1, + "learning_rate": 0.0006485283748362524, + "loss": 0.8683, + "step": 32 + }, + { + "epoch": 0.1, + "learning_rate": 0.0006483695234608723, + "loss": 0.8871, + "step": 33 + }, + { + 
"epoch": 0.11, + "learning_rate": 0.0006482025538276304, + "loss": 0.7711, + "step": 34 + }, + { + "epoch": 0.11, + "learning_rate": 0.0006480274701283335, + "loss": 0.7621, + "step": 35 + }, + { + "epoch": 0.11, + "learning_rate": 0.0006478442767584937, + "loss": 0.8243, + "step": 36 + }, + { + "epoch": 0.12, + "learning_rate": 0.0006476529783172177, + "loss": 0.8257, + "step": 37 + }, + { + "epoch": 0.12, + "learning_rate": 0.0006474535796070919, + "loss": 0.8141, + "step": 38 + }, + { + "epoch": 0.12, + "learning_rate": 0.0006472460856340619, + "loss": 0.8109, + "step": 39 + }, + { + "epoch": 0.12, + "learning_rate": 0.000647030501607306, + "loss": 0.7873, + "step": 40 + }, + { + "epoch": 0.13, + "learning_rate": 0.000646806832939105, + "loss": 0.7386, + "step": 41 + }, + { + "epoch": 0.13, + "learning_rate": 0.0006465750852447068, + "loss": 0.8636, + "step": 42 + }, + { + "epoch": 0.13, + "learning_rate": 0.0006463352643421846, + "loss": 0.7357, + "step": 43 + }, + { + "epoch": 0.14, + "learning_rate": 0.0006460873762522906, + "loss": 0.8142, + "step": 44 + }, + { + "epoch": 0.14, + "learning_rate": 0.0006458314271983063, + "loss": 0.7275, + "step": 45 + }, + { + "epoch": 0.14, + "learning_rate": 0.0006455674236058847, + "loss": 0.8029, + "step": 46 + }, + { + "epoch": 0.15, + "learning_rate": 0.00064529537210289, + "loss": 0.7901, + "step": 47 + }, + { + "epoch": 0.15, + "learning_rate": 0.0006450152795192307, + "loss": 0.7788, + "step": 48 + }, + { + "epoch": 0.15, + "learning_rate": 0.0006447271528866881, + "loss": 0.7621, + "step": 49 + }, + { + "epoch": 0.16, + "learning_rate": 0.0006444309994387402, + "loss": 0.7537, + "step": 50 + }, + { + "epoch": 0.16, + "learning_rate": 0.0006441268266103796, + "loss": 0.7917, + "step": 51 + }, + { + "epoch": 0.16, + "learning_rate": 0.0006438146420379274, + "loss": 0.8451, + "step": 52 + }, + { + "epoch": 0.17, + "learning_rate": 0.0006434944535588411, + "loss": 0.8369, + "step": 53 + }, + { + "epoch": 0.17, + "learning_rate": 0.0006431662692115173, + "loss": 0.7637, + "step": 54 + }, + { + "epoch": 0.17, + "learning_rate": 0.0006428300972350914, + "loss": 0.8365, + "step": 55 + }, + { + "epoch": 0.17, + "learning_rate": 0.0006424859460692295, + "loss": 0.7633, + "step": 56 + }, + { + "epoch": 0.18, + "learning_rate": 0.0006421338243539165, + "loss": 0.7718, + "step": 57 + }, + { + "epoch": 0.18, + "learning_rate": 0.0006417737409292403, + "loss": 0.7672, + "step": 58 + }, + { + "epoch": 0.18, + "learning_rate": 0.0006414057048351684, + "loss": 0.8107, + "step": 59 + }, + { + "epoch": 0.19, + "learning_rate": 0.0006410297253113221, + "loss": 0.7979, + "step": 60 + }, + { + "epoch": 0.19, + "learning_rate": 0.0006406458117967443, + "loss": 0.7634, + "step": 61 + }, + { + "epoch": 0.19, + "learning_rate": 0.0006402539739296618, + "loss": 0.7504, + "step": 62 + }, + { + "epoch": 0.2, + "learning_rate": 0.0006398542215472443, + "loss": 0.8082, + "step": 63 + }, + { + "epoch": 0.2, + "learning_rate": 0.0006394465646853571, + "loss": 0.8355, + "step": 64 + }, + { + "epoch": 0.2, + "learning_rate": 0.0006390310135783086, + "loss": 0.7458, + "step": 65 + }, + { + "epoch": 0.21, + "learning_rate": 0.0006386075786585944, + "loss": 0.7525, + "step": 66 + }, + { + "epoch": 0.21, + "learning_rate": 0.0006381762705566343, + "loss": 0.7464, + "step": 67 + }, + { + "epoch": 0.21, + "learning_rate": 0.0006377371001005063, + "loss": 0.78, + "step": 68 + }, + { + "epoch": 0.22, + "learning_rate": 0.0006372900783156745, + "loss": 0.7752, + "step": 69 + }, + { + 
"epoch": 0.22, + "learning_rate": 0.0006368352164247117, + "loss": 0.7299, + "step": 70 + }, + { + "epoch": 0.22, + "learning_rate": 0.0006363725258470184, + "loss": 0.7722, + "step": 71 + }, + { + "epoch": 0.22, + "learning_rate": 0.0006359020181985365, + "loss": 0.8236, + "step": 72 + }, + { + "epoch": 0.23, + "learning_rate": 0.0006354237052914561, + "loss": 0.7589, + "step": 73 + }, + { + "epoch": 0.23, + "learning_rate": 0.0006349375991339202, + "loss": 0.7948, + "step": 74 + }, + { + "epoch": 0.23, + "learning_rate": 0.0006344437119297233, + "loss": 0.7528, + "step": 75 + }, + { + "epoch": 0.24, + "learning_rate": 0.0006339420560780045, + "loss": 0.7842, + "step": 76 + }, + { + "epoch": 0.24, + "learning_rate": 0.0006334326441729361, + "loss": 0.7541, + "step": 77 + }, + { + "epoch": 0.24, + "learning_rate": 0.000632915489003408, + "loss": 0.7425, + "step": 78 + }, + { + "epoch": 0.25, + "learning_rate": 0.0006323906035527062, + "loss": 0.8168, + "step": 79 + }, + { + "epoch": 0.25, + "learning_rate": 0.0006318580009981871, + "loss": 0.8074, + "step": 80 + }, + { + "epoch": 0.25, + "learning_rate": 0.0006313176947109465, + "loss": 0.7679, + "step": 81 + }, + { + "epoch": 0.26, + "learning_rate": 0.0006307696982554838, + "loss": 0.7465, + "step": 82 + }, + { + "epoch": 0.26, + "learning_rate": 0.0006302140253893622, + "loss": 0.7073, + "step": 83 + }, + { + "epoch": 0.26, + "learning_rate": 0.0006296506900628619, + "loss": 0.7687, + "step": 84 + }, + { + "epoch": 0.27, + "learning_rate": 0.0006290797064186315, + "loss": 0.7578, + "step": 85 + }, + { + "epoch": 0.27, + "learning_rate": 0.0006285010887913319, + "loss": 0.7494, + "step": 86 + }, + { + "epoch": 0.27, + "learning_rate": 0.0006279148517072765, + "loss": 0.7326, + "step": 87 + }, + { + "epoch": 0.27, + "learning_rate": 0.000627321009884067, + "loss": 0.7603, + "step": 88 + }, + { + "epoch": 0.28, + "learning_rate": 0.0006267195782302236, + "loss": 0.8141, + "step": 89 + }, + { + "epoch": 0.28, + "learning_rate": 0.0006261105718448105, + "loss": 0.7542, + "step": 90 + }, + { + "epoch": 0.28, + "learning_rate": 0.0006254940060170575, + "loss": 0.7597, + "step": 91 + }, + { + "epoch": 0.29, + "learning_rate": 0.0006248698962259753, + "loss": 0.7332, + "step": 92 + }, + { + "epoch": 0.29, + "learning_rate": 0.0006242382581399676, + "loss": 0.7031, + "step": 93 + }, + { + "epoch": 0.29, + "learning_rate": 0.0006235991076164375, + "loss": 0.7258, + "step": 94 + }, + { + "epoch": 0.3, + "learning_rate": 0.0006229524607013892, + "loss": 0.7634, + "step": 95 + }, + { + "epoch": 0.3, + "learning_rate": 0.0006222983336290254, + "loss": 0.765, + "step": 96 + }, + { + "epoch": 0.3, + "learning_rate": 0.0006216367428213398, + "loss": 0.7246, + "step": 97 + }, + { + "epoch": 0.31, + "learning_rate": 0.0006209677048877046, + "loss": 0.7115, + "step": 98 + }, + { + "epoch": 0.31, + "learning_rate": 0.0006202912366244535, + "loss": 0.6748, + "step": 99 + }, + { + "epoch": 0.31, + "learning_rate": 0.0006196073550144604, + "loss": 0.6995, + "step": 100 + }, + { + "epoch": 0.32, + "learning_rate": 0.0006189160772267127, + "loss": 0.7764, + "step": 101 + }, + { + "epoch": 0.32, + "learning_rate": 0.00061821742061588, + "loss": 0.8628, + "step": 102 + }, + { + "epoch": 0.32, + "learning_rate": 0.0006175114027218794, + "loss": 0.7266, + "step": 103 + }, + { + "epoch": 0.32, + "learning_rate": 0.0006167980412694342, + "loss": 0.7557, + "step": 104 + }, + { + "epoch": 0.33, + "learning_rate": 0.0006160773541676288, + "loss": 0.7518, + "step": 105 + 
}, + { + "epoch": 0.33, + "learning_rate": 0.0006153493595094602, + "loss": 0.7589, + "step": 106 + }, + { + "epoch": 0.33, + "learning_rate": 0.000614614075571383, + "loss": 0.7506, + "step": 107 + }, + { + "epoch": 0.34, + "learning_rate": 0.0006138715208128501, + "loss": 0.6617, + "step": 108 + }, + { + "epoch": 0.34, + "learning_rate": 0.0006131217138758505, + "loss": 0.7396, + "step": 109 + }, + { + "epoch": 0.34, + "learning_rate": 0.0006123646735844401, + "loss": 0.7666, + "step": 110 + }, + { + "epoch": 0.35, + "learning_rate": 0.00061160041894427, + "loss": 0.7555, + "step": 111 + }, + { + "epoch": 0.35, + "learning_rate": 0.0006108289691421089, + "loss": 0.7301, + "step": 112 + }, + { + "epoch": 0.35, + "learning_rate": 0.0006100503435453614, + "loss": 0.7364, + "step": 113 + }, + { + "epoch": 0.36, + "learning_rate": 0.0006092645617015822, + "loss": 0.7461, + "step": 114 + }, + { + "epoch": 0.36, + "learning_rate": 0.0006084716433379844, + "loss": 0.8086, + "step": 115 + }, + { + "epoch": 0.36, + "learning_rate": 0.0006076716083609456, + "loss": 0.7577, + "step": 116 + }, + { + "epoch": 0.36, + "learning_rate": 0.0006068644768555068, + "loss": 0.7094, + "step": 117 + }, + { + "epoch": 0.37, + "learning_rate": 0.0006060502690848696, + "loss": 0.726, + "step": 118 + }, + { + "epoch": 0.37, + "learning_rate": 0.0006052290054898859, + "loss": 0.7243, + "step": 119 + }, + { + "epoch": 0.37, + "learning_rate": 0.0006044007066885458, + "loss": 0.7119, + "step": 120 + }, + { + "epoch": 0.38, + "learning_rate": 0.0006035653934754598, + "loss": 0.7049, + "step": 121 + }, + { + "epoch": 0.38, + "learning_rate": 0.0006027230868213366, + "loss": 0.7424, + "step": 122 + }, + { + "epoch": 0.38, + "learning_rate": 0.0006018738078724563, + "loss": 0.7271, + "step": 123 + }, + { + "epoch": 0.39, + "learning_rate": 0.0006010175779501405, + "loss": 0.7996, + "step": 124 + }, + { + "epoch": 0.39, + "learning_rate": 0.0006001544185502158, + "loss": 0.7468, + "step": 125 + }, + { + "epoch": 0.39, + "learning_rate": 0.0005992843513424754, + "loss": 0.7513, + "step": 126 + }, + { + "epoch": 0.4, + "learning_rate": 0.0005984073981701338, + "loss": 0.7461, + "step": 127 + }, + { + "epoch": 0.4, + "learning_rate": 0.0005975235810492794, + "loss": 0.6821, + "step": 128 + }, + { + "epoch": 0.4, + "learning_rate": 0.0005966329221683215, + "loss": 0.7314, + "step": 129 + }, + { + "epoch": 0.41, + "learning_rate": 0.0005957354438874327, + "loss": 0.714, + "step": 130 + }, + { + "epoch": 0.41, + "learning_rate": 0.0005948311687379884, + "loss": 0.7339, + "step": 131 + }, + { + "epoch": 0.41, + "learning_rate": 0.000593920119422001, + "loss": 0.7021, + "step": 132 + }, + { + "epoch": 0.41, + "learning_rate": 0.0005930023188115492, + "loss": 0.7228, + "step": 133 + }, + { + "epoch": 0.42, + "learning_rate": 0.0005920777899482046, + "loss": 0.7107, + "step": 134 + }, + { + "epoch": 0.42, + "learning_rate": 0.0005911465560424532, + "loss": 0.659, + "step": 135 + }, + { + "epoch": 0.42, + "learning_rate": 0.0005902086404731118, + "loss": 0.7028, + "step": 136 + }, + { + "epoch": 0.43, + "learning_rate": 0.0005892640667867423, + "loss": 0.7275, + "step": 137 + }, + { + "epoch": 0.43, + "learning_rate": 0.00058831285869706, + "loss": 0.6889, + "step": 138 + }, + { + "epoch": 0.43, + "learning_rate": 0.0005873550400843378, + "loss": 0.7891, + "step": 139 + }, + { + "epoch": 0.44, + "learning_rate": 0.0005863906349948074, + "loss": 0.7904, + "step": 140 + }, + { + "epoch": 0.44, + "learning_rate": 0.0005854196676400555, 
+ "loss": 0.6674, + "step": 141 + }, + { + "epoch": 0.44, + "learning_rate": 0.0005844421623964157, + "loss": 0.7352, + "step": 142 + }, + { + "epoch": 0.45, + "learning_rate": 0.0005834581438043563, + "loss": 0.6965, + "step": 143 + }, + { + "epoch": 0.45, + "learning_rate": 0.000582467636567865, + "loss": 0.7238, + "step": 144 + }, + { + "epoch": 0.45, + "learning_rate": 0.0005814706655538279, + "loss": 0.7064, + "step": 145 + }, + { + "epoch": 0.46, + "learning_rate": 0.0005804672557914059, + "loss": 0.6984, + "step": 146 + }, + { + "epoch": 0.46, + "learning_rate": 0.0005794574324714057, + "loss": 0.7594, + "step": 147 + }, + { + "epoch": 0.46, + "learning_rate": 0.0005784412209456479, + "loss": 0.6884, + "step": 148 + }, + { + "epoch": 0.46, + "learning_rate": 0.00057741864672633, + "loss": 0.7141, + "step": 149 + }, + { + "epoch": 0.47, + "learning_rate": 0.0005763897354853866, + "loss": 0.705, + "step": 150 + }, + { + "epoch": 0.47, + "learning_rate": 0.0005753545130538441, + "loss": 0.7613, + "step": 151 + }, + { + "epoch": 0.47, + "learning_rate": 0.0005743130054211732, + "loss": 0.736, + "step": 152 + }, + { + "epoch": 0.48, + "learning_rate": 0.0005732652387346351, + "loss": 0.6814, + "step": 153 + }, + { + "epoch": 0.48, + "learning_rate": 0.0005722112392986265, + "loss": 0.7002, + "step": 154 + }, + { + "epoch": 0.48, + "learning_rate": 0.0005711510335740182, + "loss": 0.7023, + "step": 155 + }, + { + "epoch": 0.49, + "learning_rate": 0.0005700846481774913, + "loss": 0.7617, + "step": 156 + }, + { + "epoch": 0.49, + "learning_rate": 0.0005690121098808687, + "loss": 0.7079, + "step": 157 + }, + { + "epoch": 0.49, + "learning_rate": 0.0005679334456104429, + "loss": 0.7614, + "step": 158 + }, + { + "epoch": 0.5, + "learning_rate": 0.000566848682446301, + "loss": 0.6786, + "step": 159 + }, + { + "epoch": 0.5, + "learning_rate": 0.0005657578476216432, + "loss": 0.6773, + "step": 160 + }, + { + "epoch": 0.5, + "learning_rate": 0.0005646609685221003, + "loss": 0.7085, + "step": 161 + }, + { + "epoch": 0.51, + "learning_rate": 0.0005635580726850462, + "loss": 0.7167, + "step": 162 + }, + { + "epoch": 0.51, + "learning_rate": 0.0005624491877989055, + "loss": 0.7192, + "step": 163 + }, + { + "epoch": 0.51, + "learning_rate": 0.0005613343417024599, + "loss": 0.6761, + "step": 164 + }, + { + "epoch": 0.51, + "learning_rate": 0.0005602135623841478, + "loss": 0.7508, + "step": 165 + }, + { + "epoch": 0.52, + "learning_rate": 0.0005590868779813627, + "loss": 0.6978, + "step": 166 + }, + { + "epoch": 0.52, + "learning_rate": 0.0005579543167797467, + "loss": 0.7459, + "step": 167 + }, + { + "epoch": 0.52, + "learning_rate": 0.0005568159072124794, + "loss": 0.7438, + "step": 168 + }, + { + "epoch": 0.53, + "learning_rate": 0.0005556716778595654, + "loss": 0.7073, + "step": 169 + }, + { + "epoch": 0.53, + "learning_rate": 0.0005545216574471164, + "loss": 0.6385, + "step": 170 + }, + { + "epoch": 0.53, + "learning_rate": 0.0005533658748466291, + "loss": 0.6993, + "step": 171 + }, + { + "epoch": 0.54, + "learning_rate": 0.0005522043590742615, + "loss": 0.7258, + "step": 172 + }, + { + "epoch": 0.54, + "learning_rate": 0.0005510371392901041, + "loss": 0.7405, + "step": 173 + }, + { + "epoch": 0.54, + "learning_rate": 0.0005498642447974479, + "loss": 0.7525, + "step": 174 + }, + { + "epoch": 0.55, + "learning_rate": 0.0005486857050420481, + "loss": 0.6639, + "step": 175 + }, + { + "epoch": 0.55, + "learning_rate": 0.0005475015496113861, + "loss": 0.7415, + "step": 176 + }, + { + "epoch": 0.55, + 
"learning_rate": 0.0005463118082339253, + "loss": 0.7816, + "step": 177 + }, + { + "epoch": 0.56, + "learning_rate": 0.0005451165107783659, + "loss": 0.711, + "step": 178 + }, + { + "epoch": 0.56, + "learning_rate": 0.0005439156872528941, + "loss": 0.7138, + "step": 179 + }, + { + "epoch": 0.56, + "learning_rate": 0.0005427093678044299, + "loss": 0.7069, + "step": 180 + }, + { + "epoch": 0.56, + "learning_rate": 0.0005414975827178688, + "loss": 0.7553, + "step": 181 + }, + { + "epoch": 0.57, + "learning_rate": 0.000540280362415323, + "loss": 0.7045, + "step": 182 + }, + { + "epoch": 0.57, + "learning_rate": 0.0005390577374553561, + "loss": 0.7011, + "step": 183 + }, + { + "epoch": 0.57, + "learning_rate": 0.0005378297385322177, + "loss": 0.7441, + "step": 184 + }, + { + "epoch": 0.58, + "learning_rate": 0.0005365963964750707, + "loss": 0.6797, + "step": 185 + }, + { + "epoch": 0.58, + "learning_rate": 0.0005353577422472196, + "loss": 0.6901, + "step": 186 + }, + { + "epoch": 0.58, + "learning_rate": 0.0005341138069453313, + "loss": 0.7136, + "step": 187 + }, + { + "epoch": 0.59, + "learning_rate": 0.0005328646217986553, + "loss": 0.7459, + "step": 188 + }, + { + "epoch": 0.59, + "learning_rate": 0.0005316102181682396, + "loss": 0.7064, + "step": 189 + }, + { + "epoch": 0.59, + "learning_rate": 0.0005303506275461433, + "loss": 0.6705, + "step": 190 + }, + { + "epoch": 0.6, + "learning_rate": 0.0005290858815546459, + "loss": 0.7008, + "step": 191 + }, + { + "epoch": 0.6, + "learning_rate": 0.0005278160119454536, + "loss": 0.7538, + "step": 192 + }, + { + "epoch": 0.6, + "learning_rate": 0.0005265410505989021, + "loss": 0.7726, + "step": 193 + }, + { + "epoch": 0.61, + "learning_rate": 0.000525261029523156, + "loss": 0.7532, + "step": 194 + }, + { + "epoch": 0.61, + "learning_rate": 0.0005239759808534055, + "loss": 0.6978, + "step": 195 + }, + { + "epoch": 0.61, + "learning_rate": 0.0005226859368510599, + "loss": 0.7182, + "step": 196 + }, + { + "epoch": 0.61, + "learning_rate": 0.0005213909299029368, + "loss": 0.6776, + "step": 197 + }, + { + "epoch": 0.62, + "learning_rate": 0.0005200909925204501, + "loss": 0.7447, + "step": 198 + }, + { + "epoch": 0.62, + "learning_rate": 0.0005187861573387928, + "loss": 0.7298, + "step": 199 + }, + { + "epoch": 0.62, + "learning_rate": 0.0005174764571161185, + "loss": 0.6833, + "step": 200 + }, + { + "epoch": 0.63, + "learning_rate": 0.0005161619247327185, + "loss": 0.7518, + "step": 201 + }, + { + "epoch": 0.63, + "learning_rate": 0.0005148425931901961, + "loss": 0.7429, + "step": 202 + }, + { + "epoch": 0.63, + "learning_rate": 0.0005135184956106394, + "loss": 0.763, + "step": 203 + }, + { + "epoch": 0.64, + "learning_rate": 0.000512189665235788, + "loss": 0.7682, + "step": 204 + }, + { + "epoch": 0.64, + "learning_rate": 0.0005108561354261996, + "loss": 0.7063, + "step": 205 + }, + { + "epoch": 0.64, + "learning_rate": 0.0005095179396604121, + "loss": 0.6956, + "step": 206 + }, + { + "epoch": 0.65, + "learning_rate": 0.0005081751115341034, + "loss": 0.7434, + "step": 207 + }, + { + "epoch": 0.65, + "learning_rate": 0.0005068276847592474, + "loss": 0.6673, + "step": 208 + }, + { + "epoch": 0.65, + "learning_rate": 0.0005054756931632682, + "loss": 0.6448, + "step": 209 + }, + { + "epoch": 0.65, + "learning_rate": 0.0005041191706881909, + "loss": 0.7095, + "step": 210 + }, + { + "epoch": 0.66, + "learning_rate": 0.0005027581513897888, + "loss": 0.673, + "step": 211 + }, + { + "epoch": 0.66, + "learning_rate": 0.000501392669436729, + "loss": 0.6363, + 
"step": 212 + }, + { + "epoch": 0.66, + "learning_rate": 0.0005000227591097145, + "loss": 0.6711, + "step": 213 + }, + { + "epoch": 0.67, + "learning_rate": 0.0004986484548006237, + "loss": 0.6375, + "step": 214 + }, + { + "epoch": 0.67, + "learning_rate": 0.0004972697910116468, + "loss": 0.7466, + "step": 215 + }, + { + "epoch": 0.67, + "learning_rate": 0.0004958868023544192, + "loss": 0.7147, + "step": 216 + }, + { + "epoch": 0.68, + "learning_rate": 0.0004944995235491534, + "loss": 0.714, + "step": 217 + }, + { + "epoch": 0.68, + "learning_rate": 0.0004931079894237669, + "loss": 0.7377, + "step": 218 + }, + { + "epoch": 0.68, + "learning_rate": 0.0004917122349130078, + "loss": 0.7087, + "step": 219 + }, + { + "epoch": 0.69, + "learning_rate": 0.000490312295057578, + "loss": 0.6716, + "step": 220 + }, + { + "epoch": 0.69, + "learning_rate": 0.0004889082050032529, + "loss": 0.7298, + "step": 221 + }, + { + "epoch": 0.69, + "learning_rate": 0.0004875, + "loss": 0.6557, + "step": 222 + }, + { + "epoch": 0.7, + "learning_rate": 0.0004860877154010932, + "loss": 0.7042, + "step": 223 + }, + { + "epoch": 0.7, + "learning_rate": 0.00048467138666222534, + "loss": 0.6617, + "step": 224 + }, + { + "epoch": 0.7, + "learning_rate": 0.00048325104934061853, + "loss": 0.7019, + "step": 225 + }, + { + "epoch": 0.7, + "learning_rate": 0.00048182673909413103, + "loss": 0.6756, + "step": 226 + }, + { + "epoch": 0.71, + "learning_rate": 0.00048039849168036205, + "loss": 0.709, + "step": 227 + }, + { + "epoch": 0.71, + "learning_rate": 0.00047896634295575434, + "loss": 0.7434, + "step": 228 + }, + { + "epoch": 0.71, + "learning_rate": 0.00047753032887469385, + "loss": 0.7533, + "step": 229 + }, + { + "epoch": 0.72, + "learning_rate": 0.0004760904854886072, + "loss": 0.7019, + "step": 230 + }, + { + "epoch": 0.72, + "learning_rate": 0.0004746468489450562, + "loss": 0.6852, + "step": 231 + }, + { + "epoch": 0.72, + "learning_rate": 0.0004731994554868307, + "loss": 0.7228, + "step": 232 + }, + { + "epoch": 0.73, + "learning_rate": 0.000471748341451039, + "loss": 0.7513, + "step": 233 + }, + { + "epoch": 0.73, + "learning_rate": 0.0004702935432681949, + "loss": 0.6896, + "step": 234 + }, + { + "epoch": 0.73, + "learning_rate": 0.0004688350974613038, + "loss": 0.6815, + "step": 235 + }, + { + "epoch": 0.74, + "learning_rate": 0.0004673730406449449, + "loss": 0.7682, + "step": 236 + }, + { + "epoch": 0.74, + "learning_rate": 0.00046590740952435323, + "loss": 0.7025, + "step": 237 + }, + { + "epoch": 0.74, + "learning_rate": 0.0004644382408944968, + "loss": 0.6662, + "step": 238 + }, + { + "epoch": 0.75, + "learning_rate": 0.00046296557163915395, + "loss": 0.7541, + "step": 239 + }, + { + "epoch": 0.75, + "learning_rate": 0.0004614894387299867, + "loss": 0.7336, + "step": 240 + }, + { + "epoch": 0.75, + "learning_rate": 0.0004600098792256131, + "loss": 0.6618, + "step": 241 + }, + { + "epoch": 0.75, + "learning_rate": 0.0004585269302706762, + "loss": 0.6729, + "step": 242 + }, + { + "epoch": 0.76, + "learning_rate": 0.0004570406290949121, + "loss": 0.7327, + "step": 243 + }, + { + "epoch": 0.76, + "learning_rate": 0.0004555510130122151, + "loss": 0.6778, + "step": 244 + }, + { + "epoch": 0.76, + "learning_rate": 0.0004540581194197008, + "loss": 0.6219, + "step": 245 + }, + { + "epoch": 0.77, + "learning_rate": 0.00045256198579676755, + "loss": 0.6984, + "step": 246 + }, + { + "epoch": 0.77, + "learning_rate": 0.000451062649704155, + "loss": 0.637, + "step": 247 + }, + { + "epoch": 0.77, + "learning_rate": 
0.000449560148783002, + "loss": 0.658, + "step": 248 + }, + { + "epoch": 0.78, + "learning_rate": 0.0004480545207539004, + "loss": 0.7305, + "step": 249 + }, + { + "epoch": 0.78, + "learning_rate": 0.0004465458034159491, + "loss": 0.6788, + "step": 250 + } + ], + "logging_steps": 1, + "max_steps": 640, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "total_flos": 6.485217104687923e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-250/training_args.bin b/checkpoint-250/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d220d7d4abfdc29d8393f3089d80acf2e32cf7d7 --- /dev/null +++ b/checkpoint-250/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d688198fe54ccac5c0a98d11fb9e7224690ace94f7e483ba1d16db91cf33a5c4 +size 4664 diff --git a/checkpoint-300/README.md b/checkpoint-300/README.md new file mode 100644 index 0000000000000000000000000000000000000000..69eced3e4fa0cffb8a4f749d70d2150b3b6038f0 --- /dev/null +++ b/checkpoint-300/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: NousResearch/Llama-2-13b-hf +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.7.1 \ No newline at end of file diff --git a/checkpoint-300/adapter_config.json b/checkpoint-300/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..089f20ebe8d421867e642bbf0b75c66dc24a7f5f --- /dev/null +++ b/checkpoint-300/adapter_config.json @@ -0,0 +1,31 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "NousResearch/Llama-2-13b-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "up_proj", + "k_proj", + "gate_proj", + "down_proj", + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-300/adapter_model.safetensors b/checkpoint-300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1a874fba9818e355aa2dd63c5be453cdb64269c2 --- /dev/null +++ b/checkpoint-300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2ba4889ddd334e8b80509f15d40f074b9dfaef2866a04f275eed814dcee579b +size 2002857080 diff --git a/checkpoint-300/optimizer.pt b/checkpoint-300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f68886daaf31bb8aa8e781276230c87448ce1e4 --- /dev/null +++ b/checkpoint-300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1105c7c916d0b57c89e82154788da9604d0e8ab6fbaff824e7ab9009a32fdb6 +size 1004005012 diff --git a/checkpoint-300/rng_state.pth b/checkpoint-300/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1f8291cd6ce87668b786a72f3e93d072fbe54902 --- /dev/null +++ b/checkpoint-300/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4c917636c7a58af68a29056522a757e9f9b99005b776641aa157c536967817d +size 14244 diff --git a/checkpoint-300/scheduler.pt b/checkpoint-300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b1a0a3fe2c0c200271eee23f759dfaf391c955b --- /dev/null +++ b/checkpoint-300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ec9001d99362d99da8638538a872bd8142681e341e7c9813a2a8768e2c11b81 +size 1064 diff --git a/checkpoint-300/trainer_state.json b/checkpoint-300/trainer_state.json new file mode 100644 index 
0000000000000000000000000000000000000000..0a38f4a560b1f6b1c35f33e596bb19b64d7c9515 --- /dev/null +++ b/checkpoint-300/trainer_state.json @@ -0,0 +1,1821 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.935672514619883, + "eval_steps": 500, + "global_step": 300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5e-05, + "loss": 1.0506, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 0.0001, + "loss": 0.9988, + "step": 2 + }, + { + "epoch": 0.01, + "learning_rate": 0.00015000000000000001, + "loss": 0.9783, + "step": 3 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002, + "loss": 0.9849, + "step": 4 + }, + { + "epoch": 0.02, + "learning_rate": 0.00025, + "loss": 1.0159, + "step": 5 + }, + { + "epoch": 0.02, + "learning_rate": 0.00030000000000000003, + "loss": 0.9847, + "step": 6 + }, + { + "epoch": 0.02, + "learning_rate": 0.00034999999999999994, + "loss": 0.9101, + "step": 7 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 0.9445, + "step": 8 + }, + { + "epoch": 0.03, + "learning_rate": 0.00045, + "loss": 0.8578, + "step": 9 + }, + { + "epoch": 0.03, + "learning_rate": 0.0005, + "loss": 0.9356, + "step": 10 + }, + { + "epoch": 0.03, + "learning_rate": 0.0005499999999999999, + "loss": 0.8395, + "step": 11 + }, + { + "epoch": 0.04, + "learning_rate": 0.0006000000000000001, + "loss": 0.9002, + "step": 12 + }, + { + "epoch": 0.04, + "learning_rate": 0.00065, + "loss": 0.8955, + "step": 13 + }, + { + "epoch": 0.04, + "learning_rate": 0.0006499959204043461, + "loss": 0.902, + "step": 14 + }, + { + "epoch": 0.05, + "learning_rate": 0.0006499836817198032, + "loss": 0.8578, + "step": 15 + }, + { + "epoch": 0.05, + "learning_rate": 0.0006499632842536263, + "loss": 0.9005, + "step": 16 + }, + { + "epoch": 0.05, + "learning_rate": 0.0006499347285178979, + "loss": 0.8539, + "step": 17 + }, + { + "epoch": 0.06, + "learning_rate": 0.0006498980152295153, + "loss": 0.8595, + "step": 18 + }, + { + "epoch": 0.06, + "learning_rate": 0.0006498531453101735, + "loss": 0.8845, + "step": 19 + }, + { + "epoch": 0.06, + "learning_rate": 0.0006498001198863406, + "loss": 0.8924, + "step": 20 + }, + { + "epoch": 0.07, + "learning_rate": 0.000649738940289231, + "loss": 0.8365, + "step": 21 + }, + { + "epoch": 0.07, + "learning_rate": 0.0006496696080547707, + "loss": 0.8462, + "step": 22 + }, + { + "epoch": 0.07, + "learning_rate": 0.0006495921249235596, + "loss": 0.8528, + "step": 23 + }, + { + "epoch": 0.07, + "learning_rate": 0.0006495064928408277, + "loss": 0.8159, + "step": 24 + }, + { + "epoch": 0.08, + "learning_rate": 0.0006494127139563859, + "loss": 0.8245, + "step": 25 + }, + { + "epoch": 0.08, + "learning_rate": 0.000649310790624572, + "loss": 0.8081, + "step": 26 + }, + { + "epoch": 0.08, + "learning_rate": 0.0006492007254041924, + "loss": 0.8535, + "step": 27 + }, + { + "epoch": 0.09, + "learning_rate": 0.0006490825210584566, + "loss": 0.8162, + "step": 28 + }, + { + "epoch": 0.09, + "learning_rate": 0.0006489561805549089, + "loss": 0.8456, + "step": 29 + }, + { + "epoch": 0.09, + "learning_rate": 0.0006488217070653535, + "loss": 0.7799, + "step": 30 + }, + { + "epoch": 0.1, + "learning_rate": 0.0006486791039657748, + "loss": 0.8088, + "step": 31 + }, + { + "epoch": 0.1, + "learning_rate": 0.0006485283748362524, + "loss": 0.8683, + "step": 32 + }, + { + "epoch": 0.1, + "learning_rate": 0.0006483695234608723, + "loss": 0.8871, + "step": 33 + }, + { + 
"epoch": 0.11, + "learning_rate": 0.0006482025538276304, + "loss": 0.7711, + "step": 34 + }, + { + "epoch": 0.11, + "learning_rate": 0.0006480274701283335, + "loss": 0.7621, + "step": 35 + }, + { + "epoch": 0.11, + "learning_rate": 0.0006478442767584937, + "loss": 0.8243, + "step": 36 + }, + { + "epoch": 0.12, + "learning_rate": 0.0006476529783172177, + "loss": 0.8257, + "step": 37 + }, + { + "epoch": 0.12, + "learning_rate": 0.0006474535796070919, + "loss": 0.8141, + "step": 38 + }, + { + "epoch": 0.12, + "learning_rate": 0.0006472460856340619, + "loss": 0.8109, + "step": 39 + }, + { + "epoch": 0.12, + "learning_rate": 0.000647030501607306, + "loss": 0.7873, + "step": 40 + }, + { + "epoch": 0.13, + "learning_rate": 0.000646806832939105, + "loss": 0.7386, + "step": 41 + }, + { + "epoch": 0.13, + "learning_rate": 0.0006465750852447068, + "loss": 0.8636, + "step": 42 + }, + { + "epoch": 0.13, + "learning_rate": 0.0006463352643421846, + "loss": 0.7357, + "step": 43 + }, + { + "epoch": 0.14, + "learning_rate": 0.0006460873762522906, + "loss": 0.8142, + "step": 44 + }, + { + "epoch": 0.14, + "learning_rate": 0.0006458314271983063, + "loss": 0.7275, + "step": 45 + }, + { + "epoch": 0.14, + "learning_rate": 0.0006455674236058847, + "loss": 0.8029, + "step": 46 + }, + { + "epoch": 0.15, + "learning_rate": 0.00064529537210289, + "loss": 0.7901, + "step": 47 + }, + { + "epoch": 0.15, + "learning_rate": 0.0006450152795192307, + "loss": 0.7788, + "step": 48 + }, + { + "epoch": 0.15, + "learning_rate": 0.0006447271528866881, + "loss": 0.7621, + "step": 49 + }, + { + "epoch": 0.16, + "learning_rate": 0.0006444309994387402, + "loss": 0.7537, + "step": 50 + }, + { + "epoch": 0.16, + "learning_rate": 0.0006441268266103796, + "loss": 0.7917, + "step": 51 + }, + { + "epoch": 0.16, + "learning_rate": 0.0006438146420379274, + "loss": 0.8451, + "step": 52 + }, + { + "epoch": 0.17, + "learning_rate": 0.0006434944535588411, + "loss": 0.8369, + "step": 53 + }, + { + "epoch": 0.17, + "learning_rate": 0.0006431662692115173, + "loss": 0.7637, + "step": 54 + }, + { + "epoch": 0.17, + "learning_rate": 0.0006428300972350914, + "loss": 0.8365, + "step": 55 + }, + { + "epoch": 0.17, + "learning_rate": 0.0006424859460692295, + "loss": 0.7633, + "step": 56 + }, + { + "epoch": 0.18, + "learning_rate": 0.0006421338243539165, + "loss": 0.7718, + "step": 57 + }, + { + "epoch": 0.18, + "learning_rate": 0.0006417737409292403, + "loss": 0.7672, + "step": 58 + }, + { + "epoch": 0.18, + "learning_rate": 0.0006414057048351684, + "loss": 0.8107, + "step": 59 + }, + { + "epoch": 0.19, + "learning_rate": 0.0006410297253113221, + "loss": 0.7979, + "step": 60 + }, + { + "epoch": 0.19, + "learning_rate": 0.0006406458117967443, + "loss": 0.7634, + "step": 61 + }, + { + "epoch": 0.19, + "learning_rate": 0.0006402539739296618, + "loss": 0.7504, + "step": 62 + }, + { + "epoch": 0.2, + "learning_rate": 0.0006398542215472443, + "loss": 0.8082, + "step": 63 + }, + { + "epoch": 0.2, + "learning_rate": 0.0006394465646853571, + "loss": 0.8355, + "step": 64 + }, + { + "epoch": 0.2, + "learning_rate": 0.0006390310135783086, + "loss": 0.7458, + "step": 65 + }, + { + "epoch": 0.21, + "learning_rate": 0.0006386075786585944, + "loss": 0.7525, + "step": 66 + }, + { + "epoch": 0.21, + "learning_rate": 0.0006381762705566343, + "loss": 0.7464, + "step": 67 + }, + { + "epoch": 0.21, + "learning_rate": 0.0006377371001005063, + "loss": 0.78, + "step": 68 + }, + { + "epoch": 0.22, + "learning_rate": 0.0006372900783156745, + "loss": 0.7752, + "step": 69 + }, + { + 
"epoch": 0.22, + "learning_rate": 0.0006368352164247117, + "loss": 0.7299, + "step": 70 + }, + { + "epoch": 0.22, + "learning_rate": 0.0006363725258470184, + "loss": 0.7722, + "step": 71 + }, + { + "epoch": 0.22, + "learning_rate": 0.0006359020181985365, + "loss": 0.8236, + "step": 72 + }, + { + "epoch": 0.23, + "learning_rate": 0.0006354237052914561, + "loss": 0.7589, + "step": 73 + }, + { + "epoch": 0.23, + "learning_rate": 0.0006349375991339202, + "loss": 0.7948, + "step": 74 + }, + { + "epoch": 0.23, + "learning_rate": 0.0006344437119297233, + "loss": 0.7528, + "step": 75 + }, + { + "epoch": 0.24, + "learning_rate": 0.0006339420560780045, + "loss": 0.7842, + "step": 76 + }, + { + "epoch": 0.24, + "learning_rate": 0.0006334326441729361, + "loss": 0.7541, + "step": 77 + }, + { + "epoch": 0.24, + "learning_rate": 0.000632915489003408, + "loss": 0.7425, + "step": 78 + }, + { + "epoch": 0.25, + "learning_rate": 0.0006323906035527062, + "loss": 0.8168, + "step": 79 + }, + { + "epoch": 0.25, + "learning_rate": 0.0006318580009981871, + "loss": 0.8074, + "step": 80 + }, + { + "epoch": 0.25, + "learning_rate": 0.0006313176947109465, + "loss": 0.7679, + "step": 81 + }, + { + "epoch": 0.26, + "learning_rate": 0.0006307696982554838, + "loss": 0.7465, + "step": 82 + }, + { + "epoch": 0.26, + "learning_rate": 0.0006302140253893622, + "loss": 0.7073, + "step": 83 + }, + { + "epoch": 0.26, + "learning_rate": 0.0006296506900628619, + "loss": 0.7687, + "step": 84 + }, + { + "epoch": 0.27, + "learning_rate": 0.0006290797064186315, + "loss": 0.7578, + "step": 85 + }, + { + "epoch": 0.27, + "learning_rate": 0.0006285010887913319, + "loss": 0.7494, + "step": 86 + }, + { + "epoch": 0.27, + "learning_rate": 0.0006279148517072765, + "loss": 0.7326, + "step": 87 + }, + { + "epoch": 0.27, + "learning_rate": 0.000627321009884067, + "loss": 0.7603, + "step": 88 + }, + { + "epoch": 0.28, + "learning_rate": 0.0006267195782302236, + "loss": 0.8141, + "step": 89 + }, + { + "epoch": 0.28, + "learning_rate": 0.0006261105718448105, + "loss": 0.7542, + "step": 90 + }, + { + "epoch": 0.28, + "learning_rate": 0.0006254940060170575, + "loss": 0.7597, + "step": 91 + }, + { + "epoch": 0.29, + "learning_rate": 0.0006248698962259753, + "loss": 0.7332, + "step": 92 + }, + { + "epoch": 0.29, + "learning_rate": 0.0006242382581399676, + "loss": 0.7031, + "step": 93 + }, + { + "epoch": 0.29, + "learning_rate": 0.0006235991076164375, + "loss": 0.7258, + "step": 94 + }, + { + "epoch": 0.3, + "learning_rate": 0.0006229524607013892, + "loss": 0.7634, + "step": 95 + }, + { + "epoch": 0.3, + "learning_rate": 0.0006222983336290254, + "loss": 0.765, + "step": 96 + }, + { + "epoch": 0.3, + "learning_rate": 0.0006216367428213398, + "loss": 0.7246, + "step": 97 + }, + { + "epoch": 0.31, + "learning_rate": 0.0006209677048877046, + "loss": 0.7115, + "step": 98 + }, + { + "epoch": 0.31, + "learning_rate": 0.0006202912366244535, + "loss": 0.6748, + "step": 99 + }, + { + "epoch": 0.31, + "learning_rate": 0.0006196073550144604, + "loss": 0.6995, + "step": 100 + }, + { + "epoch": 0.32, + "learning_rate": 0.0006189160772267127, + "loss": 0.7764, + "step": 101 + }, + { + "epoch": 0.32, + "learning_rate": 0.00061821742061588, + "loss": 0.8628, + "step": 102 + }, + { + "epoch": 0.32, + "learning_rate": 0.0006175114027218794, + "loss": 0.7266, + "step": 103 + }, + { + "epoch": 0.32, + "learning_rate": 0.0006167980412694342, + "loss": 0.7557, + "step": 104 + }, + { + "epoch": 0.33, + "learning_rate": 0.0006160773541676288, + "loss": 0.7518, + "step": 105 + 
}, + { + "epoch": 0.33, + "learning_rate": 0.0006153493595094602, + "loss": 0.7589, + "step": 106 + }, + { + "epoch": 0.33, + "learning_rate": 0.000614614075571383, + "loss": 0.7506, + "step": 107 + }, + { + "epoch": 0.34, + "learning_rate": 0.0006138715208128501, + "loss": 0.6617, + "step": 108 + }, + { + "epoch": 0.34, + "learning_rate": 0.0006131217138758505, + "loss": 0.7396, + "step": 109 + }, + { + "epoch": 0.34, + "learning_rate": 0.0006123646735844401, + "loss": 0.7666, + "step": 110 + }, + { + "epoch": 0.35, + "learning_rate": 0.00061160041894427, + "loss": 0.7555, + "step": 111 + }, + { + "epoch": 0.35, + "learning_rate": 0.0006108289691421089, + "loss": 0.7301, + "step": 112 + }, + { + "epoch": 0.35, + "learning_rate": 0.0006100503435453614, + "loss": 0.7364, + "step": 113 + }, + { + "epoch": 0.36, + "learning_rate": 0.0006092645617015822, + "loss": 0.7461, + "step": 114 + }, + { + "epoch": 0.36, + "learning_rate": 0.0006084716433379844, + "loss": 0.8086, + "step": 115 + }, + { + "epoch": 0.36, + "learning_rate": 0.0006076716083609456, + "loss": 0.7577, + "step": 116 + }, + { + "epoch": 0.36, + "learning_rate": 0.0006068644768555068, + "loss": 0.7094, + "step": 117 + }, + { + "epoch": 0.37, + "learning_rate": 0.0006060502690848696, + "loss": 0.726, + "step": 118 + }, + { + "epoch": 0.37, + "learning_rate": 0.0006052290054898859, + "loss": 0.7243, + "step": 119 + }, + { + "epoch": 0.37, + "learning_rate": 0.0006044007066885458, + "loss": 0.7119, + "step": 120 + }, + { + "epoch": 0.38, + "learning_rate": 0.0006035653934754598, + "loss": 0.7049, + "step": 121 + }, + { + "epoch": 0.38, + "learning_rate": 0.0006027230868213366, + "loss": 0.7424, + "step": 122 + }, + { + "epoch": 0.38, + "learning_rate": 0.0006018738078724563, + "loss": 0.7271, + "step": 123 + }, + { + "epoch": 0.39, + "learning_rate": 0.0006010175779501405, + "loss": 0.7996, + "step": 124 + }, + { + "epoch": 0.39, + "learning_rate": 0.0006001544185502158, + "loss": 0.7468, + "step": 125 + }, + { + "epoch": 0.39, + "learning_rate": 0.0005992843513424754, + "loss": 0.7513, + "step": 126 + }, + { + "epoch": 0.4, + "learning_rate": 0.0005984073981701338, + "loss": 0.7461, + "step": 127 + }, + { + "epoch": 0.4, + "learning_rate": 0.0005975235810492794, + "loss": 0.6821, + "step": 128 + }, + { + "epoch": 0.4, + "learning_rate": 0.0005966329221683215, + "loss": 0.7314, + "step": 129 + }, + { + "epoch": 0.41, + "learning_rate": 0.0005957354438874327, + "loss": 0.714, + "step": 130 + }, + { + "epoch": 0.41, + "learning_rate": 0.0005948311687379884, + "loss": 0.7339, + "step": 131 + }, + { + "epoch": 0.41, + "learning_rate": 0.000593920119422001, + "loss": 0.7021, + "step": 132 + }, + { + "epoch": 0.41, + "learning_rate": 0.0005930023188115492, + "loss": 0.7228, + "step": 133 + }, + { + "epoch": 0.42, + "learning_rate": 0.0005920777899482046, + "loss": 0.7107, + "step": 134 + }, + { + "epoch": 0.42, + "learning_rate": 0.0005911465560424532, + "loss": 0.659, + "step": 135 + }, + { + "epoch": 0.42, + "learning_rate": 0.0005902086404731118, + "loss": 0.7028, + "step": 136 + }, + { + "epoch": 0.43, + "learning_rate": 0.0005892640667867423, + "loss": 0.7275, + "step": 137 + }, + { + "epoch": 0.43, + "learning_rate": 0.00058831285869706, + "loss": 0.6889, + "step": 138 + }, + { + "epoch": 0.43, + "learning_rate": 0.0005873550400843378, + "loss": 0.7891, + "step": 139 + }, + { + "epoch": 0.44, + "learning_rate": 0.0005863906349948074, + "loss": 0.7904, + "step": 140 + }, + { + "epoch": 0.44, + "learning_rate": 0.0005854196676400555, 
+ "loss": 0.6674, + "step": 141 + }, + { + "epoch": 0.44, + "learning_rate": 0.0005844421623964157, + "loss": 0.7352, + "step": 142 + }, + { + "epoch": 0.45, + "learning_rate": 0.0005834581438043563, + "loss": 0.6965, + "step": 143 + }, + { + "epoch": 0.45, + "learning_rate": 0.000582467636567865, + "loss": 0.7238, + "step": 144 + }, + { + "epoch": 0.45, + "learning_rate": 0.0005814706655538279, + "loss": 0.7064, + "step": 145 + }, + { + "epoch": 0.46, + "learning_rate": 0.0005804672557914059, + "loss": 0.6984, + "step": 146 + }, + { + "epoch": 0.46, + "learning_rate": 0.0005794574324714057, + "loss": 0.7594, + "step": 147 + }, + { + "epoch": 0.46, + "learning_rate": 0.0005784412209456479, + "loss": 0.6884, + "step": 148 + }, + { + "epoch": 0.46, + "learning_rate": 0.00057741864672633, + "loss": 0.7141, + "step": 149 + }, + { + "epoch": 0.47, + "learning_rate": 0.0005763897354853866, + "loss": 0.705, + "step": 150 + }, + { + "epoch": 0.47, + "learning_rate": 0.0005753545130538441, + "loss": 0.7613, + "step": 151 + }, + { + "epoch": 0.47, + "learning_rate": 0.0005743130054211732, + "loss": 0.736, + "step": 152 + }, + { + "epoch": 0.48, + "learning_rate": 0.0005732652387346351, + "loss": 0.6814, + "step": 153 + }, + { + "epoch": 0.48, + "learning_rate": 0.0005722112392986265, + "loss": 0.7002, + "step": 154 + }, + { + "epoch": 0.48, + "learning_rate": 0.0005711510335740182, + "loss": 0.7023, + "step": 155 + }, + { + "epoch": 0.49, + "learning_rate": 0.0005700846481774913, + "loss": 0.7617, + "step": 156 + }, + { + "epoch": 0.49, + "learning_rate": 0.0005690121098808687, + "loss": 0.7079, + "step": 157 + }, + { + "epoch": 0.49, + "learning_rate": 0.0005679334456104429, + "loss": 0.7614, + "step": 158 + }, + { + "epoch": 0.5, + "learning_rate": 0.000566848682446301, + "loss": 0.6786, + "step": 159 + }, + { + "epoch": 0.5, + "learning_rate": 0.0005657578476216432, + "loss": 0.6773, + "step": 160 + }, + { + "epoch": 0.5, + "learning_rate": 0.0005646609685221003, + "loss": 0.7085, + "step": 161 + }, + { + "epoch": 0.51, + "learning_rate": 0.0005635580726850462, + "loss": 0.7167, + "step": 162 + }, + { + "epoch": 0.51, + "learning_rate": 0.0005624491877989055, + "loss": 0.7192, + "step": 163 + }, + { + "epoch": 0.51, + "learning_rate": 0.0005613343417024599, + "loss": 0.6761, + "step": 164 + }, + { + "epoch": 0.51, + "learning_rate": 0.0005602135623841478, + "loss": 0.7508, + "step": 165 + }, + { + "epoch": 0.52, + "learning_rate": 0.0005590868779813627, + "loss": 0.6978, + "step": 166 + }, + { + "epoch": 0.52, + "learning_rate": 0.0005579543167797467, + "loss": 0.7459, + "step": 167 + }, + { + "epoch": 0.52, + "learning_rate": 0.0005568159072124794, + "loss": 0.7438, + "step": 168 + }, + { + "epoch": 0.53, + "learning_rate": 0.0005556716778595654, + "loss": 0.7073, + "step": 169 + }, + { + "epoch": 0.53, + "learning_rate": 0.0005545216574471164, + "loss": 0.6385, + "step": 170 + }, + { + "epoch": 0.53, + "learning_rate": 0.0005533658748466291, + "loss": 0.6993, + "step": 171 + }, + { + "epoch": 0.54, + "learning_rate": 0.0005522043590742615, + "loss": 0.7258, + "step": 172 + }, + { + "epoch": 0.54, + "learning_rate": 0.0005510371392901041, + "loss": 0.7405, + "step": 173 + }, + { + "epoch": 0.54, + "learning_rate": 0.0005498642447974479, + "loss": 0.7525, + "step": 174 + }, + { + "epoch": 0.55, + "learning_rate": 0.0005486857050420481, + "loss": 0.6639, + "step": 175 + }, + { + "epoch": 0.55, + "learning_rate": 0.0005475015496113861, + "loss": 0.7415, + "step": 176 + }, + { + "epoch": 0.55, + 
"learning_rate": 0.0005463118082339253, + "loss": 0.7816, + "step": 177 + }, + { + "epoch": 0.56, + "learning_rate": 0.0005451165107783659, + "loss": 0.711, + "step": 178 + }, + { + "epoch": 0.56, + "learning_rate": 0.0005439156872528941, + "loss": 0.7138, + "step": 179 + }, + { + "epoch": 0.56, + "learning_rate": 0.0005427093678044299, + "loss": 0.7069, + "step": 180 + }, + { + "epoch": 0.56, + "learning_rate": 0.0005414975827178688, + "loss": 0.7553, + "step": 181 + }, + { + "epoch": 0.57, + "learning_rate": 0.000540280362415323, + "loss": 0.7045, + "step": 182 + }, + { + "epoch": 0.57, + "learning_rate": 0.0005390577374553561, + "loss": 0.7011, + "step": 183 + }, + { + "epoch": 0.57, + "learning_rate": 0.0005378297385322177, + "loss": 0.7441, + "step": 184 + }, + { + "epoch": 0.58, + "learning_rate": 0.0005365963964750707, + "loss": 0.6797, + "step": 185 + }, + { + "epoch": 0.58, + "learning_rate": 0.0005353577422472196, + "loss": 0.6901, + "step": 186 + }, + { + "epoch": 0.58, + "learning_rate": 0.0005341138069453313, + "loss": 0.7136, + "step": 187 + }, + { + "epoch": 0.59, + "learning_rate": 0.0005328646217986553, + "loss": 0.7459, + "step": 188 + }, + { + "epoch": 0.59, + "learning_rate": 0.0005316102181682396, + "loss": 0.7064, + "step": 189 + }, + { + "epoch": 0.59, + "learning_rate": 0.0005303506275461433, + "loss": 0.6705, + "step": 190 + }, + { + "epoch": 0.6, + "learning_rate": 0.0005290858815546459, + "loss": 0.7008, + "step": 191 + }, + { + "epoch": 0.6, + "learning_rate": 0.0005278160119454536, + "loss": 0.7538, + "step": 192 + }, + { + "epoch": 0.6, + "learning_rate": 0.0005265410505989021, + "loss": 0.7726, + "step": 193 + }, + { + "epoch": 0.61, + "learning_rate": 0.000525261029523156, + "loss": 0.7532, + "step": 194 + }, + { + "epoch": 0.61, + "learning_rate": 0.0005239759808534055, + "loss": 0.6978, + "step": 195 + }, + { + "epoch": 0.61, + "learning_rate": 0.0005226859368510599, + "loss": 0.7182, + "step": 196 + }, + { + "epoch": 0.61, + "learning_rate": 0.0005213909299029368, + "loss": 0.6776, + "step": 197 + }, + { + "epoch": 0.62, + "learning_rate": 0.0005200909925204501, + "loss": 0.7447, + "step": 198 + }, + { + "epoch": 0.62, + "learning_rate": 0.0005187861573387928, + "loss": 0.7298, + "step": 199 + }, + { + "epoch": 0.62, + "learning_rate": 0.0005174764571161185, + "loss": 0.6833, + "step": 200 + }, + { + "epoch": 0.63, + "learning_rate": 0.0005161619247327185, + "loss": 0.7518, + "step": 201 + }, + { + "epoch": 0.63, + "learning_rate": 0.0005148425931901961, + "loss": 0.7429, + "step": 202 + }, + { + "epoch": 0.63, + "learning_rate": 0.0005135184956106394, + "loss": 0.763, + "step": 203 + }, + { + "epoch": 0.64, + "learning_rate": 0.000512189665235788, + "loss": 0.7682, + "step": 204 + }, + { + "epoch": 0.64, + "learning_rate": 0.0005108561354261996, + "loss": 0.7063, + "step": 205 + }, + { + "epoch": 0.64, + "learning_rate": 0.0005095179396604121, + "loss": 0.6956, + "step": 206 + }, + { + "epoch": 0.65, + "learning_rate": 0.0005081751115341034, + "loss": 0.7434, + "step": 207 + }, + { + "epoch": 0.65, + "learning_rate": 0.0005068276847592474, + "loss": 0.6673, + "step": 208 + }, + { + "epoch": 0.65, + "learning_rate": 0.0005054756931632682, + "loss": 0.6448, + "step": 209 + }, + { + "epoch": 0.65, + "learning_rate": 0.0005041191706881909, + "loss": 0.7095, + "step": 210 + }, + { + "epoch": 0.66, + "learning_rate": 0.0005027581513897888, + "loss": 0.673, + "step": 211 + }, + { + "epoch": 0.66, + "learning_rate": 0.000501392669436729, + "loss": 0.6363, + 
"step": 212 + }, + { + "epoch": 0.66, + "learning_rate": 0.0005000227591097145, + "loss": 0.6711, + "step": 213 + }, + { + "epoch": 0.67, + "learning_rate": 0.0004986484548006237, + "loss": 0.6375, + "step": 214 + }, + { + "epoch": 0.67, + "learning_rate": 0.0004972697910116468, + "loss": 0.7466, + "step": 215 + }, + { + "epoch": 0.67, + "learning_rate": 0.0004958868023544192, + "loss": 0.7147, + "step": 216 + }, + { + "epoch": 0.68, + "learning_rate": 0.0004944995235491534, + "loss": 0.714, + "step": 217 + }, + { + "epoch": 0.68, + "learning_rate": 0.0004931079894237669, + "loss": 0.7377, + "step": 218 + }, + { + "epoch": 0.68, + "learning_rate": 0.0004917122349130078, + "loss": 0.7087, + "step": 219 + }, + { + "epoch": 0.69, + "learning_rate": 0.000490312295057578, + "loss": 0.6716, + "step": 220 + }, + { + "epoch": 0.69, + "learning_rate": 0.0004889082050032529, + "loss": 0.7298, + "step": 221 + }, + { + "epoch": 0.69, + "learning_rate": 0.0004875, + "loss": 0.6557, + "step": 222 + }, + { + "epoch": 0.7, + "learning_rate": 0.0004860877154010932, + "loss": 0.7042, + "step": 223 + }, + { + "epoch": 0.7, + "learning_rate": 0.00048467138666222534, + "loss": 0.6617, + "step": 224 + }, + { + "epoch": 0.7, + "learning_rate": 0.00048325104934061853, + "loss": 0.7019, + "step": 225 + }, + { + "epoch": 0.7, + "learning_rate": 0.00048182673909413103, + "loss": 0.6756, + "step": 226 + }, + { + "epoch": 0.71, + "learning_rate": 0.00048039849168036205, + "loss": 0.709, + "step": 227 + }, + { + "epoch": 0.71, + "learning_rate": 0.00047896634295575434, + "loss": 0.7434, + "step": 228 + }, + { + "epoch": 0.71, + "learning_rate": 0.00047753032887469385, + "loss": 0.7533, + "step": 229 + }, + { + "epoch": 0.72, + "learning_rate": 0.0004760904854886072, + "loss": 0.7019, + "step": 230 + }, + { + "epoch": 0.72, + "learning_rate": 0.0004746468489450562, + "loss": 0.6852, + "step": 231 + }, + { + "epoch": 0.72, + "learning_rate": 0.0004731994554868307, + "loss": 0.7228, + "step": 232 + }, + { + "epoch": 0.73, + "learning_rate": 0.000471748341451039, + "loss": 0.7513, + "step": 233 + }, + { + "epoch": 0.73, + "learning_rate": 0.0004702935432681949, + "loss": 0.6896, + "step": 234 + }, + { + "epoch": 0.73, + "learning_rate": 0.0004688350974613038, + "loss": 0.6815, + "step": 235 + }, + { + "epoch": 0.74, + "learning_rate": 0.0004673730406449449, + "loss": 0.7682, + "step": 236 + }, + { + "epoch": 0.74, + "learning_rate": 0.00046590740952435323, + "loss": 0.7025, + "step": 237 + }, + { + "epoch": 0.74, + "learning_rate": 0.0004644382408944968, + "loss": 0.6662, + "step": 238 + }, + { + "epoch": 0.75, + "learning_rate": 0.00046296557163915395, + "loss": 0.7541, + "step": 239 + }, + { + "epoch": 0.75, + "learning_rate": 0.0004614894387299867, + "loss": 0.7336, + "step": 240 + }, + { + "epoch": 0.75, + "learning_rate": 0.0004600098792256131, + "loss": 0.6618, + "step": 241 + }, + { + "epoch": 0.75, + "learning_rate": 0.0004585269302706762, + "loss": 0.6729, + "step": 242 + }, + { + "epoch": 0.76, + "learning_rate": 0.0004570406290949121, + "loss": 0.7327, + "step": 243 + }, + { + "epoch": 0.76, + "learning_rate": 0.0004555510130122151, + "loss": 0.6778, + "step": 244 + }, + { + "epoch": 0.76, + "learning_rate": 0.0004540581194197008, + "loss": 0.6219, + "step": 245 + }, + { + "epoch": 0.77, + "learning_rate": 0.00045256198579676755, + "loss": 0.6984, + "step": 246 + }, + { + "epoch": 0.77, + "learning_rate": 0.000451062649704155, + "loss": 0.637, + "step": 247 + }, + { + "epoch": 0.77, + "learning_rate": 
0.000449560148783002, + "loss": 0.658, + "step": 248 + }, + { + "epoch": 0.78, + "learning_rate": 0.0004480545207539004, + "loss": 0.7305, + "step": 249 + }, + { + "epoch": 0.78, + "learning_rate": 0.0004465458034159491, + "loss": 0.6788, + "step": 250 + }, + { + "epoch": 0.78, + "learning_rate": 0.00044503403464580475, + "loss": 0.7096, + "step": 251 + }, + { + "epoch": 0.79, + "learning_rate": 0.00044351925239673087, + "loss": 0.7108, + "step": 252 + }, + { + "epoch": 0.79, + "learning_rate": 0.0004420014946976447, + "loss": 0.6518, + "step": 253 + }, + { + "epoch": 0.79, + "learning_rate": 0.00044048079965216294, + "loss": 0.7262, + "step": 254 + }, + { + "epoch": 0.8, + "learning_rate": 0.0004389572054376452, + "loss": 0.6988, + "step": 255 + }, + { + "epoch": 0.8, + "learning_rate": 0.00043743075030423475, + "loss": 0.6637, + "step": 256 + }, + { + "epoch": 0.8, + "learning_rate": 0.0004359014725738994, + "loss": 0.7055, + "step": 257 + }, + { + "epoch": 0.8, + "learning_rate": 0.00043436941063946843, + "loss": 0.7179, + "step": 258 + }, + { + "epoch": 0.81, + "learning_rate": 0.0004328346029636694, + "loss": 0.6955, + "step": 259 + }, + { + "epoch": 0.81, + "learning_rate": 0.0004312970880781621, + "loss": 0.6749, + "step": 260 + }, + { + "epoch": 0.81, + "learning_rate": 0.0004297569045825713, + "loss": 0.6711, + "step": 261 + }, + { + "epoch": 0.82, + "learning_rate": 0.00042821409114351803, + "loss": 0.6366, + "step": 262 + }, + { + "epoch": 0.82, + "learning_rate": 0.00042666868649364844, + "loss": 0.7144, + "step": 263 + }, + { + "epoch": 0.82, + "learning_rate": 0.0004251207294306617, + "loss": 0.656, + "step": 264 + }, + { + "epoch": 0.83, + "learning_rate": 0.00042357025881633535, + "loss": 0.6803, + "step": 265 + }, + { + "epoch": 0.83, + "learning_rate": 0.00042201731357555073, + "loss": 0.7044, + "step": 266 + }, + { + "epoch": 0.83, + "learning_rate": 0.0004204619326953149, + "loss": 0.6488, + "step": 267 + }, + { + "epoch": 0.84, + "learning_rate": 0.00041890415522378223, + "loss": 0.6928, + "step": 268 + }, + { + "epoch": 0.84, + "learning_rate": 0.00041734402026927394, + "loss": 0.6764, + "step": 269 + }, + { + "epoch": 0.84, + "learning_rate": 0.00041578156699929636, + "loss": 0.6278, + "step": 270 + }, + { + "epoch": 0.85, + "learning_rate": 0.0004142168346395577, + "loss": 0.691, + "step": 271 + }, + { + "epoch": 0.85, + "learning_rate": 0.0004126498624729829, + "loss": 0.6865, + "step": 272 + }, + { + "epoch": 0.85, + "learning_rate": 0.000411080689838728, + "loss": 0.6715, + "step": 273 + }, + { + "epoch": 0.85, + "learning_rate": 0.00040950935613119226, + "loss": 0.6563, + "step": 274 + }, + { + "epoch": 0.86, + "learning_rate": 0.00040793590079902885, + "loss": 0.7608, + "step": 275 + }, + { + "epoch": 0.86, + "learning_rate": 0.00040636036334415487, + "loss": 0.6189, + "step": 276 + }, + { + "epoch": 0.86, + "learning_rate": 0.0004047827833207597, + "loss": 0.6981, + "step": 277 + }, + { + "epoch": 0.87, + "learning_rate": 0.0004032032003343117, + "loss": 0.644, + "step": 278 + }, + { + "epoch": 0.87, + "learning_rate": 0.0004016216540405639, + "loss": 0.7286, + "step": 279 + }, + { + "epoch": 0.87, + "learning_rate": 0.0004000381841445586, + "loss": 0.6694, + "step": 280 + }, + { + "epoch": 0.88, + "learning_rate": 0.00039845283039963093, + "loss": 0.7204, + "step": 281 + }, + { + "epoch": 0.88, + "learning_rate": 0.0003968656326064099, + "loss": 0.7042, + "step": 282 + }, + { + "epoch": 0.88, + "learning_rate": 0.00039527663061181983, + "loss": 0.712, + 
"step": 283 + }, + { + "epoch": 0.89, + "learning_rate": 0.00039368586430808014, + "loss": 0.7179, + "step": 284 + }, + { + "epoch": 0.89, + "learning_rate": 0.00039209337363170347, + "loss": 0.6903, + "step": 285 + }, + { + "epoch": 0.89, + "learning_rate": 0.00039049919856249315, + "loss": 0.6924, + "step": 286 + }, + { + "epoch": 0.9, + "learning_rate": 0.0003889033791225395, + "loss": 0.6713, + "step": 287 + }, + { + "epoch": 0.9, + "learning_rate": 0.000387305955375215, + "loss": 0.7852, + "step": 288 + }, + { + "epoch": 0.9, + "learning_rate": 0.0003857069674241689, + "loss": 0.6517, + "step": 289 + }, + { + "epoch": 0.9, + "learning_rate": 0.00038410645541232, + "loss": 0.6764, + "step": 290 + }, + { + "epoch": 0.91, + "learning_rate": 0.0003825044595208488, + "loss": 0.7183, + "step": 291 + }, + { + "epoch": 0.91, + "learning_rate": 0.000380901019968189, + "loss": 0.6826, + "step": 292 + }, + { + "epoch": 0.91, + "learning_rate": 0.0003792961770090178, + "loss": 0.6936, + "step": 293 + }, + { + "epoch": 0.92, + "learning_rate": 0.0003776899709332449, + "loss": 0.718, + "step": 294 + }, + { + "epoch": 0.92, + "learning_rate": 0.00037608244206500176, + "loss": 0.6795, + "step": 295 + }, + { + "epoch": 0.92, + "learning_rate": 0.00037447363076162853, + "loss": 0.6517, + "step": 296 + }, + { + "epoch": 0.93, + "learning_rate": 0.0003728635774126613, + "loss": 0.6849, + "step": 297 + }, + { + "epoch": 0.93, + "learning_rate": 0.0003712523224388177, + "loss": 0.6663, + "step": 298 + }, + { + "epoch": 0.93, + "learning_rate": 0.00036963990629098264, + "loss": 0.6585, + "step": 299 + }, + { + "epoch": 0.94, + "learning_rate": 0.0003680263694491925, + "loss": 0.7054, + "step": 300 + } + ], + "logging_steps": 1, + "max_steps": 640, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "total_flos": 7.783562896461005e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-300/training_args.bin b/checkpoint-300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d220d7d4abfdc29d8393f3089d80acf2e32cf7d7 --- /dev/null +++ b/checkpoint-300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d688198fe54ccac5c0a98d11fb9e7224690ace94f7e483ba1d16db91cf33a5c4 +size 4664 diff --git a/checkpoint-350/README.md b/checkpoint-350/README.md new file mode 100644 index 0000000000000000000000000000000000000000..69eced3e4fa0cffb8a4f749d70d2150b3b6038f0 --- /dev/null +++ b/checkpoint-350/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: NousResearch/Llama-2-13b-hf +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### 
Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.7.1 \ No newline at end of file diff --git a/checkpoint-350/adapter_config.json b/checkpoint-350/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..089f20ebe8d421867e642bbf0b75c66dc24a7f5f --- /dev/null +++ b/checkpoint-350/adapter_config.json @@ -0,0 +1,31 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "NousResearch/Llama-2-13b-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "up_proj", + "k_proj", + "gate_proj", + "down_proj", + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-350/adapter_model.safetensors b/checkpoint-350/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1e21b9a856e421d2366ee5a707310c226ddcb8f1 --- /dev/null +++ b/checkpoint-350/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c10a1ab49cf385bdd6b9c6f544c8ce4a44c83d0cbc74c586ea355162ce407583 +size 2002857080 diff --git a/checkpoint-350/optimizer.pt 
b/checkpoint-350/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc1f873adf5867383634a7a54efd862161d897fb --- /dev/null +++ b/checkpoint-350/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b8cc02158584f32821542d6a298df3ad0263592831834e46aa7dff0295d6d98 +size 1004005012 diff --git a/checkpoint-350/rng_state.pth b/checkpoint-350/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f8d20ff01271ccf633065ed8e4601c7b74d10586 --- /dev/null +++ b/checkpoint-350/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:203e9bfabd925cb4ec7129d24877156fcee87215187c35a867e358e56a9425a4 +size 14244 diff --git a/checkpoint-350/scheduler.pt b/checkpoint-350/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..550af4dc676df776f0fc207114810d592a8eabab --- /dev/null +++ b/checkpoint-350/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0646493b7d2b2399260f02aa8eb7889f586cbddf4f800cffd32db5f08f271a69 +size 1064 diff --git a/checkpoint-350/trainer_state.json b/checkpoint-350/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4430282a9b6eb57d208ee01a40dcb3fb089d1342 --- /dev/null +++ b/checkpoint-350/trainer_state.json @@ -0,0 +1,2121 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0916179337231968, + "eval_steps": 500, + "global_step": 350, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5e-05, + "loss": 1.0506, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 0.0001, + "loss": 0.9988, + "step": 2 + }, + { + "epoch": 0.01, + "learning_rate": 0.00015000000000000001, + "loss": 0.9783, + "step": 3 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002, + "loss": 0.9849, + "step": 4 + }, + { + "epoch": 0.02, + "learning_rate": 0.00025, + "loss": 1.0159, + "step": 5 + }, + { + "epoch": 0.02, + "learning_rate": 0.00030000000000000003, + "loss": 0.9847, + "step": 6 + }, + { + "epoch": 0.02, + "learning_rate": 0.00034999999999999994, + "loss": 0.9101, + "step": 7 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 0.9445, + "step": 8 + }, + { + "epoch": 0.03, + "learning_rate": 0.00045, + "loss": 0.8578, + "step": 9 + }, + { + "epoch": 0.03, + "learning_rate": 0.0005, + "loss": 0.9356, + "step": 10 + }, + { + "epoch": 0.03, + "learning_rate": 0.0005499999999999999, + "loss": 0.8395, + "step": 11 + }, + { + "epoch": 0.04, + "learning_rate": 0.0006000000000000001, + "loss": 0.9002, + "step": 12 + }, + { + "epoch": 0.04, + "learning_rate": 0.00065, + "loss": 0.8955, + "step": 13 + }, + { + "epoch": 0.04, + "learning_rate": 0.0006499959204043461, + "loss": 0.902, + "step": 14 + }, + { + "epoch": 0.05, + "learning_rate": 0.0006499836817198032, + "loss": 0.8578, + "step": 15 + }, + { + "epoch": 0.05, + "learning_rate": 0.0006499632842536263, + "loss": 0.9005, + "step": 16 + }, + { + "epoch": 0.05, + "learning_rate": 0.0006499347285178979, + "loss": 0.8539, + "step": 17 + }, + { + "epoch": 0.06, + "learning_rate": 0.0006498980152295153, + "loss": 0.8595, + "step": 18 + }, + { + "epoch": 0.06, + "learning_rate": 0.0006498531453101735, + "loss": 0.8845, + "step": 19 + }, + { + "epoch": 0.06, + "learning_rate": 0.0006498001198863406, + "loss": 0.8924, + "step": 20 + }, + { + "epoch": 0.07, + "learning_rate": 0.000649738940289231, + "loss": 0.8365, + 
"step": 21 + }, + { + "epoch": 0.07, + "learning_rate": 0.0006496696080547707, + "loss": 0.8462, + "step": 22 + }, + { + "epoch": 0.07, + "learning_rate": 0.0006495921249235596, + "loss": 0.8528, + "step": 23 + }, + { + "epoch": 0.07, + "learning_rate": 0.0006495064928408277, + "loss": 0.8159, + "step": 24 + }, + { + "epoch": 0.08, + "learning_rate": 0.0006494127139563859, + "loss": 0.8245, + "step": 25 + }, + { + "epoch": 0.08, + "learning_rate": 0.000649310790624572, + "loss": 0.8081, + "step": 26 + }, + { + "epoch": 0.08, + "learning_rate": 0.0006492007254041924, + "loss": 0.8535, + "step": 27 + }, + { + "epoch": 0.09, + "learning_rate": 0.0006490825210584566, + "loss": 0.8162, + "step": 28 + }, + { + "epoch": 0.09, + "learning_rate": 0.0006489561805549089, + "loss": 0.8456, + "step": 29 + }, + { + "epoch": 0.09, + "learning_rate": 0.0006488217070653535, + "loss": 0.7799, + "step": 30 + }, + { + "epoch": 0.1, + "learning_rate": 0.0006486791039657748, + "loss": 0.8088, + "step": 31 + }, + { + "epoch": 0.1, + "learning_rate": 0.0006485283748362524, + "loss": 0.8683, + "step": 32 + }, + { + "epoch": 0.1, + "learning_rate": 0.0006483695234608723, + "loss": 0.8871, + "step": 33 + }, + { + "epoch": 0.11, + "learning_rate": 0.0006482025538276304, + "loss": 0.7711, + "step": 34 + }, + { + "epoch": 0.11, + "learning_rate": 0.0006480274701283335, + "loss": 0.7621, + "step": 35 + }, + { + "epoch": 0.11, + "learning_rate": 0.0006478442767584937, + "loss": 0.8243, + "step": 36 + }, + { + "epoch": 0.12, + "learning_rate": 0.0006476529783172177, + "loss": 0.8257, + "step": 37 + }, + { + "epoch": 0.12, + "learning_rate": 0.0006474535796070919, + "loss": 0.8141, + "step": 38 + }, + { + "epoch": 0.12, + "learning_rate": 0.0006472460856340619, + "loss": 0.8109, + "step": 39 + }, + { + "epoch": 0.12, + "learning_rate": 0.000647030501607306, + "loss": 0.7873, + "step": 40 + }, + { + "epoch": 0.13, + "learning_rate": 0.000646806832939105, + "loss": 0.7386, + "step": 41 + }, + { + "epoch": 0.13, + "learning_rate": 0.0006465750852447068, + "loss": 0.8636, + "step": 42 + }, + { + "epoch": 0.13, + "learning_rate": 0.0006463352643421846, + "loss": 0.7357, + "step": 43 + }, + { + "epoch": 0.14, + "learning_rate": 0.0006460873762522906, + "loss": 0.8142, + "step": 44 + }, + { + "epoch": 0.14, + "learning_rate": 0.0006458314271983063, + "loss": 0.7275, + "step": 45 + }, + { + "epoch": 0.14, + "learning_rate": 0.0006455674236058847, + "loss": 0.8029, + "step": 46 + }, + { + "epoch": 0.15, + "learning_rate": 0.00064529537210289, + "loss": 0.7901, + "step": 47 + }, + { + "epoch": 0.15, + "learning_rate": 0.0006450152795192307, + "loss": 0.7788, + "step": 48 + }, + { + "epoch": 0.15, + "learning_rate": 0.0006447271528866881, + "loss": 0.7621, + "step": 49 + }, + { + "epoch": 0.16, + "learning_rate": 0.0006444309994387402, + "loss": 0.7537, + "step": 50 + }, + { + "epoch": 0.16, + "learning_rate": 0.0006441268266103796, + "loss": 0.7917, + "step": 51 + }, + { + "epoch": 0.16, + "learning_rate": 0.0006438146420379274, + "loss": 0.8451, + "step": 52 + }, + { + "epoch": 0.17, + "learning_rate": 0.0006434944535588411, + "loss": 0.8369, + "step": 53 + }, + { + "epoch": 0.17, + "learning_rate": 0.0006431662692115173, + "loss": 0.7637, + "step": 54 + }, + { + "epoch": 0.17, + "learning_rate": 0.0006428300972350914, + "loss": 0.8365, + "step": 55 + }, + { + "epoch": 0.17, + "learning_rate": 0.0006424859460692295, + "loss": 0.7633, + "step": 56 + }, + { + "epoch": 0.18, + "learning_rate": 0.0006421338243539165, + "loss": 0.7718, 
+ "step": 57 + }, + { + "epoch": 0.18, + "learning_rate": 0.0006417737409292403, + "loss": 0.7672, + "step": 58 + }, + { + "epoch": 0.18, + "learning_rate": 0.0006414057048351684, + "loss": 0.8107, + "step": 59 + }, + { + "epoch": 0.19, + "learning_rate": 0.0006410297253113221, + "loss": 0.7979, + "step": 60 + }, + { + "epoch": 0.19, + "learning_rate": 0.0006406458117967443, + "loss": 0.7634, + "step": 61 + }, + { + "epoch": 0.19, + "learning_rate": 0.0006402539739296618, + "loss": 0.7504, + "step": 62 + }, + { + "epoch": 0.2, + "learning_rate": 0.0006398542215472443, + "loss": 0.8082, + "step": 63 + }, + { + "epoch": 0.2, + "learning_rate": 0.0006394465646853571, + "loss": 0.8355, + "step": 64 + }, + { + "epoch": 0.2, + "learning_rate": 0.0006390310135783086, + "loss": 0.7458, + "step": 65 + }, + { + "epoch": 0.21, + "learning_rate": 0.0006386075786585944, + "loss": 0.7525, + "step": 66 + }, + { + "epoch": 0.21, + "learning_rate": 0.0006381762705566343, + "loss": 0.7464, + "step": 67 + }, + { + "epoch": 0.21, + "learning_rate": 0.0006377371001005063, + "loss": 0.78, + "step": 68 + }, + { + "epoch": 0.22, + "learning_rate": 0.0006372900783156745, + "loss": 0.7752, + "step": 69 + }, + { + "epoch": 0.22, + "learning_rate": 0.0006368352164247117, + "loss": 0.7299, + "step": 70 + }, + { + "epoch": 0.22, + "learning_rate": 0.0006363725258470184, + "loss": 0.7722, + "step": 71 + }, + { + "epoch": 0.22, + "learning_rate": 0.0006359020181985365, + "loss": 0.8236, + "step": 72 + }, + { + "epoch": 0.23, + "learning_rate": 0.0006354237052914561, + "loss": 0.7589, + "step": 73 + }, + { + "epoch": 0.23, + "learning_rate": 0.0006349375991339202, + "loss": 0.7948, + "step": 74 + }, + { + "epoch": 0.23, + "learning_rate": 0.0006344437119297233, + "loss": 0.7528, + "step": 75 + }, + { + "epoch": 0.24, + "learning_rate": 0.0006339420560780045, + "loss": 0.7842, + "step": 76 + }, + { + "epoch": 0.24, + "learning_rate": 0.0006334326441729361, + "loss": 0.7541, + "step": 77 + }, + { + "epoch": 0.24, + "learning_rate": 0.000632915489003408, + "loss": 0.7425, + "step": 78 + }, + { + "epoch": 0.25, + "learning_rate": 0.0006323906035527062, + "loss": 0.8168, + "step": 79 + }, + { + "epoch": 0.25, + "learning_rate": 0.0006318580009981871, + "loss": 0.8074, + "step": 80 + }, + { + "epoch": 0.25, + "learning_rate": 0.0006313176947109465, + "loss": 0.7679, + "step": 81 + }, + { + "epoch": 0.26, + "learning_rate": 0.0006307696982554838, + "loss": 0.7465, + "step": 82 + }, + { + "epoch": 0.26, + "learning_rate": 0.0006302140253893622, + "loss": 0.7073, + "step": 83 + }, + { + "epoch": 0.26, + "learning_rate": 0.0006296506900628619, + "loss": 0.7687, + "step": 84 + }, + { + "epoch": 0.27, + "learning_rate": 0.0006290797064186315, + "loss": 0.7578, + "step": 85 + }, + { + "epoch": 0.27, + "learning_rate": 0.0006285010887913319, + "loss": 0.7494, + "step": 86 + }, + { + "epoch": 0.27, + "learning_rate": 0.0006279148517072765, + "loss": 0.7326, + "step": 87 + }, + { + "epoch": 0.27, + "learning_rate": 0.000627321009884067, + "loss": 0.7603, + "step": 88 + }, + { + "epoch": 0.28, + "learning_rate": 0.0006267195782302236, + "loss": 0.8141, + "step": 89 + }, + { + "epoch": 0.28, + "learning_rate": 0.0006261105718448105, + "loss": 0.7542, + "step": 90 + }, + { + "epoch": 0.28, + "learning_rate": 0.0006254940060170575, + "loss": 0.7597, + "step": 91 + }, + { + "epoch": 0.29, + "learning_rate": 0.0006248698962259753, + "loss": 0.7332, + "step": 92 + }, + { + "epoch": 0.29, + "learning_rate": 0.0006242382581399676, + "loss": 
0.7031, + "step": 93 + }, + { + "epoch": 0.29, + "learning_rate": 0.0006235991076164375, + "loss": 0.7258, + "step": 94 + }, + { + "epoch": 0.3, + "learning_rate": 0.0006229524607013892, + "loss": 0.7634, + "step": 95 + }, + { + "epoch": 0.3, + "learning_rate": 0.0006222983336290254, + "loss": 0.765, + "step": 96 + }, + { + "epoch": 0.3, + "learning_rate": 0.0006216367428213398, + "loss": 0.7246, + "step": 97 + }, + { + "epoch": 0.31, + "learning_rate": 0.0006209677048877046, + "loss": 0.7115, + "step": 98 + }, + { + "epoch": 0.31, + "learning_rate": 0.0006202912366244535, + "loss": 0.6748, + "step": 99 + }, + { + "epoch": 0.31, + "learning_rate": 0.0006196073550144604, + "loss": 0.6995, + "step": 100 + }, + { + "epoch": 0.32, + "learning_rate": 0.0006189160772267127, + "loss": 0.7764, + "step": 101 + }, + { + "epoch": 0.32, + "learning_rate": 0.00061821742061588, + "loss": 0.8628, + "step": 102 + }, + { + "epoch": 0.32, + "learning_rate": 0.0006175114027218794, + "loss": 0.7266, + "step": 103 + }, + { + "epoch": 0.32, + "learning_rate": 0.0006167980412694342, + "loss": 0.7557, + "step": 104 + }, + { + "epoch": 0.33, + "learning_rate": 0.0006160773541676288, + "loss": 0.7518, + "step": 105 + }, + { + "epoch": 0.33, + "learning_rate": 0.0006153493595094602, + "loss": 0.7589, + "step": 106 + }, + { + "epoch": 0.33, + "learning_rate": 0.000614614075571383, + "loss": 0.7506, + "step": 107 + }, + { + "epoch": 0.34, + "learning_rate": 0.0006138715208128501, + "loss": 0.6617, + "step": 108 + }, + { + "epoch": 0.34, + "learning_rate": 0.0006131217138758505, + "loss": 0.7396, + "step": 109 + }, + { + "epoch": 0.34, + "learning_rate": 0.0006123646735844401, + "loss": 0.7666, + "step": 110 + }, + { + "epoch": 0.35, + "learning_rate": 0.00061160041894427, + "loss": 0.7555, + "step": 111 + }, + { + "epoch": 0.35, + "learning_rate": 0.0006108289691421089, + "loss": 0.7301, + "step": 112 + }, + { + "epoch": 0.35, + "learning_rate": 0.0006100503435453614, + "loss": 0.7364, + "step": 113 + }, + { + "epoch": 0.36, + "learning_rate": 0.0006092645617015822, + "loss": 0.7461, + "step": 114 + }, + { + "epoch": 0.36, + "learning_rate": 0.0006084716433379844, + "loss": 0.8086, + "step": 115 + }, + { + "epoch": 0.36, + "learning_rate": 0.0006076716083609456, + "loss": 0.7577, + "step": 116 + }, + { + "epoch": 0.36, + "learning_rate": 0.0006068644768555068, + "loss": 0.7094, + "step": 117 + }, + { + "epoch": 0.37, + "learning_rate": 0.0006060502690848696, + "loss": 0.726, + "step": 118 + }, + { + "epoch": 0.37, + "learning_rate": 0.0006052290054898859, + "loss": 0.7243, + "step": 119 + }, + { + "epoch": 0.37, + "learning_rate": 0.0006044007066885458, + "loss": 0.7119, + "step": 120 + }, + { + "epoch": 0.38, + "learning_rate": 0.0006035653934754598, + "loss": 0.7049, + "step": 121 + }, + { + "epoch": 0.38, + "learning_rate": 0.0006027230868213366, + "loss": 0.7424, + "step": 122 + }, + { + "epoch": 0.38, + "learning_rate": 0.0006018738078724563, + "loss": 0.7271, + "step": 123 + }, + { + "epoch": 0.39, + "learning_rate": 0.0006010175779501405, + "loss": 0.7996, + "step": 124 + }, + { + "epoch": 0.39, + "learning_rate": 0.0006001544185502158, + "loss": 0.7468, + "step": 125 + }, + { + "epoch": 0.39, + "learning_rate": 0.0005992843513424754, + "loss": 0.7513, + "step": 126 + }, + { + "epoch": 0.4, + "learning_rate": 0.0005984073981701338, + "loss": 0.7461, + "step": 127 + }, + { + "epoch": 0.4, + "learning_rate": 0.0005975235810492794, + "loss": 0.6821, + "step": 128 + }, + { + "epoch": 0.4, + "learning_rate": 
0.0005966329221683215, + "loss": 0.7314, + "step": 129 + }, + { + "epoch": 0.41, + "learning_rate": 0.0005957354438874327, + "loss": 0.714, + "step": 130 + }, + { + "epoch": 0.41, + "learning_rate": 0.0005948311687379884, + "loss": 0.7339, + "step": 131 + }, + { + "epoch": 0.41, + "learning_rate": 0.000593920119422001, + "loss": 0.7021, + "step": 132 + }, + { + "epoch": 0.41, + "learning_rate": 0.0005930023188115492, + "loss": 0.7228, + "step": 133 + }, + { + "epoch": 0.42, + "learning_rate": 0.0005920777899482046, + "loss": 0.7107, + "step": 134 + }, + { + "epoch": 0.42, + "learning_rate": 0.0005911465560424532, + "loss": 0.659, + "step": 135 + }, + { + "epoch": 0.42, + "learning_rate": 0.0005902086404731118, + "loss": 0.7028, + "step": 136 + }, + { + "epoch": 0.43, + "learning_rate": 0.0005892640667867423, + "loss": 0.7275, + "step": 137 + }, + { + "epoch": 0.43, + "learning_rate": 0.00058831285869706, + "loss": 0.6889, + "step": 138 + }, + { + "epoch": 0.43, + "learning_rate": 0.0005873550400843378, + "loss": 0.7891, + "step": 139 + }, + { + "epoch": 0.44, + "learning_rate": 0.0005863906349948074, + "loss": 0.7904, + "step": 140 + }, + { + "epoch": 0.44, + "learning_rate": 0.0005854196676400555, + "loss": 0.6674, + "step": 141 + }, + { + "epoch": 0.44, + "learning_rate": 0.0005844421623964157, + "loss": 0.7352, + "step": 142 + }, + { + "epoch": 0.45, + "learning_rate": 0.0005834581438043563, + "loss": 0.6965, + "step": 143 + }, + { + "epoch": 0.45, + "learning_rate": 0.000582467636567865, + "loss": 0.7238, + "step": 144 + }, + { + "epoch": 0.45, + "learning_rate": 0.0005814706655538279, + "loss": 0.7064, + "step": 145 + }, + { + "epoch": 0.46, + "learning_rate": 0.0005804672557914059, + "loss": 0.6984, + "step": 146 + }, + { + "epoch": 0.46, + "learning_rate": 0.0005794574324714057, + "loss": 0.7594, + "step": 147 + }, + { + "epoch": 0.46, + "learning_rate": 0.0005784412209456479, + "loss": 0.6884, + "step": 148 + }, + { + "epoch": 0.46, + "learning_rate": 0.00057741864672633, + "loss": 0.7141, + "step": 149 + }, + { + "epoch": 0.47, + "learning_rate": 0.0005763897354853866, + "loss": 0.705, + "step": 150 + }, + { + "epoch": 0.47, + "learning_rate": 0.0005753545130538441, + "loss": 0.7613, + "step": 151 + }, + { + "epoch": 0.47, + "learning_rate": 0.0005743130054211732, + "loss": 0.736, + "step": 152 + }, + { + "epoch": 0.48, + "learning_rate": 0.0005732652387346351, + "loss": 0.6814, + "step": 153 + }, + { + "epoch": 0.48, + "learning_rate": 0.0005722112392986265, + "loss": 0.7002, + "step": 154 + }, + { + "epoch": 0.48, + "learning_rate": 0.0005711510335740182, + "loss": 0.7023, + "step": 155 + }, + { + "epoch": 0.49, + "learning_rate": 0.0005700846481774913, + "loss": 0.7617, + "step": 156 + }, + { + "epoch": 0.49, + "learning_rate": 0.0005690121098808687, + "loss": 0.7079, + "step": 157 + }, + { + "epoch": 0.49, + "learning_rate": 0.0005679334456104429, + "loss": 0.7614, + "step": 158 + }, + { + "epoch": 0.5, + "learning_rate": 0.000566848682446301, + "loss": 0.6786, + "step": 159 + }, + { + "epoch": 0.5, + "learning_rate": 0.0005657578476216432, + "loss": 0.6773, + "step": 160 + }, + { + "epoch": 0.5, + "learning_rate": 0.0005646609685221003, + "loss": 0.7085, + "step": 161 + }, + { + "epoch": 0.51, + "learning_rate": 0.0005635580726850462, + "loss": 0.7167, + "step": 162 + }, + { + "epoch": 0.51, + "learning_rate": 0.0005624491877989055, + "loss": 0.7192, + "step": 163 + }, + { + "epoch": 0.51, + "learning_rate": 0.0005613343417024599, + "loss": 0.6761, + "step": 164 + }, + { + 
"epoch": 0.51, + "learning_rate": 0.0005602135623841478, + "loss": 0.7508, + "step": 165 + }, + { + "epoch": 0.52, + "learning_rate": 0.0005590868779813627, + "loss": 0.6978, + "step": 166 + }, + { + "epoch": 0.52, + "learning_rate": 0.0005579543167797467, + "loss": 0.7459, + "step": 167 + }, + { + "epoch": 0.52, + "learning_rate": 0.0005568159072124794, + "loss": 0.7438, + "step": 168 + }, + { + "epoch": 0.53, + "learning_rate": 0.0005556716778595654, + "loss": 0.7073, + "step": 169 + }, + { + "epoch": 0.53, + "learning_rate": 0.0005545216574471164, + "loss": 0.6385, + "step": 170 + }, + { + "epoch": 0.53, + "learning_rate": 0.0005533658748466291, + "loss": 0.6993, + "step": 171 + }, + { + "epoch": 0.54, + "learning_rate": 0.0005522043590742615, + "loss": 0.7258, + "step": 172 + }, + { + "epoch": 0.54, + "learning_rate": 0.0005510371392901041, + "loss": 0.7405, + "step": 173 + }, + { + "epoch": 0.54, + "learning_rate": 0.0005498642447974479, + "loss": 0.7525, + "step": 174 + }, + { + "epoch": 0.55, + "learning_rate": 0.0005486857050420481, + "loss": 0.6639, + "step": 175 + }, + { + "epoch": 0.55, + "learning_rate": 0.0005475015496113861, + "loss": 0.7415, + "step": 176 + }, + { + "epoch": 0.55, + "learning_rate": 0.0005463118082339253, + "loss": 0.7816, + "step": 177 + }, + { + "epoch": 0.56, + "learning_rate": 0.0005451165107783659, + "loss": 0.711, + "step": 178 + }, + { + "epoch": 0.56, + "learning_rate": 0.0005439156872528941, + "loss": 0.7138, + "step": 179 + }, + { + "epoch": 0.56, + "learning_rate": 0.0005427093678044299, + "loss": 0.7069, + "step": 180 + }, + { + "epoch": 0.56, + "learning_rate": 0.0005414975827178688, + "loss": 0.7553, + "step": 181 + }, + { + "epoch": 0.57, + "learning_rate": 0.000540280362415323, + "loss": 0.7045, + "step": 182 + }, + { + "epoch": 0.57, + "learning_rate": 0.0005390577374553561, + "loss": 0.7011, + "step": 183 + }, + { + "epoch": 0.57, + "learning_rate": 0.0005378297385322177, + "loss": 0.7441, + "step": 184 + }, + { + "epoch": 0.58, + "learning_rate": 0.0005365963964750707, + "loss": 0.6797, + "step": 185 + }, + { + "epoch": 0.58, + "learning_rate": 0.0005353577422472196, + "loss": 0.6901, + "step": 186 + }, + { + "epoch": 0.58, + "learning_rate": 0.0005341138069453313, + "loss": 0.7136, + "step": 187 + }, + { + "epoch": 0.59, + "learning_rate": 0.0005328646217986553, + "loss": 0.7459, + "step": 188 + }, + { + "epoch": 0.59, + "learning_rate": 0.0005316102181682396, + "loss": 0.7064, + "step": 189 + }, + { + "epoch": 0.59, + "learning_rate": 0.0005303506275461433, + "loss": 0.6705, + "step": 190 + }, + { + "epoch": 0.6, + "learning_rate": 0.0005290858815546459, + "loss": 0.7008, + "step": 191 + }, + { + "epoch": 0.6, + "learning_rate": 0.0005278160119454536, + "loss": 0.7538, + "step": 192 + }, + { + "epoch": 0.6, + "learning_rate": 0.0005265410505989021, + "loss": 0.7726, + "step": 193 + }, + { + "epoch": 0.61, + "learning_rate": 0.000525261029523156, + "loss": 0.7532, + "step": 194 + }, + { + "epoch": 0.61, + "learning_rate": 0.0005239759808534055, + "loss": 0.6978, + "step": 195 + }, + { + "epoch": 0.61, + "learning_rate": 0.0005226859368510599, + "loss": 0.7182, + "step": 196 + }, + { + "epoch": 0.61, + "learning_rate": 0.0005213909299029368, + "loss": 0.6776, + "step": 197 + }, + { + "epoch": 0.62, + "learning_rate": 0.0005200909925204501, + "loss": 0.7447, + "step": 198 + }, + { + "epoch": 0.62, + "learning_rate": 0.0005187861573387928, + "loss": 0.7298, + "step": 199 + }, + { + "epoch": 0.62, + "learning_rate": 0.0005174764571161185, + 
"loss": 0.6833, + "step": 200 + }, + { + "epoch": 0.63, + "learning_rate": 0.0005161619247327185, + "loss": 0.7518, + "step": 201 + }, + { + "epoch": 0.63, + "learning_rate": 0.0005148425931901961, + "loss": 0.7429, + "step": 202 + }, + { + "epoch": 0.63, + "learning_rate": 0.0005135184956106394, + "loss": 0.763, + "step": 203 + }, + { + "epoch": 0.64, + "learning_rate": 0.000512189665235788, + "loss": 0.7682, + "step": 204 + }, + { + "epoch": 0.64, + "learning_rate": 0.0005108561354261996, + "loss": 0.7063, + "step": 205 + }, + { + "epoch": 0.64, + "learning_rate": 0.0005095179396604121, + "loss": 0.6956, + "step": 206 + }, + { + "epoch": 0.65, + "learning_rate": 0.0005081751115341034, + "loss": 0.7434, + "step": 207 + }, + { + "epoch": 0.65, + "learning_rate": 0.0005068276847592474, + "loss": 0.6673, + "step": 208 + }, + { + "epoch": 0.65, + "learning_rate": 0.0005054756931632682, + "loss": 0.6448, + "step": 209 + }, + { + "epoch": 0.65, + "learning_rate": 0.0005041191706881909, + "loss": 0.7095, + "step": 210 + }, + { + "epoch": 0.66, + "learning_rate": 0.0005027581513897888, + "loss": 0.673, + "step": 211 + }, + { + "epoch": 0.66, + "learning_rate": 0.000501392669436729, + "loss": 0.6363, + "step": 212 + }, + { + "epoch": 0.66, + "learning_rate": 0.0005000227591097145, + "loss": 0.6711, + "step": 213 + }, + { + "epoch": 0.67, + "learning_rate": 0.0004986484548006237, + "loss": 0.6375, + "step": 214 + }, + { + "epoch": 0.67, + "learning_rate": 0.0004972697910116468, + "loss": 0.7466, + "step": 215 + }, + { + "epoch": 0.67, + "learning_rate": 0.0004958868023544192, + "loss": 0.7147, + "step": 216 + }, + { + "epoch": 0.68, + "learning_rate": 0.0004944995235491534, + "loss": 0.714, + "step": 217 + }, + { + "epoch": 0.68, + "learning_rate": 0.0004931079894237669, + "loss": 0.7377, + "step": 218 + }, + { + "epoch": 0.68, + "learning_rate": 0.0004917122349130078, + "loss": 0.7087, + "step": 219 + }, + { + "epoch": 0.69, + "learning_rate": 0.000490312295057578, + "loss": 0.6716, + "step": 220 + }, + { + "epoch": 0.69, + "learning_rate": 0.0004889082050032529, + "loss": 0.7298, + "step": 221 + }, + { + "epoch": 0.69, + "learning_rate": 0.0004875, + "loss": 0.6557, + "step": 222 + }, + { + "epoch": 0.7, + "learning_rate": 0.0004860877154010932, + "loss": 0.7042, + "step": 223 + }, + { + "epoch": 0.7, + "learning_rate": 0.00048467138666222534, + "loss": 0.6617, + "step": 224 + }, + { + "epoch": 0.7, + "learning_rate": 0.00048325104934061853, + "loss": 0.7019, + "step": 225 + }, + { + "epoch": 0.7, + "learning_rate": 0.00048182673909413103, + "loss": 0.6756, + "step": 226 + }, + { + "epoch": 0.71, + "learning_rate": 0.00048039849168036205, + "loss": 0.709, + "step": 227 + }, + { + "epoch": 0.71, + "learning_rate": 0.00047896634295575434, + "loss": 0.7434, + "step": 228 + }, + { + "epoch": 0.71, + "learning_rate": 0.00047753032887469385, + "loss": 0.7533, + "step": 229 + }, + { + "epoch": 0.72, + "learning_rate": 0.0004760904854886072, + "loss": 0.7019, + "step": 230 + }, + { + "epoch": 0.72, + "learning_rate": 0.0004746468489450562, + "loss": 0.6852, + "step": 231 + }, + { + "epoch": 0.72, + "learning_rate": 0.0004731994554868307, + "loss": 0.7228, + "step": 232 + }, + { + "epoch": 0.73, + "learning_rate": 0.000471748341451039, + "loss": 0.7513, + "step": 233 + }, + { + "epoch": 0.73, + "learning_rate": 0.0004702935432681949, + "loss": 0.6896, + "step": 234 + }, + { + "epoch": 0.73, + "learning_rate": 0.0004688350974613038, + "loss": 0.6815, + "step": 235 + }, + { + "epoch": 0.74, + 
"learning_rate": 0.0004673730406449449, + "loss": 0.7682, + "step": 236 + }, + { + "epoch": 0.74, + "learning_rate": 0.00046590740952435323, + "loss": 0.7025, + "step": 237 + }, + { + "epoch": 0.74, + "learning_rate": 0.0004644382408944968, + "loss": 0.6662, + "step": 238 + }, + { + "epoch": 0.75, + "learning_rate": 0.00046296557163915395, + "loss": 0.7541, + "step": 239 + }, + { + "epoch": 0.75, + "learning_rate": 0.0004614894387299867, + "loss": 0.7336, + "step": 240 + }, + { + "epoch": 0.75, + "learning_rate": 0.0004600098792256131, + "loss": 0.6618, + "step": 241 + }, + { + "epoch": 0.75, + "learning_rate": 0.0004585269302706762, + "loss": 0.6729, + "step": 242 + }, + { + "epoch": 0.76, + "learning_rate": 0.0004570406290949121, + "loss": 0.7327, + "step": 243 + }, + { + "epoch": 0.76, + "learning_rate": 0.0004555510130122151, + "loss": 0.6778, + "step": 244 + }, + { + "epoch": 0.76, + "learning_rate": 0.0004540581194197008, + "loss": 0.6219, + "step": 245 + }, + { + "epoch": 0.77, + "learning_rate": 0.00045256198579676755, + "loss": 0.6984, + "step": 246 + }, + { + "epoch": 0.77, + "learning_rate": 0.000451062649704155, + "loss": 0.637, + "step": 247 + }, + { + "epoch": 0.77, + "learning_rate": 0.000449560148783002, + "loss": 0.658, + "step": 248 + }, + { + "epoch": 0.78, + "learning_rate": 0.0004480545207539004, + "loss": 0.7305, + "step": 249 + }, + { + "epoch": 0.78, + "learning_rate": 0.0004465458034159491, + "loss": 0.6788, + "step": 250 + }, + { + "epoch": 0.78, + "learning_rate": 0.00044503403464580475, + "loss": 0.7096, + "step": 251 + }, + { + "epoch": 0.79, + "learning_rate": 0.00044351925239673087, + "loss": 0.7108, + "step": 252 + }, + { + "epoch": 0.79, + "learning_rate": 0.0004420014946976447, + "loss": 0.6518, + "step": 253 + }, + { + "epoch": 0.79, + "learning_rate": 0.00044048079965216294, + "loss": 0.7262, + "step": 254 + }, + { + "epoch": 0.8, + "learning_rate": 0.0004389572054376452, + "loss": 0.6988, + "step": 255 + }, + { + "epoch": 0.8, + "learning_rate": 0.00043743075030423475, + "loss": 0.6637, + "step": 256 + }, + { + "epoch": 0.8, + "learning_rate": 0.0004359014725738994, + "loss": 0.7055, + "step": 257 + }, + { + "epoch": 0.8, + "learning_rate": 0.00043436941063946843, + "loss": 0.7179, + "step": 258 + }, + { + "epoch": 0.81, + "learning_rate": 0.0004328346029636694, + "loss": 0.6955, + "step": 259 + }, + { + "epoch": 0.81, + "learning_rate": 0.0004312970880781621, + "loss": 0.6749, + "step": 260 + }, + { + "epoch": 0.81, + "learning_rate": 0.0004297569045825713, + "loss": 0.6711, + "step": 261 + }, + { + "epoch": 0.82, + "learning_rate": 0.00042821409114351803, + "loss": 0.6366, + "step": 262 + }, + { + "epoch": 0.82, + "learning_rate": 0.00042666868649364844, + "loss": 0.7144, + "step": 263 + }, + { + "epoch": 0.82, + "learning_rate": 0.0004251207294306617, + "loss": 0.656, + "step": 264 + }, + { + "epoch": 0.83, + "learning_rate": 0.00042357025881633535, + "loss": 0.6803, + "step": 265 + }, + { + "epoch": 0.83, + "learning_rate": 0.00042201731357555073, + "loss": 0.7044, + "step": 266 + }, + { + "epoch": 0.83, + "learning_rate": 0.0004204619326953149, + "loss": 0.6488, + "step": 267 + }, + { + "epoch": 0.84, + "learning_rate": 0.00041890415522378223, + "loss": 0.6928, + "step": 268 + }, + { + "epoch": 0.84, + "learning_rate": 0.00041734402026927394, + "loss": 0.6764, + "step": 269 + }, + { + "epoch": 0.84, + "learning_rate": 0.00041578156699929636, + "loss": 0.6278, + "step": 270 + }, + { + "epoch": 0.85, + "learning_rate": 0.0004142168346395577, + 
"loss": 0.691, + "step": 271 + }, + { + "epoch": 0.85, + "learning_rate": 0.0004126498624729829, + "loss": 0.6865, + "step": 272 + }, + { + "epoch": 0.85, + "learning_rate": 0.000411080689838728, + "loss": 0.6715, + "step": 273 + }, + { + "epoch": 0.85, + "learning_rate": 0.00040950935613119226, + "loss": 0.6563, + "step": 274 + }, + { + "epoch": 0.86, + "learning_rate": 0.00040793590079902885, + "loss": 0.7608, + "step": 275 + }, + { + "epoch": 0.86, + "learning_rate": 0.00040636036334415487, + "loss": 0.6189, + "step": 276 + }, + { + "epoch": 0.86, + "learning_rate": 0.0004047827833207597, + "loss": 0.6981, + "step": 277 + }, + { + "epoch": 0.87, + "learning_rate": 0.0004032032003343117, + "loss": 0.644, + "step": 278 + }, + { + "epoch": 0.87, + "learning_rate": 0.0004016216540405639, + "loss": 0.7286, + "step": 279 + }, + { + "epoch": 0.87, + "learning_rate": 0.0004000381841445586, + "loss": 0.6694, + "step": 280 + }, + { + "epoch": 0.88, + "learning_rate": 0.00039845283039963093, + "loss": 0.7204, + "step": 281 + }, + { + "epoch": 0.88, + "learning_rate": 0.0003968656326064099, + "loss": 0.7042, + "step": 282 + }, + { + "epoch": 0.88, + "learning_rate": 0.00039527663061181983, + "loss": 0.712, + "step": 283 + }, + { + "epoch": 0.89, + "learning_rate": 0.00039368586430808014, + "loss": 0.7179, + "step": 284 + }, + { + "epoch": 0.89, + "learning_rate": 0.00039209337363170347, + "loss": 0.6903, + "step": 285 + }, + { + "epoch": 0.89, + "learning_rate": 0.00039049919856249315, + "loss": 0.6924, + "step": 286 + }, + { + "epoch": 0.9, + "learning_rate": 0.0003889033791225395, + "loss": 0.6713, + "step": 287 + }, + { + "epoch": 0.9, + "learning_rate": 0.000387305955375215, + "loss": 0.7852, + "step": 288 + }, + { + "epoch": 0.9, + "learning_rate": 0.0003857069674241689, + "loss": 0.6517, + "step": 289 + }, + { + "epoch": 0.9, + "learning_rate": 0.00038410645541232, + "loss": 0.6764, + "step": 290 + }, + { + "epoch": 0.91, + "learning_rate": 0.0003825044595208488, + "loss": 0.7183, + "step": 291 + }, + { + "epoch": 0.91, + "learning_rate": 0.000380901019968189, + "loss": 0.6826, + "step": 292 + }, + { + "epoch": 0.91, + "learning_rate": 0.0003792961770090178, + "loss": 0.6936, + "step": 293 + }, + { + "epoch": 0.92, + "learning_rate": 0.0003776899709332449, + "loss": 0.718, + "step": 294 + }, + { + "epoch": 0.92, + "learning_rate": 0.00037608244206500176, + "loss": 0.6795, + "step": 295 + }, + { + "epoch": 0.92, + "learning_rate": 0.00037447363076162853, + "loss": 0.6517, + "step": 296 + }, + { + "epoch": 0.93, + "learning_rate": 0.0003728635774126613, + "loss": 0.6849, + "step": 297 + }, + { + "epoch": 0.93, + "learning_rate": 0.0003712523224388177, + "loss": 0.6663, + "step": 298 + }, + { + "epoch": 0.93, + "learning_rate": 0.00036963990629098264, + "loss": 0.6585, + "step": 299 + }, + { + "epoch": 0.94, + "learning_rate": 0.0003680263694491925, + "loss": 0.7054, + "step": 300 + }, + { + "epoch": 0.94, + "learning_rate": 0.00036641175242161907, + "loss": 0.6662, + "step": 301 + }, + { + "epoch": 0.94, + "learning_rate": 0.000364796095743552, + "loss": 0.6306, + "step": 302 + }, + { + "epoch": 0.95, + "learning_rate": 0.00036317943997638187, + "loss": 0.6544, + "step": 303 + }, + { + "epoch": 0.95, + "learning_rate": 0.0003615618257065817, + "loss": 0.7078, + "step": 304 + }, + { + "epoch": 0.95, + "learning_rate": 0.00035994329354468763, + "loss": 0.6511, + "step": 305 + }, + { + "epoch": 0.95, + "learning_rate": 0.00035832388412427983, + "loss": 0.668, + "step": 306 + }, + { + "epoch": 
0.96, + "learning_rate": 0.00035670363810096214, + "loss": 0.678, + "step": 307 + }, + { + "epoch": 0.96, + "learning_rate": 0.0003550825961513418, + "loss": 0.6596, + "step": 308 + }, + { + "epoch": 0.96, + "learning_rate": 0.00035346079897200736, + "loss": 0.674, + "step": 309 + }, + { + "epoch": 0.97, + "learning_rate": 0.00035183828727850804, + "loss": 0.6888, + "step": 310 + }, + { + "epoch": 0.97, + "learning_rate": 0.0003502151018043309, + "loss": 0.6864, + "step": 311 + }, + { + "epoch": 0.97, + "learning_rate": 0.0003485912832998785, + "loss": 0.6576, + "step": 312 + }, + { + "epoch": 0.98, + "learning_rate": 0.0003469668725314458, + "loss": 0.6989, + "step": 313 + }, + { + "epoch": 0.98, + "learning_rate": 0.0003453419102801962, + "loss": 0.6519, + "step": 314 + }, + { + "epoch": 0.98, + "learning_rate": 0.0003437164373411389, + "loss": 0.6754, + "step": 315 + }, + { + "epoch": 0.99, + "learning_rate": 0.00034209049452210347, + "loss": 0.6706, + "step": 316 + }, + { + "epoch": 0.99, + "learning_rate": 0.0003404641226427163, + "loss": 0.7295, + "step": 317 + }, + { + "epoch": 0.99, + "learning_rate": 0.000338837362533375, + "loss": 0.7137, + "step": 318 + }, + { + "epoch": 0.99, + "learning_rate": 0.0003372102550342242, + "loss": 0.7131, + "step": 319 + }, + { + "epoch": 1.0, + "learning_rate": 0.0003355828409941296, + "loss": 0.6404, + "step": 320 + }, + { + "epoch": 1.0, + "learning_rate": 0.00033395516126965267, + "loss": 0.6896, + "step": 321 + }, + { + "epoch": 1.0, + "learning_rate": 0.0003323272567240249, + "loss": 0.5439, + "step": 322 + }, + { + "epoch": 1.01, + "learning_rate": 0.0003306991682261223, + "loss": 0.5435, + "step": 323 + }, + { + "epoch": 1.01, + "learning_rate": 0.0003290709366494386, + "loss": 0.5861, + "step": 324 + }, + { + "epoch": 1.01, + "learning_rate": 0.0003274426028710596, + "loss": 0.5743, + "step": 325 + }, + { + "epoch": 1.02, + "learning_rate": 0.0003258142077706373, + "loss": 0.4928, + "step": 326 + }, + { + "epoch": 1.02, + "learning_rate": 0.0003241857922293627, + "loss": 0.5045, + "step": 327 + }, + { + "epoch": 1.02, + "learning_rate": 0.00032255739712894036, + "loss": 0.5733, + "step": 328 + }, + { + "epoch": 1.03, + "learning_rate": 0.00032092906335056147, + "loss": 0.517, + "step": 329 + }, + { + "epoch": 1.03, + "learning_rate": 0.00031930083177387765, + "loss": 0.5836, + "step": 330 + }, + { + "epoch": 1.03, + "learning_rate": 0.0003176727432759751, + "loss": 0.5745, + "step": 331 + }, + { + "epoch": 1.04, + "learning_rate": 0.00031604483873034735, + "loss": 0.5165, + "step": 332 + }, + { + "epoch": 1.04, + "learning_rate": 0.0003144171590058705, + "loss": 0.5716, + "step": 333 + }, + { + "epoch": 1.04, + "learning_rate": 0.0003127897449657758, + "loss": 0.5605, + "step": 334 + }, + { + "epoch": 1.04, + "learning_rate": 0.0003111626374666249, + "loss": 0.5454, + "step": 335 + }, + { + "epoch": 1.05, + "learning_rate": 0.00030953587735728377, + "loss": 0.5743, + "step": 336 + }, + { + "epoch": 1.05, + "learning_rate": 0.0003079095054778965, + "loss": 0.5874, + "step": 337 + }, + { + "epoch": 1.05, + "learning_rate": 0.0003062835626588612, + "loss": 0.5518, + "step": 338 + }, + { + "epoch": 1.06, + "learning_rate": 0.0003046580897198038, + "loss": 0.5589, + "step": 339 + }, + { + "epoch": 1.06, + "learning_rate": 0.00030303312746855434, + "loss": 0.5639, + "step": 340 + }, + { + "epoch": 1.06, + "learning_rate": 0.0003014087167001215, + "loss": 0.5249, + "step": 341 + }, + { + "epoch": 1.07, + "learning_rate": 0.00029978489819566903, + 
"loss": 0.5328, + "step": 342 + }, + { + "epoch": 1.07, + "learning_rate": 0.000298161712721492, + "loss": 0.5494, + "step": 343 + }, + { + "epoch": 1.07, + "learning_rate": 0.00029653920102799266, + "loss": 0.5593, + "step": 344 + }, + { + "epoch": 1.08, + "learning_rate": 0.00029491740384865835, + "loss": 0.5149, + "step": 345 + }, + { + "epoch": 1.08, + "learning_rate": 0.00029329636189903783, + "loss": 0.5434, + "step": 346 + }, + { + "epoch": 1.08, + "learning_rate": 0.00029167611587572014, + "loss": 0.5099, + "step": 347 + }, + { + "epoch": 1.09, + "learning_rate": 0.0002900567064553124, + "loss": 0.5175, + "step": 348 + }, + { + "epoch": 1.09, + "learning_rate": 0.00028843817429341826, + "loss": 0.5598, + "step": 349 + }, + { + "epoch": 1.09, + "learning_rate": 0.00028682056002361816, + "loss": 0.524, + "step": 350 + } + ], + "logging_steps": 1, + "max_steps": 640, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "total_flos": 9.080037171088589e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-350/training_args.bin b/checkpoint-350/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d220d7d4abfdc29d8393f3089d80acf2e32cf7d7 --- /dev/null +++ b/checkpoint-350/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d688198fe54ccac5c0a98d11fb9e7224690ace94f7e483ba1d16db91cf33a5c4 +size 4664 diff --git a/checkpoint-50/README.md b/checkpoint-50/README.md new file mode 100644 index 0000000000000000000000000000000000000000..69eced3e4fa0cffb8a4f749d70d2150b3b6038f0 --- /dev/null +++ b/checkpoint-50/README.md @@ -0,0 +1,204 @@ +--- +library_name: peft +base_model: NousResearch/Llama-2-13b-hf +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] + + +### Framework versions + +- PEFT 0.7.1 \ No newline at end of file diff --git a/checkpoint-50/adapter_config.json b/checkpoint-50/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..089f20ebe8d421867e642bbf0b75c66dc24a7f5f --- /dev/null +++ b/checkpoint-50/adapter_config.json @@ -0,0 +1,31 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "NousResearch/Llama-2-13b-hf", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "up_proj", + "k_proj", + "gate_proj", + "down_proj", + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/checkpoint-50/adapter_model.safetensors b/checkpoint-50/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..eca4f31a9d0fdc53c02305501eb676dcfb4a4352 --- /dev/null +++ b/checkpoint-50/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c043bb9b4116116f476e23080ee1c4fa1dd3a2dee316c356137d4e76eb8200b4 +size 2002857080 diff --git a/checkpoint-50/optimizer.pt b/checkpoint-50/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d54498b8cb08429bbfe3850b006bff1b35c2c73a --- /dev/null +++ b/checkpoint-50/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e43d68b79e7510b2b2084dfa62c7f0f425b9e2799d9f3c816b4d366b3b79128e +size 1004004436 diff --git a/checkpoint-50/rng_state.pth b/checkpoint-50/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1f8291cd6ce87668b786a72f3e93d072fbe54902 --- /dev/null +++ b/checkpoint-50/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4c917636c7a58af68a29056522a757e9f9b99005b776641aa157c536967817d +size 14244 diff --git a/checkpoint-50/scheduler.pt b/checkpoint-50/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..29fbd2f078dbf789579a5362c06aa5e193391025 --- /dev/null +++ b/checkpoint-50/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee2ce89ed7c0a337b7134e240b52d4db7bbe1a6fd5d4c7dae4a889ca774c1f33 +size 1064 diff --git a/checkpoint-50/trainer_state.json b/checkpoint-50/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d60cf598849c0eebe95e0fc52c859715b69a2b12 --- /dev/null +++ b/checkpoint-50/trainer_state.json @@ -0,0 +1,321 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.15594541910331383, + "eval_steps": 500, + "global_step": 50, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5e-05, + "loss": 1.0506, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 0.0001, + "loss": 0.9988, + "step": 2 + }, + { + "epoch": 0.01, + "learning_rate": 0.00015000000000000001, + "loss": 0.9783, + "step": 3 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002, + "loss": 0.9849, + "step": 4 + }, + { + "epoch": 0.02, + "learning_rate": 0.00025, + "loss": 1.0159, + "step": 5 + }, + { + "epoch": 0.02, + "learning_rate": 0.00030000000000000003, + "loss": 0.9847, + "step": 6 + }, + { + "epoch": 0.02, + "learning_rate": 0.00034999999999999994, + "loss": 0.9101, + "step": 7 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 0.9445, + "step": 8 + }, + { + "epoch": 0.03, + "learning_rate": 0.00045, + "loss": 0.8578, + "step": 9 + }, + { + "epoch": 0.03, + "learning_rate": 0.0005, + "loss": 0.9356, + "step": 10 + }, + { + "epoch": 0.03, + "learning_rate": 0.0005499999999999999, + "loss": 0.8395, + "step": 11 + }, + { + "epoch": 0.04, + "learning_rate": 0.0006000000000000001, + "loss": 0.9002, + "step": 12 + }, + { + "epoch": 0.04, + "learning_rate": 0.00065, + "loss": 0.8955, + "step": 13 + }, + { + "epoch": 0.04, + "learning_rate": 0.0006499959204043461, + "loss": 0.902, + "step": 14 + }, + { + "epoch": 0.05, + "learning_rate": 0.0006499836817198032, + "loss": 0.8578, + "step": 15 + }, + { + "epoch": 0.05, + "learning_rate": 0.0006499632842536263, + "loss": 0.9005, + "step": 16 + }, + { + "epoch": 0.05, + "learning_rate": 0.0006499347285178979, + "loss": 0.8539, + "step": 17 + }, + { + "epoch": 0.06, + "learning_rate": 0.0006498980152295153, + "loss": 0.8595, + "step": 18 + }, + { + "epoch": 0.06, + "learning_rate": 0.0006498531453101735, + "loss": 0.8845, + "step": 19 + }, + { + "epoch": 0.06, + "learning_rate": 0.0006498001198863406, + "loss": 0.8924, + "step": 20 + }, + { + "epoch": 0.07, + "learning_rate": 0.000649738940289231, + "loss": 0.8365, + "step": 21 + }, + { + "epoch": 0.07, + "learning_rate": 0.0006496696080547707, + "loss": 0.8462, + "step": 22 + }, + { + "epoch": 0.07, + "learning_rate": 0.0006495921249235596, + "loss": 0.8528, + "step": 23 + }, + { + "epoch": 0.07, + "learning_rate": 
0.0006495064928408277, + "loss": 0.8159, + "step": 24 + }, + { + "epoch": 0.08, + "learning_rate": 0.0006494127139563859, + "loss": 0.8245, + "step": 25 + }, + { + "epoch": 0.08, + "learning_rate": 0.000649310790624572, + "loss": 0.8081, + "step": 26 + }, + { + "epoch": 0.08, + "learning_rate": 0.0006492007254041924, + "loss": 0.8535, + "step": 27 + }, + { + "epoch": 0.09, + "learning_rate": 0.0006490825210584566, + "loss": 0.8162, + "step": 28 + }, + { + "epoch": 0.09, + "learning_rate": 0.0006489561805549089, + "loss": 0.8456, + "step": 29 + }, + { + "epoch": 0.09, + "learning_rate": 0.0006488217070653535, + "loss": 0.7799, + "step": 30 + }, + { + "epoch": 0.1, + "learning_rate": 0.0006486791039657748, + "loss": 0.8088, + "step": 31 + }, + { + "epoch": 0.1, + "learning_rate": 0.0006485283748362524, + "loss": 0.8683, + "step": 32 + }, + { + "epoch": 0.1, + "learning_rate": 0.0006483695234608723, + "loss": 0.8871, + "step": 33 + }, + { + "epoch": 0.11, + "learning_rate": 0.0006482025538276304, + "loss": 0.7711, + "step": 34 + }, + { + "epoch": 0.11, + "learning_rate": 0.0006480274701283335, + "loss": 0.7621, + "step": 35 + }, + { + "epoch": 0.11, + "learning_rate": 0.0006478442767584937, + "loss": 0.8243, + "step": 36 + }, + { + "epoch": 0.12, + "learning_rate": 0.0006476529783172177, + "loss": 0.8257, + "step": 37 + }, + { + "epoch": 0.12, + "learning_rate": 0.0006474535796070919, + "loss": 0.8141, + "step": 38 + }, + { + "epoch": 0.12, + "learning_rate": 0.0006472460856340619, + "loss": 0.8109, + "step": 39 + }, + { + "epoch": 0.12, + "learning_rate": 0.000647030501607306, + "loss": 0.7873, + "step": 40 + }, + { + "epoch": 0.13, + "learning_rate": 0.000646806832939105, + "loss": 0.7386, + "step": 41 + }, + { + "epoch": 0.13, + "learning_rate": 0.0006465750852447068, + "loss": 0.8636, + "step": 42 + }, + { + "epoch": 0.13, + "learning_rate": 0.0006463352643421846, + "loss": 0.7357, + "step": 43 + }, + { + "epoch": 0.14, + "learning_rate": 0.0006460873762522906, + "loss": 0.8142, + "step": 44 + }, + { + "epoch": 0.14, + "learning_rate": 0.0006458314271983063, + "loss": 0.7275, + "step": 45 + }, + { + "epoch": 0.14, + "learning_rate": 0.0006455674236058847, + "loss": 0.8029, + "step": 46 + }, + { + "epoch": 0.15, + "learning_rate": 0.00064529537210289, + "loss": 0.7901, + "step": 47 + }, + { + "epoch": 0.15, + "learning_rate": 0.0006450152795192307, + "loss": 0.7788, + "step": 48 + }, + { + "epoch": 0.15, + "learning_rate": 0.0006447271528866881, + "loss": 0.7621, + "step": 49 + }, + { + "epoch": 0.16, + "learning_rate": 0.0006444309994387402, + "loss": 0.7537, + "step": 50 + } + ], + "logging_steps": 1, + "max_steps": 640, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "total_flos": 1.2960384418676736e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-50/training_args.bin b/checkpoint-50/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d220d7d4abfdc29d8393f3089d80acf2e32cf7d7 --- /dev/null +++ b/checkpoint-50/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d688198fe54ccac5c0a98d11fb9e7224690ace94f7e483ba1d16db91cf33a5c4 +size 4664
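The trainer_state.json files above record one entry per logged step (learning rate and loss) in their log_history arrays. A minimal sketch of inspecting that structure, assuming a local copy of one of the checkpoint directories shown above (the path is a placeholder):

```python
# Minimal sketch: summarize the loss and learning-rate trajectory recorded in
# a checkpoint's trainer_state.json. The path below is a placeholder.
import json

with open("checkpoint-50/trainer_state.json") as f:
    state = json.load(f)

history = state["log_history"]  # list of dicts, one per logged step
print("steps logged:", len(history))
print("first loss:", history[0]["loss"], "| last loss:", history[-1]["loss"])

# Learning-rate values as recorded by the scheduler at each step.
lrs = [entry["learning_rate"] for entry in history]
print("lr range:", min(lrs), "to", max(lrs))
```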