diff --git a/README.md b/README.md index 7b95401dc46245ac339fc25059d4a56d90b4cde5..0cb113cb3ce645fcca40180da47ba9b7bc8e5f20 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,156 @@ ---- -license: apache-2.0 ---- +--- +license: apache-2.0 +library_name: peft +tags: +- generated_from_trainer +base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T +model-index: +- name: outputs/lora-out + results: [] +--- + + + +[Built with Axolotl](https://github.com/OpenAccess-AI-Collective/axolotl) +
See axolotl config + +axolotl version: `0.4.0` +```yaml +base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T +model_type: LlamaForCausalLM +tokenizer_type: LlamaTokenizer + +load_in_8bit: true +load_in_4bit: false +strict: false + +datasets: + - path: burkelibbey/colors + type: + field_instruction: color + field_output: description + conversation: chatml +chat_template: chatml + +dataset_prepared_path: +val_set_size: 0.05 +output_dir: ./outputs/lora-out + +sequence_len: 4096 +sample_packing: true +eval_sample_packing: false +pad_to_sequence_len: true + +adapter: lora +lora_model_dir: +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_linear: true +lora_fan_in_fan_out: + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 4 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +train_on_inputs: false +group_by_length: false +bf16: auto +fp16: +tf32: false + +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: true + +warmup_steps: 10 +evals_per_epoch: 4 +saves_per_epoch: 1 +debug: +deepspeed: +weight_decay: 0.0 +fsdp: +fsdp_config: +special_tokens: + +``` + +

+ +# outputs/lora-out + +This model is a fine-tuned version of [TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T](https://huggingface.co/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T) on the None dataset. +It achieves the following results on the evaluation set: +- Loss: 1.2375 + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 0.0002 +- train_batch_size: 2 +- eval_batch_size: 2 +- seed: 42 +- gradient_accumulation_steps: 4 +- total_train_batch_size: 8 +- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08 +- lr_scheduler_type: cosine +- lr_scheduler_warmup_steps: 10 +- num_epochs: 4 + +### Training results + +| Training Loss | Epoch | Step | Validation Loss | +|:-------------:|:------:|:----:|:---------------:| +| 2.7509 | 0.0204 | 1 | 2.6902 | +| 1.8064 | 0.2653 | 13 | 1.6735 | +| 1.5513 | 0.5306 | 26 | 1.4832 | +| 1.482 | 0.7959 | 39 | 1.4111 | +| 1.392 | 1.0408 | 52 | 1.3677 | +| 1.3414 | 1.3061 | 65 | 1.3319 | +| 1.3213 | 1.5714 | 78 | 1.3029 | +| 1.3028 | 1.8367 | 91 | 1.2795 | +| 1.2761 | 2.0816 | 104 | 1.2697 | +| 1.2509 | 2.3469 | 117 | 1.2587 | +| 1.2884 | 2.6122 | 130 | 1.2472 | +| 1.254 | 2.8776 | 143 | 1.2410 | +| 1.2523 | 3.1224 | 156 | 1.2403 | +| 1.2468 | 3.3878 | 169 | 1.2385 | +| 1.2476 | 3.6531 | 182 | 1.2370 | +| 1.2366 | 3.9184 | 195 | 1.2375 | + + +### Framework versions + +- PEFT 0.10.0 +- Transformers 4.40.2 +- Pytorch 2.1.2+cu118 +- Datasets 2.19.1 +- Tokenizers 0.19.1 \ No newline at end of file diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3a88eecac3a652d9c19fe6d8e555dcac4859ccd2 --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "down_proj", + "k_proj", + "q_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/adapter_model.bin b/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..6bd33a9c099c600f1f3e9c7f8d607fe50db66f7e --- /dev/null +++ b/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1f20d120db2b5b6953281cb7fa6e550c36182e6da8f44b598738a5995d5be6f +size 101036698 diff --git a/checkpoint-147/README.md b/checkpoint-147/README.md new file mode 100644 index 0000000000000000000000000000000000000000..78aadd3258d05c05cabc678473b6f0942cd441c7 --- /dev/null +++ b/checkpoint-147/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.10.0 \ No newline at end of file diff --git a/checkpoint-147/adapter_config.json b/checkpoint-147/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3a88eecac3a652d9c19fe6d8e555dcac4859ccd2 --- /dev/null +++ b/checkpoint-147/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "down_proj", + "k_proj", + "q_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-147/adapter_model.safetensors b/checkpoint-147/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..152ab9385147f50b5fe7e616f8f39ac7695d27ac --- /dev/null +++ b/checkpoint-147/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb2ebdaf4b36ef443d056e4e52b5f0bf8223038232557b97bb7ce888df4d3c48 +size 100966336 diff --git a/checkpoint-147/optimizer.pt b/checkpoint-147/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7bb6a65695119caab2edb42b70a1b5714b780127 --- /dev/null +++ b/checkpoint-147/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88cf550811bb96f9852bdb7a8952d49f6f0bf413e95b0759a8db28fcab406988 +size 50916644 diff --git a/checkpoint-147/rng_state.pth b/checkpoint-147/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b7ac6c67855d700e14ab4f2e12228c273b396659 --- /dev/null +++ b/checkpoint-147/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4e8611d6bcf761201e741bdb2188a6ac976702d2e3f1a3ecc21fff90ea8a001 +size 14244 diff --git a/checkpoint-147/scheduler.pt b/checkpoint-147/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..097460d4c90beb69cee0fa370724efdc76d22114 --- /dev/null +++ b/checkpoint-147/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83c1e2e1bea1da15cd4a47196fc191277510622d916f0b4b5e8c95f3258d5825 +size 1064 diff --git a/checkpoint-147/special_tokens_map.json b/checkpoint-147/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1 --- /dev/null +++ b/checkpoint-147/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-147/tokenizer.model b/checkpoint-147/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-147/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-147/tokenizer_config.json b/checkpoint-147/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0da2d3613b2ca0d42b48dea2162c06006f52b837 --- /dev/null +++ b/checkpoint-147/tokenizer_config.json @@ -0,0 +1,45 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": true +} diff --git a/checkpoint-147/trainer_state.json b/checkpoint-147/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d0ffdde8040e0fba5f1aa6e5c1dc5908f2b58d9e --- /dev/null +++ b/checkpoint-147/trainer_state.json @@ -0,0 +1,1146 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9591836734693877, + "eval_steps": 13, + "global_step": 147, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02040816326530612, + "grad_norm": 0.7881951332092285, + "learning_rate": 2e-05, + "loss": 2.7509, + "step": 1 + }, + { + "epoch": 0.02040816326530612, + "eval_loss": 2.6902382373809814, + "eval_runtime": 269.5606, + "eval_samples_per_second": 6.288, + "eval_steps_per_second": 3.146, + "step": 1 + }, + { + "epoch": 0.04081632653061224, + "grad_norm": 0.789082407951355, + "learning_rate": 4e-05, + "loss": 2.7449, + "step": 2 + }, + { + "epoch": 0.061224489795918366, + "grad_norm": 0.7354114055633545, + "learning_rate": 6e-05, + "loss": 2.7164, + "step": 3 + }, + { + "epoch": 0.08163265306122448, + "grad_norm": 0.7292255759239197, + "learning_rate": 8e-05, + "loss": 2.7174, + "step": 4 + }, + { + "epoch": 0.10204081632653061, + "grad_norm": 0.6898028254508972, + "learning_rate": 0.0001, + "loss": 2.6891, + "step": 5 + }, + { + "epoch": 0.12244897959183673, + "grad_norm": 0.6861400604248047, + "learning_rate": 0.00012, + "loss": 2.6545, + "step": 6 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 0.7510350346565247, + "learning_rate": 0.00014, + "loss": 2.5656, + "step": 7 + }, + { + "epoch": 0.16326530612244897, + "grad_norm": 0.8011165261268616, + "learning_rate": 0.00016, + "loss": 2.4519, + "step": 8 + }, + { + "epoch": 0.1836734693877551, + "grad_norm": 0.8624005317687988, + "learning_rate": 0.00018, + "loss": 2.3178, + "step": 9 + }, + { + "epoch": 0.20408163265306123, + "grad_norm": 0.8004987835884094, + "learning_rate": 0.0002, + "loss": 2.1783, + "step": 10 + }, + { + "epoch": 0.22448979591836735, + "grad_norm": 0.6362400054931641, + "learning_rate": 0.000199985736255971, + "loss": 2.0252, + "step": 11 + }, + { + "epoch": 0.24489795918367346, + "grad_norm": 0.7930936217308044, + "learning_rate": 0.0001999429490929718, + "loss": 1.8839, + "step": 12 + }, + { + "epoch": 0.2653061224489796, + "grad_norm": 0.5149843096733093, + "learning_rate": 0.00019987165071710527, + "loss": 1.8064, + "step": 13 + }, + { + "epoch": 0.2653061224489796, + "eval_loss": 1.6734941005706787, + "eval_runtime": 271.2615, + "eval_samples_per_second": 6.249, + "eval_steps_per_second": 3.126, + "step": 13 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.42121434211730957, + "learning_rate": 0.00019977186146800707, + "loss": 1.7922, + "step": 14 + }, + { + "epoch": 0.30612244897959184, + "grad_norm": 0.3523242771625519, + "learning_rate": 0.0001996436098130433, + "loss": 1.7711, + "step": 15 + }, + { + "epoch": 0.32653061224489793, + "grad_norm": 0.3384595215320587, + "learning_rate": 0.00019948693233918952, + "loss": 1.7152, + "step": 16 + }, + { + "epoch": 0.3469387755102041, + "grad_norm": 0.34942421317100525, + "learning_rate": 0.00019930187374259337, + "loss": 1.7112, + "step": 17 + }, + { + "epoch": 0.3673469387755102, + "grad_norm": 0.31712639331817627, + "learning_rate": 0.00019908848681582391, + "loss": 1.7059, + "step": 18 + }, + { + "epoch": 0.3877551020408163, + "grad_norm": 0.2875436842441559, + "learning_rate": 0.00019884683243281116, + "loss": 1.6468, + "step": 19 + }, + { + "epoch": 0.40816326530612246, + "grad_norm": 0.24433130025863647, + "learning_rate": 0.00019857697953148037, + "loss": 1.6408, + "step": 20 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 0.21414674818515778, + "learning_rate": 0.00019827900509408581, + "loss": 1.616, + "step": 21 + }, + { + "epoch": 0.4489795918367347, + "grad_norm": 0.21537622809410095, + "learning_rate": 0.00019795299412524945, + "loss": 1.609, + "step": 22 + }, + { + "epoch": 0.46938775510204084, + "grad_norm": 0.2432074397802353, + "learning_rate": 0.00019759903962771156, + "loss": 1.6066, + "step": 23 + }, + { + "epoch": 0.4897959183673469, + "grad_norm": 0.2359839379787445, + "learning_rate": 0.00019721724257579907, + "loss": 1.5851, + "step": 24 + }, + { + "epoch": 0.5102040816326531, + "grad_norm": 0.22065888345241547, + "learning_rate": 0.00019680771188662044, + "loss": 1.5739, + "step": 25 + }, + { + "epoch": 0.5306122448979592, + "grad_norm": 0.20339132845401764, + "learning_rate": 0.0001963705643889941, + "loss": 1.5513, + "step": 26 + }, + { + "epoch": 0.5306122448979592, + "eval_loss": 1.4832030534744263, + "eval_runtime": 271.2449, + "eval_samples_per_second": 6.249, + "eval_steps_per_second": 3.126, + "step": 26 + }, + { + "epoch": 0.5510204081632653, + "grad_norm": 0.18875224888324738, + "learning_rate": 0.00019590592479012023, + "loss": 1.5378, + "step": 27 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.18564417958259583, + "learning_rate": 0.00019541392564000488, + "loss": 1.5212, + "step": 28 + }, + { + "epoch": 0.5918367346938775, + "grad_norm": 0.16226942837238312, + "learning_rate": 0.00019489470729364692, + "loss": 1.5391, + "step": 29 + }, + { + "epoch": 0.6122448979591837, + "grad_norm": 0.15650039911270142, + "learning_rate": 0.00019434841787099803, + "loss": 1.511, + "step": 30 + }, + { + "epoch": 0.6326530612244898, + "grad_norm": 0.15976540744304657, + "learning_rate": 0.00019377521321470805, + "loss": 1.5119, + "step": 31 + }, + { + "epoch": 0.6530612244897959, + "grad_norm": 0.16409288346767426, + "learning_rate": 0.00019317525684566685, + "loss": 1.4909, + "step": 32 + }, + { + "epoch": 0.673469387755102, + "grad_norm": 0.15468019247055054, + "learning_rate": 0.00019254871991635598, + "loss": 1.4951, + "step": 33 + }, + { + "epoch": 0.6938775510204082, + "grad_norm": 0.1462036371231079, + "learning_rate": 0.00019189578116202307, + "loss": 1.4643, + "step": 34 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.1541963368654251, + "learning_rate": 0.00019121662684969335, + "loss": 1.5159, + "step": 35 + }, + { + "epoch": 0.7346938775510204, + "grad_norm": 0.14798064529895782, + "learning_rate": 0.00019051145072503215, + "loss": 1.4741, + "step": 36 + }, + { + "epoch": 0.7551020408163265, + "grad_norm": 0.13914817571640015, + "learning_rate": 0.00018978045395707418, + "loss": 1.4788, + "step": 37 + }, + { + "epoch": 0.7755102040816326, + "grad_norm": 0.15608824789524078, + "learning_rate": 0.00018902384508083517, + "loss": 1.4687, + "step": 38 + }, + { + "epoch": 0.7959183673469388, + "grad_norm": 0.14460116624832153, + "learning_rate": 0.00018824183993782192, + "loss": 1.482, + "step": 39 + }, + { + "epoch": 0.7959183673469388, + "eval_loss": 1.411073088645935, + "eval_runtime": 271.292, + "eval_samples_per_second": 6.248, + "eval_steps_per_second": 3.126, + "step": 39 + }, + { + "epoch": 0.8163265306122449, + "grad_norm": 0.15740551054477692, + "learning_rate": 0.00018743466161445823, + "loss": 1.4486, + "step": 40 + }, + { + "epoch": 0.8367346938775511, + "grad_norm": 0.14149661362171173, + "learning_rate": 0.00018660254037844388, + "loss": 1.4353, + "step": 41 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.14034292101860046, + "learning_rate": 0.0001857457136130651, + "loss": 1.4523, + "step": 42 + }, + { + "epoch": 0.8775510204081632, + "grad_norm": 0.1487722396850586, + "learning_rate": 0.00018486442574947511, + "loss": 1.4095, + "step": 43 + }, + { + "epoch": 0.8979591836734694, + "grad_norm": 0.17400234937667847, + "learning_rate": 0.00018395892819696389, + "loss": 1.4414, + "step": 44 + }, + { + "epoch": 0.9183673469387755, + "grad_norm": 0.1741325408220291, + "learning_rate": 0.00018302947927123766, + "loss": 1.4379, + "step": 45 + }, + { + "epoch": 0.9387755102040817, + "grad_norm": 0.15319454669952393, + "learning_rate": 0.00018207634412072764, + "loss": 1.405, + "step": 46 + }, + { + "epoch": 0.9591836734693877, + "grad_norm": 0.15876264870166779, + "learning_rate": 0.00018109979465095013, + "loss": 1.4122, + "step": 47 + }, + { + "epoch": 0.9795918367346939, + "grad_norm": 0.17120805382728577, + "learning_rate": 0.00018010010944693848, + "loss": 1.4132, + "step": 48 + }, + { + "epoch": 1.0, + "grad_norm": 0.1436116099357605, + "learning_rate": 0.00017907757369376985, + "loss": 1.416, + "step": 49 + }, + { + "epoch": 1.0204081632653061, + "grad_norm": 0.1707429438829422, + "learning_rate": 0.0001780324790952092, + "loss": 1.3913, + "step": 50 + }, + { + "epoch": 1.0204081632653061, + "grad_norm": 0.17117524147033691, + "learning_rate": 0.00017696512379049325, + "loss": 1.3963, + "step": 51 + }, + { + "epoch": 1.0408163265306123, + "grad_norm": 0.13410089910030365, + "learning_rate": 0.0001758758122692791, + "loss": 1.392, + "step": 52 + }, + { + "epoch": 1.0408163265306123, + "eval_loss": 1.3676769733428955, + "eval_runtime": 270.8566, + "eval_samples_per_second": 6.258, + "eval_steps_per_second": 3.131, + "step": 52 + }, + { + "epoch": 1.0612244897959184, + "grad_norm": 0.18877607583999634, + "learning_rate": 0.00017476485528478093, + "loss": 1.3854, + "step": 53 + }, + { + "epoch": 1.0816326530612246, + "grad_norm": 0.1752927452325821, + "learning_rate": 0.00017363256976511972, + "loss": 1.3759, + "step": 54 + }, + { + "epoch": 1.1020408163265305, + "grad_norm": 0.17180170118808746, + "learning_rate": 0.000172479278722912, + "loss": 1.3614, + "step": 55 + }, + { + "epoch": 1.1224489795918366, + "grad_norm": 0.1640290915966034, + "learning_rate": 0.00017130531116312203, + "loss": 1.3853, + "step": 56 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.2047068476676941, + "learning_rate": 0.0001701110019892053, + "loss": 1.3699, + "step": 57 + }, + { + "epoch": 1.163265306122449, + "grad_norm": 0.1835869997739792, + "learning_rate": 0.00016889669190756868, + "loss": 1.3403, + "step": 58 + }, + { + "epoch": 1.183673469387755, + "grad_norm": 0.16733241081237793, + "learning_rate": 0.00016766272733037576, + "loss": 1.3609, + "step": 59 + }, + { + "epoch": 1.2040816326530612, + "grad_norm": 0.178726926445961, + "learning_rate": 0.00016640946027672392, + "loss": 1.3651, + "step": 60 + }, + { + "epoch": 1.2244897959183674, + "grad_norm": 0.16719630360603333, + "learning_rate": 0.00016513724827222227, + "loss": 1.3676, + "step": 61 + }, + { + "epoch": 1.2448979591836735, + "grad_norm": 0.15999363362789154, + "learning_rate": 0.00016384645424699835, + "loss": 1.3651, + "step": 62 + }, + { + "epoch": 1.2653061224489797, + "grad_norm": 0.1705988198518753, + "learning_rate": 0.00016253744643216368, + "loss": 1.3757, + "step": 63 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 0.14996370673179626, + "learning_rate": 0.0001612105982547663, + "loss": 1.3474, + "step": 64 + }, + { + "epoch": 1.306122448979592, + "grad_norm": 0.19127260148525238, + "learning_rate": 0.0001598662882312615, + "loss": 1.3414, + "step": 65 + }, + { + "epoch": 1.306122448979592, + "eval_loss": 1.331880807876587, + "eval_runtime": 270.8424, + "eval_samples_per_second": 6.258, + "eval_steps_per_second": 3.131, + "step": 65 + }, + { + "epoch": 1.3265306122448979, + "grad_norm": 0.16125527024269104, + "learning_rate": 0.00015850489985953076, + "loss": 1.3509, + "step": 66 + }, + { + "epoch": 1.346938775510204, + "grad_norm": 0.1979473978281021, + "learning_rate": 0.00015712682150947923, + "loss": 1.3579, + "step": 67 + }, + { + "epoch": 1.3673469387755102, + "grad_norm": 0.18317992985248566, + "learning_rate": 0.00015573244631224365, + "loss": 1.3341, + "step": 68 + }, + { + "epoch": 1.3877551020408163, + "grad_norm": 0.1646898239850998, + "learning_rate": 0.0001543221720480419, + "loss": 1.3361, + "step": 69 + }, + { + "epoch": 1.4081632653061225, + "grad_norm": 0.1760271042585373, + "learning_rate": 0.00015289640103269625, + "loss": 1.358, + "step": 70 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.165283203125, + "learning_rate": 0.0001514555400028629, + "loss": 1.3072, + "step": 71 + }, + { + "epoch": 1.4489795918367347, + "grad_norm": 0.1507076472043991, + "learning_rate": 0.00015000000000000001, + "loss": 1.3133, + "step": 72 + }, + { + "epoch": 1.469387755102041, + "grad_norm": 0.16913647949695587, + "learning_rate": 0.00014853019625310813, + "loss": 1.3232, + "step": 73 + }, + { + "epoch": 1.489795918367347, + "grad_norm": 0.18266479671001434, + "learning_rate": 0.0001470465480602756, + "loss": 1.3512, + "step": 74 + }, + { + "epoch": 1.510204081632653, + "grad_norm": 0.19301828742027283, + "learning_rate": 0.0001455494786690634, + "loss": 1.3241, + "step": 75 + }, + { + "epoch": 1.5306122448979593, + "grad_norm": 0.16109652817249298, + "learning_rate": 0.00014403941515576344, + "loss": 1.3256, + "step": 76 + }, + { + "epoch": 1.5510204081632653, + "grad_norm": 0.17053867876529694, + "learning_rate": 0.00014251678830356408, + "loss": 1.3162, + "step": 77 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 0.17348544299602509, + "learning_rate": 0.00014098203247965875, + "loss": 1.3213, + "step": 78 + }, + { + "epoch": 1.5714285714285714, + "eval_loss": 1.3028697967529297, + "eval_runtime": 270.8095, + "eval_samples_per_second": 6.259, + "eval_steps_per_second": 3.131, + "step": 78 + }, + { + "epoch": 1.5918367346938775, + "grad_norm": 0.1703907549381256, + "learning_rate": 0.00013943558551133186, + "loss": 1.3073, + "step": 79 + }, + { + "epoch": 1.6122448979591837, + "grad_norm": 0.17313100397586823, + "learning_rate": 0.0001378778885610576, + "loss": 1.3232, + "step": 80 + }, + { + "epoch": 1.6326530612244898, + "grad_norm": 0.17237025499343872, + "learning_rate": 0.00013630938600064747, + "loss": 1.3406, + "step": 81 + }, + { + "epoch": 1.6530612244897958, + "grad_norm": 0.19658459722995758, + "learning_rate": 0.00013473052528448201, + "loss": 1.3114, + "step": 82 + }, + { + "epoch": 1.6734693877551021, + "grad_norm": 0.20599938929080963, + "learning_rate": 0.0001331417568218636, + "loss": 1.3288, + "step": 83 + }, + { + "epoch": 1.693877551020408, + "grad_norm": 0.17759399116039276, + "learning_rate": 0.00013154353384852558, + "loss": 1.2995, + "step": 84 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.18712250888347626, + "learning_rate": 0.00012993631229733582, + "loss": 1.2895, + "step": 85 + }, + { + "epoch": 1.7346938775510203, + "grad_norm": 0.1991330236196518, + "learning_rate": 0.00012832055066823038, + "loss": 1.2886, + "step": 86 + }, + { + "epoch": 1.7551020408163265, + "grad_norm": 0.22125203907489777, + "learning_rate": 0.00012669670989741517, + "loss": 1.3233, + "step": 87 + }, + { + "epoch": 1.7755102040816326, + "grad_norm": 0.2052813619375229, + "learning_rate": 0.00012506525322587207, + "loss": 1.3079, + "step": 88 + }, + { + "epoch": 1.7959183673469388, + "grad_norm": 0.19290736317634583, + "learning_rate": 0.00012342664606720822, + "loss": 1.3174, + "step": 89 + }, + { + "epoch": 1.816326530612245, + "grad_norm": 0.20912542939186096, + "learning_rate": 0.00012178135587488515, + "loss": 1.2915, + "step": 90 + }, + { + "epoch": 1.836734693877551, + "grad_norm": 0.20760588347911835, + "learning_rate": 0.00012012985200886602, + "loss": 1.3028, + "step": 91 + }, + { + "epoch": 1.836734693877551, + "eval_loss": 1.2795333862304688, + "eval_runtime": 270.6525, + "eval_samples_per_second": 6.263, + "eval_steps_per_second": 3.133, + "step": 91 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 0.1996900886297226, + "learning_rate": 0.00011847260560171896, + "loss": 1.3119, + "step": 92 + }, + { + "epoch": 1.8775510204081631, + "grad_norm": 0.23766876757144928, + "learning_rate": 0.00011681008942421483, + "loss": 1.2978, + "step": 93 + }, + { + "epoch": 1.8979591836734695, + "grad_norm": 0.19782397150993347, + "learning_rate": 0.00011514277775045768, + "loss": 1.2955, + "step": 94 + }, + { + "epoch": 1.9183673469387754, + "grad_norm": 0.22519494593143463, + "learning_rate": 0.00011347114622258612, + "loss": 1.2957, + "step": 95 + }, + { + "epoch": 1.9387755102040818, + "grad_norm": 0.2590245306491852, + "learning_rate": 0.00011179567171508463, + "loss": 1.2809, + "step": 96 + }, + { + "epoch": 1.9591836734693877, + "grad_norm": 0.2235420197248459, + "learning_rate": 0.00011011683219874323, + "loss": 1.2784, + "step": 97 + }, + { + "epoch": 1.9795918367346939, + "grad_norm": 0.285740464925766, + "learning_rate": 0.00010843510660430447, + "loss": 1.309, + "step": 98 + }, + { + "epoch": 2.0, + "grad_norm": 0.20554350316524506, + "learning_rate": 0.00010675097468583652, + "loss": 1.273, + "step": 99 + }, + { + "epoch": 2.020408163265306, + "grad_norm": 0.24468418955802917, + "learning_rate": 0.00010506491688387127, + "loss": 1.2833, + "step": 100 + }, + { + "epoch": 2.020408163265306, + "grad_norm": 0.21553528308868408, + "learning_rate": 0.00010337741418834684, + "loss": 1.2669, + "step": 101 + }, + { + "epoch": 2.0408163265306123, + "grad_norm": 0.22015659511089325, + "learning_rate": 0.0001016889480013931, + "loss": 1.2795, + "step": 102 + }, + { + "epoch": 2.061224489795918, + "grad_norm": 0.2028799206018448, + "learning_rate": 0.0001, + "loss": 1.2584, + "step": 103 + }, + { + "epoch": 2.0816326530612246, + "grad_norm": 0.23474323749542236, + "learning_rate": 9.83110519986069e-05, + "loss": 1.2761, + "step": 104 + }, + { + "epoch": 2.0816326530612246, + "eval_loss": 1.2696796655654907, + "eval_runtime": 270.6586, + "eval_samples_per_second": 6.263, + "eval_steps_per_second": 3.133, + "step": 104 + }, + { + "epoch": 2.1020408163265305, + "grad_norm": 0.21070216596126556, + "learning_rate": 9.662258581165319e-05, + "loss": 1.2808, + "step": 105 + }, + { + "epoch": 2.122448979591837, + "grad_norm": 0.21867221593856812, + "learning_rate": 9.493508311612874e-05, + "loss": 1.2873, + "step": 106 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.21630822122097015, + "learning_rate": 9.324902531416349e-05, + "loss": 1.2527, + "step": 107 + }, + { + "epoch": 2.163265306122449, + "grad_norm": 0.2134082019329071, + "learning_rate": 9.156489339569554e-05, + "loss": 1.2755, + "step": 108 + }, + { + "epoch": 2.183673469387755, + "grad_norm": 0.22310714423656464, + "learning_rate": 8.98831678012568e-05, + "loss": 1.2512, + "step": 109 + }, + { + "epoch": 2.204081632653061, + "grad_norm": 0.2365124374628067, + "learning_rate": 8.820432828491542e-05, + "loss": 1.2725, + "step": 110 + }, + { + "epoch": 2.2244897959183674, + "grad_norm": 0.2086496651172638, + "learning_rate": 8.652885377741393e-05, + "loss": 1.2488, + "step": 111 + }, + { + "epoch": 2.2448979591836733, + "grad_norm": 0.20848101377487183, + "learning_rate": 8.485722224954237e-05, + "loss": 1.2793, + "step": 112 + }, + { + "epoch": 2.2653061224489797, + "grad_norm": 0.20784686505794525, + "learning_rate": 8.31899105757852e-05, + "loss": 1.2564, + "step": 113 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.21896174550056458, + "learning_rate": 8.15273943982811e-05, + "loss": 1.2515, + "step": 114 + }, + { + "epoch": 2.306122448979592, + "grad_norm": 0.21367855370044708, + "learning_rate": 7.987014799113397e-05, + "loss": 1.248, + "step": 115 + }, + { + "epoch": 2.326530612244898, + "grad_norm": 0.20891636610031128, + "learning_rate": 7.821864412511485e-05, + "loss": 1.2753, + "step": 116 + }, + { + "epoch": 2.3469387755102042, + "grad_norm": 0.2092975378036499, + "learning_rate": 7.65733539327918e-05, + "loss": 1.2509, + "step": 117 + }, + { + "epoch": 2.3469387755102042, + "eval_loss": 1.258699655532837, + "eval_runtime": 270.5384, + "eval_samples_per_second": 6.265, + "eval_steps_per_second": 3.134, + "step": 117 + }, + { + "epoch": 2.36734693877551, + "grad_norm": 0.1905972808599472, + "learning_rate": 7.493474677412794e-05, + "loss": 1.2516, + "step": 118 + }, + { + "epoch": 2.387755102040816, + "grad_norm": 0.19716158509254456, + "learning_rate": 7.330329010258483e-05, + "loss": 1.2665, + "step": 119 + }, + { + "epoch": 2.4081632653061225, + "grad_norm": 0.1953389048576355, + "learning_rate": 7.16794493317696e-05, + "loss": 1.2661, + "step": 120 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 0.1990067958831787, + "learning_rate": 7.006368770266421e-05, + "loss": 1.2619, + "step": 121 + }, + { + "epoch": 2.4489795918367347, + "grad_norm": 0.1954919546842575, + "learning_rate": 6.845646615147445e-05, + "loss": 1.2736, + "step": 122 + }, + { + "epoch": 2.4693877551020407, + "grad_norm": 0.18382853269577026, + "learning_rate": 6.685824317813643e-05, + "loss": 1.2732, + "step": 123 + }, + { + "epoch": 2.489795918367347, + "grad_norm": 0.18729491531848907, + "learning_rate": 6.526947471551798e-05, + "loss": 1.2509, + "step": 124 + }, + { + "epoch": 2.510204081632653, + "grad_norm": 0.2034740000963211, + "learning_rate": 6.369061399935255e-05, + "loss": 1.2829, + "step": 125 + }, + { + "epoch": 2.5306122448979593, + "grad_norm": 0.1952620893716812, + "learning_rate": 6.21221114389424e-05, + "loss": 1.2689, + "step": 126 + }, + { + "epoch": 2.5510204081632653, + "grad_norm": 0.1986168622970581, + "learning_rate": 6.0564414488668165e-05, + "loss": 1.2644, + "step": 127 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.19526751339435577, + "learning_rate": 5.901796752034128e-05, + "loss": 1.265, + "step": 128 + }, + { + "epoch": 2.5918367346938775, + "grad_norm": 0.195367693901062, + "learning_rate": 5.748321169643596e-05, + "loss": 1.2782, + "step": 129 + }, + { + "epoch": 2.612244897959184, + "grad_norm": 0.18351928889751434, + "learning_rate": 5.596058484423656e-05, + "loss": 1.2884, + "step": 130 + }, + { + "epoch": 2.612244897959184, + "eval_loss": 1.2471545934677124, + "eval_runtime": 270.4953, + "eval_samples_per_second": 6.266, + "eval_steps_per_second": 3.135, + "step": 130 + }, + { + "epoch": 2.63265306122449, + "grad_norm": 0.2015760987997055, + "learning_rate": 5.44505213309366e-05, + "loss": 1.2536, + "step": 131 + }, + { + "epoch": 2.6530612244897958, + "grad_norm": 0.1734190732240677, + "learning_rate": 5.2953451939724454e-05, + "loss": 1.2628, + "step": 132 + }, + { + "epoch": 2.673469387755102, + "grad_norm": 0.214066281914711, + "learning_rate": 5.146980374689192e-05, + "loss": 1.2543, + "step": 133 + }, + { + "epoch": 2.693877551020408, + "grad_norm": 0.17507924139499664, + "learning_rate": 5.000000000000002e-05, + "loss": 1.2665, + "step": 134 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 0.1778109222650528, + "learning_rate": 4.854445999713715e-05, + "loss": 1.2789, + "step": 135 + }, + { + "epoch": 2.7346938775510203, + "grad_norm": 0.1856827288866043, + "learning_rate": 4.710359896730379e-05, + "loss": 1.2481, + "step": 136 + }, + { + "epoch": 2.7551020408163263, + "grad_norm": 0.17856694757938385, + "learning_rate": 4.567782795195816e-05, + "loss": 1.2732, + "step": 137 + }, + { + "epoch": 2.7755102040816326, + "grad_norm": 0.21598489582538605, + "learning_rate": 4.426755368775637e-05, + "loss": 1.2525, + "step": 138 + }, + { + "epoch": 2.795918367346939, + "grad_norm": 0.17308436334133148, + "learning_rate": 4.287317849052075e-05, + "loss": 1.2665, + "step": 139 + }, + { + "epoch": 2.816326530612245, + "grad_norm": 0.19207212328910828, + "learning_rate": 4.149510014046922e-05, + "loss": 1.2681, + "step": 140 + }, + { + "epoch": 2.836734693877551, + "grad_norm": 0.19626958668231964, + "learning_rate": 4.013371176873849e-05, + "loss": 1.2727, + "step": 141 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.1986483484506607, + "learning_rate": 3.878940174523371e-05, + "loss": 1.2414, + "step": 142 + }, + { + "epoch": 2.877551020408163, + "grad_norm": 0.19369089603424072, + "learning_rate": 3.746255356783632e-05, + "loss": 1.254, + "step": 143 + }, + { + "epoch": 2.877551020408163, + "eval_loss": 1.2410293817520142, + "eval_runtime": 270.6762, + "eval_samples_per_second": 6.262, + "eval_steps_per_second": 3.133, + "step": 143 + }, + { + "epoch": 2.8979591836734695, + "grad_norm": 0.20910531282424927, + "learning_rate": 3.615354575300166e-05, + "loss": 1.2541, + "step": 144 + }, + { + "epoch": 2.9183673469387754, + "grad_norm": 0.19536806643009186, + "learning_rate": 3.4862751727777797e-05, + "loss": 1.2517, + "step": 145 + }, + { + "epoch": 2.938775510204082, + "grad_norm": 0.18630966544151306, + "learning_rate": 3.3590539723276083e-05, + "loss": 1.2473, + "step": 146 + }, + { + "epoch": 2.9591836734693877, + "grad_norm": 0.1874723732471466, + "learning_rate": 3.233727266962425e-05, + "loss": 1.244, + "step": 147 + } + ], + "logging_steps": 1, + "max_steps": 196, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 49, + "total_flos": 3.0628052408991744e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-147/training_args.bin b/checkpoint-147/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..be517173d48b34e38c41c52f74f7e02495562a61 --- /dev/null +++ b/checkpoint-147/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9cc4ff61a4799ac22d5c627a0169a20cc75619ae1b5871f2d114f95284d87a6 +size 5816 diff --git a/checkpoint-196/README.md b/checkpoint-196/README.md new file mode 100644 index 0000000000000000000000000000000000000000..78aadd3258d05c05cabc678473b6f0942cd441c7 --- /dev/null +++ b/checkpoint-196/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.10.0 \ No newline at end of file diff --git a/checkpoint-196/adapter_config.json b/checkpoint-196/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3a88eecac3a652d9c19fe6d8e555dcac4859ccd2 --- /dev/null +++ b/checkpoint-196/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "down_proj", + "k_proj", + "q_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-196/adapter_model.safetensors b/checkpoint-196/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d41bc9e3dcdf277e604cca1cf5b2badf5137c297 --- /dev/null +++ b/checkpoint-196/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7149dfd1479c35b75fc75c4e9be3785070da91bd7c29d040e9a259ea5111014 +size 100966336 diff --git a/checkpoint-196/optimizer.pt b/checkpoint-196/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b4f86376842744d04c564484c1f9305e286115e --- /dev/null +++ b/checkpoint-196/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96f2429392a17aa7909b16091d5a0b62592f80090a1a9943b203b1e1c29e66f8 +size 50916644 diff --git a/checkpoint-196/rng_state.pth b/checkpoint-196/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..9366e994ff6abae6251829869eedce66f55dc840 --- /dev/null +++ b/checkpoint-196/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a160c2864b63ef158843056f3ba263b2da60c6bef707459f056731cde2e27043 +size 14244 diff --git a/checkpoint-196/scheduler.pt b/checkpoint-196/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..74c904cc3f97ac3da7ec72be923894928f8e70c1 --- /dev/null +++ b/checkpoint-196/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e22ca0a50bab80d00c8b8910bffb983a348f8762b7cf025e6f8e64a05a938289 +size 1064 diff --git a/checkpoint-196/special_tokens_map.json b/checkpoint-196/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1 --- /dev/null +++ b/checkpoint-196/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-196/tokenizer.model b/checkpoint-196/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-196/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-196/tokenizer_config.json b/checkpoint-196/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0da2d3613b2ca0d42b48dea2162c06006f52b837 --- /dev/null +++ b/checkpoint-196/tokenizer_config.json @@ -0,0 +1,45 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": true +} diff --git a/checkpoint-196/trainer_state.json b/checkpoint-196/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f2a21907b42052c3b124b28477dd1724df48ffb8 --- /dev/null +++ b/checkpoint-196/trainer_state.json @@ -0,0 +1,1521 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.938775510204082, + "eval_steps": 13, + "global_step": 196, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02040816326530612, + "grad_norm": 0.7881951332092285, + "learning_rate": 2e-05, + "loss": 2.7509, + "step": 1 + }, + { + "epoch": 0.02040816326530612, + "eval_loss": 2.6902382373809814, + "eval_runtime": 269.5606, + "eval_samples_per_second": 6.288, + "eval_steps_per_second": 3.146, + "step": 1 + }, + { + "epoch": 0.04081632653061224, + "grad_norm": 0.789082407951355, + "learning_rate": 4e-05, + "loss": 2.7449, + "step": 2 + }, + { + "epoch": 0.061224489795918366, + "grad_norm": 0.7354114055633545, + "learning_rate": 6e-05, + "loss": 2.7164, + "step": 3 + }, + { + "epoch": 0.08163265306122448, + "grad_norm": 0.7292255759239197, + "learning_rate": 8e-05, + "loss": 2.7174, + "step": 4 + }, + { + "epoch": 0.10204081632653061, + "grad_norm": 0.6898028254508972, + "learning_rate": 0.0001, + "loss": 2.6891, + "step": 5 + }, + { + "epoch": 0.12244897959183673, + "grad_norm": 0.6861400604248047, + "learning_rate": 0.00012, + "loss": 2.6545, + "step": 6 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 0.7510350346565247, + "learning_rate": 0.00014, + "loss": 2.5656, + "step": 7 + }, + { + "epoch": 0.16326530612244897, + "grad_norm": 0.8011165261268616, + "learning_rate": 0.00016, + "loss": 2.4519, + "step": 8 + }, + { + "epoch": 0.1836734693877551, + "grad_norm": 0.8624005317687988, + "learning_rate": 0.00018, + "loss": 2.3178, + "step": 9 + }, + { + "epoch": 0.20408163265306123, + "grad_norm": 0.8004987835884094, + "learning_rate": 0.0002, + "loss": 2.1783, + "step": 10 + }, + { + "epoch": 0.22448979591836735, + "grad_norm": 0.6362400054931641, + "learning_rate": 0.000199985736255971, + "loss": 2.0252, + "step": 11 + }, + { + "epoch": 0.24489795918367346, + "grad_norm": 0.7930936217308044, + "learning_rate": 0.0001999429490929718, + "loss": 1.8839, + "step": 12 + }, + { + "epoch": 0.2653061224489796, + "grad_norm": 0.5149843096733093, + "learning_rate": 0.00019987165071710527, + "loss": 1.8064, + "step": 13 + }, + { + "epoch": 0.2653061224489796, + "eval_loss": 1.6734941005706787, + "eval_runtime": 271.2615, + "eval_samples_per_second": 6.249, + "eval_steps_per_second": 3.126, + "step": 13 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.42121434211730957, + "learning_rate": 0.00019977186146800707, + "loss": 1.7922, + "step": 14 + }, + { + "epoch": 0.30612244897959184, + "grad_norm": 0.3523242771625519, + "learning_rate": 0.0001996436098130433, + "loss": 1.7711, + "step": 15 + }, + { + "epoch": 0.32653061224489793, + "grad_norm": 0.3384595215320587, + "learning_rate": 0.00019948693233918952, + "loss": 1.7152, + "step": 16 + }, + { + "epoch": 0.3469387755102041, + "grad_norm": 0.34942421317100525, + "learning_rate": 0.00019930187374259337, + "loss": 1.7112, + "step": 17 + }, + { + "epoch": 0.3673469387755102, + "grad_norm": 0.31712639331817627, + "learning_rate": 0.00019908848681582391, + "loss": 1.7059, + "step": 18 + }, + { + "epoch": 0.3877551020408163, + "grad_norm": 0.2875436842441559, + "learning_rate": 0.00019884683243281116, + "loss": 1.6468, + "step": 19 + }, + { + "epoch": 0.40816326530612246, + "grad_norm": 0.24433130025863647, + "learning_rate": 0.00019857697953148037, + "loss": 1.6408, + "step": 20 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 0.21414674818515778, + "learning_rate": 0.00019827900509408581, + "loss": 1.616, + "step": 21 + }, + { + "epoch": 0.4489795918367347, + "grad_norm": 0.21537622809410095, + "learning_rate": 0.00019795299412524945, + "loss": 1.609, + "step": 22 + }, + { + "epoch": 0.46938775510204084, + "grad_norm": 0.2432074397802353, + "learning_rate": 0.00019759903962771156, + "loss": 1.6066, + "step": 23 + }, + { + "epoch": 0.4897959183673469, + "grad_norm": 0.2359839379787445, + "learning_rate": 0.00019721724257579907, + "loss": 1.5851, + "step": 24 + }, + { + "epoch": 0.5102040816326531, + "grad_norm": 0.22065888345241547, + "learning_rate": 0.00019680771188662044, + "loss": 1.5739, + "step": 25 + }, + { + "epoch": 0.5306122448979592, + "grad_norm": 0.20339132845401764, + "learning_rate": 0.0001963705643889941, + "loss": 1.5513, + "step": 26 + }, + { + "epoch": 0.5306122448979592, + "eval_loss": 1.4832030534744263, + "eval_runtime": 271.2449, + "eval_samples_per_second": 6.249, + "eval_steps_per_second": 3.126, + "step": 26 + }, + { + "epoch": 0.5510204081632653, + "grad_norm": 0.18875224888324738, + "learning_rate": 0.00019590592479012023, + "loss": 1.5378, + "step": 27 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.18564417958259583, + "learning_rate": 0.00019541392564000488, + "loss": 1.5212, + "step": 28 + }, + { + "epoch": 0.5918367346938775, + "grad_norm": 0.16226942837238312, + "learning_rate": 0.00019489470729364692, + "loss": 1.5391, + "step": 29 + }, + { + "epoch": 0.6122448979591837, + "grad_norm": 0.15650039911270142, + "learning_rate": 0.00019434841787099803, + "loss": 1.511, + "step": 30 + }, + { + "epoch": 0.6326530612244898, + "grad_norm": 0.15976540744304657, + "learning_rate": 0.00019377521321470805, + "loss": 1.5119, + "step": 31 + }, + { + "epoch": 0.6530612244897959, + "grad_norm": 0.16409288346767426, + "learning_rate": 0.00019317525684566685, + "loss": 1.4909, + "step": 32 + }, + { + "epoch": 0.673469387755102, + "grad_norm": 0.15468019247055054, + "learning_rate": 0.00019254871991635598, + "loss": 1.4951, + "step": 33 + }, + { + "epoch": 0.6938775510204082, + "grad_norm": 0.1462036371231079, + "learning_rate": 0.00019189578116202307, + "loss": 1.4643, + "step": 34 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.1541963368654251, + "learning_rate": 0.00019121662684969335, + "loss": 1.5159, + "step": 35 + }, + { + "epoch": 0.7346938775510204, + "grad_norm": 0.14798064529895782, + "learning_rate": 0.00019051145072503215, + "loss": 1.4741, + "step": 36 + }, + { + "epoch": 0.7551020408163265, + "grad_norm": 0.13914817571640015, + "learning_rate": 0.00018978045395707418, + "loss": 1.4788, + "step": 37 + }, + { + "epoch": 0.7755102040816326, + "grad_norm": 0.15608824789524078, + "learning_rate": 0.00018902384508083517, + "loss": 1.4687, + "step": 38 + }, + { + "epoch": 0.7959183673469388, + "grad_norm": 0.14460116624832153, + "learning_rate": 0.00018824183993782192, + "loss": 1.482, + "step": 39 + }, + { + "epoch": 0.7959183673469388, + "eval_loss": 1.411073088645935, + "eval_runtime": 271.292, + "eval_samples_per_second": 6.248, + "eval_steps_per_second": 3.126, + "step": 39 + }, + { + "epoch": 0.8163265306122449, + "grad_norm": 0.15740551054477692, + "learning_rate": 0.00018743466161445823, + "loss": 1.4486, + "step": 40 + }, + { + "epoch": 0.8367346938775511, + "grad_norm": 0.14149661362171173, + "learning_rate": 0.00018660254037844388, + "loss": 1.4353, + "step": 41 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.14034292101860046, + "learning_rate": 0.0001857457136130651, + "loss": 1.4523, + "step": 42 + }, + { + "epoch": 0.8775510204081632, + "grad_norm": 0.1487722396850586, + "learning_rate": 0.00018486442574947511, + "loss": 1.4095, + "step": 43 + }, + { + "epoch": 0.8979591836734694, + "grad_norm": 0.17400234937667847, + "learning_rate": 0.00018395892819696389, + "loss": 1.4414, + "step": 44 + }, + { + "epoch": 0.9183673469387755, + "grad_norm": 0.1741325408220291, + "learning_rate": 0.00018302947927123766, + "loss": 1.4379, + "step": 45 + }, + { + "epoch": 0.9387755102040817, + "grad_norm": 0.15319454669952393, + "learning_rate": 0.00018207634412072764, + "loss": 1.405, + "step": 46 + }, + { + "epoch": 0.9591836734693877, + "grad_norm": 0.15876264870166779, + "learning_rate": 0.00018109979465095013, + "loss": 1.4122, + "step": 47 + }, + { + "epoch": 0.9795918367346939, + "grad_norm": 0.17120805382728577, + "learning_rate": 0.00018010010944693848, + "loss": 1.4132, + "step": 48 + }, + { + "epoch": 1.0, + "grad_norm": 0.1436116099357605, + "learning_rate": 0.00017907757369376985, + "loss": 1.416, + "step": 49 + }, + { + "epoch": 1.0204081632653061, + "grad_norm": 0.1707429438829422, + "learning_rate": 0.0001780324790952092, + "loss": 1.3913, + "step": 50 + }, + { + "epoch": 1.0204081632653061, + "grad_norm": 0.17117524147033691, + "learning_rate": 0.00017696512379049325, + "loss": 1.3963, + "step": 51 + }, + { + "epoch": 1.0408163265306123, + "grad_norm": 0.13410089910030365, + "learning_rate": 0.0001758758122692791, + "loss": 1.392, + "step": 52 + }, + { + "epoch": 1.0408163265306123, + "eval_loss": 1.3676769733428955, + "eval_runtime": 270.8566, + "eval_samples_per_second": 6.258, + "eval_steps_per_second": 3.131, + "step": 52 + }, + { + "epoch": 1.0612244897959184, + "grad_norm": 0.18877607583999634, + "learning_rate": 0.00017476485528478093, + "loss": 1.3854, + "step": 53 + }, + { + "epoch": 1.0816326530612246, + "grad_norm": 0.1752927452325821, + "learning_rate": 0.00017363256976511972, + "loss": 1.3759, + "step": 54 + }, + { + "epoch": 1.1020408163265305, + "grad_norm": 0.17180170118808746, + "learning_rate": 0.000172479278722912, + "loss": 1.3614, + "step": 55 + }, + { + "epoch": 1.1224489795918366, + "grad_norm": 0.1640290915966034, + "learning_rate": 0.00017130531116312203, + "loss": 1.3853, + "step": 56 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.2047068476676941, + "learning_rate": 0.0001701110019892053, + "loss": 1.3699, + "step": 57 + }, + { + "epoch": 1.163265306122449, + "grad_norm": 0.1835869997739792, + "learning_rate": 0.00016889669190756868, + "loss": 1.3403, + "step": 58 + }, + { + "epoch": 1.183673469387755, + "grad_norm": 0.16733241081237793, + "learning_rate": 0.00016766272733037576, + "loss": 1.3609, + "step": 59 + }, + { + "epoch": 1.2040816326530612, + "grad_norm": 0.178726926445961, + "learning_rate": 0.00016640946027672392, + "loss": 1.3651, + "step": 60 + }, + { + "epoch": 1.2244897959183674, + "grad_norm": 0.16719630360603333, + "learning_rate": 0.00016513724827222227, + "loss": 1.3676, + "step": 61 + }, + { + "epoch": 1.2448979591836735, + "grad_norm": 0.15999363362789154, + "learning_rate": 0.00016384645424699835, + "loss": 1.3651, + "step": 62 + }, + { + "epoch": 1.2653061224489797, + "grad_norm": 0.1705988198518753, + "learning_rate": 0.00016253744643216368, + "loss": 1.3757, + "step": 63 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 0.14996370673179626, + "learning_rate": 0.0001612105982547663, + "loss": 1.3474, + "step": 64 + }, + { + "epoch": 1.306122448979592, + "grad_norm": 0.19127260148525238, + "learning_rate": 0.0001598662882312615, + "loss": 1.3414, + "step": 65 + }, + { + "epoch": 1.306122448979592, + "eval_loss": 1.331880807876587, + "eval_runtime": 270.8424, + "eval_samples_per_second": 6.258, + "eval_steps_per_second": 3.131, + "step": 65 + }, + { + "epoch": 1.3265306122448979, + "grad_norm": 0.16125527024269104, + "learning_rate": 0.00015850489985953076, + "loss": 1.3509, + "step": 66 + }, + { + "epoch": 1.346938775510204, + "grad_norm": 0.1979473978281021, + "learning_rate": 0.00015712682150947923, + "loss": 1.3579, + "step": 67 + }, + { + "epoch": 1.3673469387755102, + "grad_norm": 0.18317992985248566, + "learning_rate": 0.00015573244631224365, + "loss": 1.3341, + "step": 68 + }, + { + "epoch": 1.3877551020408163, + "grad_norm": 0.1646898239850998, + "learning_rate": 0.0001543221720480419, + "loss": 1.3361, + "step": 69 + }, + { + "epoch": 1.4081632653061225, + "grad_norm": 0.1760271042585373, + "learning_rate": 0.00015289640103269625, + "loss": 1.358, + "step": 70 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.165283203125, + "learning_rate": 0.0001514555400028629, + "loss": 1.3072, + "step": 71 + }, + { + "epoch": 1.4489795918367347, + "grad_norm": 0.1507076472043991, + "learning_rate": 0.00015000000000000001, + "loss": 1.3133, + "step": 72 + }, + { + "epoch": 1.469387755102041, + "grad_norm": 0.16913647949695587, + "learning_rate": 0.00014853019625310813, + "loss": 1.3232, + "step": 73 + }, + { + "epoch": 1.489795918367347, + "grad_norm": 0.18266479671001434, + "learning_rate": 0.0001470465480602756, + "loss": 1.3512, + "step": 74 + }, + { + "epoch": 1.510204081632653, + "grad_norm": 0.19301828742027283, + "learning_rate": 0.0001455494786690634, + "loss": 1.3241, + "step": 75 + }, + { + "epoch": 1.5306122448979593, + "grad_norm": 0.16109652817249298, + "learning_rate": 0.00014403941515576344, + "loss": 1.3256, + "step": 76 + }, + { + "epoch": 1.5510204081632653, + "grad_norm": 0.17053867876529694, + "learning_rate": 0.00014251678830356408, + "loss": 1.3162, + "step": 77 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 0.17348544299602509, + "learning_rate": 0.00014098203247965875, + "loss": 1.3213, + "step": 78 + }, + { + "epoch": 1.5714285714285714, + "eval_loss": 1.3028697967529297, + "eval_runtime": 270.8095, + "eval_samples_per_second": 6.259, + "eval_steps_per_second": 3.131, + "step": 78 + }, + { + "epoch": 1.5918367346938775, + "grad_norm": 0.1703907549381256, + "learning_rate": 0.00013943558551133186, + "loss": 1.3073, + "step": 79 + }, + { + "epoch": 1.6122448979591837, + "grad_norm": 0.17313100397586823, + "learning_rate": 0.0001378778885610576, + "loss": 1.3232, + "step": 80 + }, + { + "epoch": 1.6326530612244898, + "grad_norm": 0.17237025499343872, + "learning_rate": 0.00013630938600064747, + "loss": 1.3406, + "step": 81 + }, + { + "epoch": 1.6530612244897958, + "grad_norm": 0.19658459722995758, + "learning_rate": 0.00013473052528448201, + "loss": 1.3114, + "step": 82 + }, + { + "epoch": 1.6734693877551021, + "grad_norm": 0.20599938929080963, + "learning_rate": 0.0001331417568218636, + "loss": 1.3288, + "step": 83 + }, + { + "epoch": 1.693877551020408, + "grad_norm": 0.17759399116039276, + "learning_rate": 0.00013154353384852558, + "loss": 1.2995, + "step": 84 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.18712250888347626, + "learning_rate": 0.00012993631229733582, + "loss": 1.2895, + "step": 85 + }, + { + "epoch": 1.7346938775510203, + "grad_norm": 0.1991330236196518, + "learning_rate": 0.00012832055066823038, + "loss": 1.2886, + "step": 86 + }, + { + "epoch": 1.7551020408163265, + "grad_norm": 0.22125203907489777, + "learning_rate": 0.00012669670989741517, + "loss": 1.3233, + "step": 87 + }, + { + "epoch": 1.7755102040816326, + "grad_norm": 0.2052813619375229, + "learning_rate": 0.00012506525322587207, + "loss": 1.3079, + "step": 88 + }, + { + "epoch": 1.7959183673469388, + "grad_norm": 0.19290736317634583, + "learning_rate": 0.00012342664606720822, + "loss": 1.3174, + "step": 89 + }, + { + "epoch": 1.816326530612245, + "grad_norm": 0.20912542939186096, + "learning_rate": 0.00012178135587488515, + "loss": 1.2915, + "step": 90 + }, + { + "epoch": 1.836734693877551, + "grad_norm": 0.20760588347911835, + "learning_rate": 0.00012012985200886602, + "loss": 1.3028, + "step": 91 + }, + { + "epoch": 1.836734693877551, + "eval_loss": 1.2795333862304688, + "eval_runtime": 270.6525, + "eval_samples_per_second": 6.263, + "eval_steps_per_second": 3.133, + "step": 91 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 0.1996900886297226, + "learning_rate": 0.00011847260560171896, + "loss": 1.3119, + "step": 92 + }, + { + "epoch": 1.8775510204081631, + "grad_norm": 0.23766876757144928, + "learning_rate": 0.00011681008942421483, + "loss": 1.2978, + "step": 93 + }, + { + "epoch": 1.8979591836734695, + "grad_norm": 0.19782397150993347, + "learning_rate": 0.00011514277775045768, + "loss": 1.2955, + "step": 94 + }, + { + "epoch": 1.9183673469387754, + "grad_norm": 0.22519494593143463, + "learning_rate": 0.00011347114622258612, + "loss": 1.2957, + "step": 95 + }, + { + "epoch": 1.9387755102040818, + "grad_norm": 0.2590245306491852, + "learning_rate": 0.00011179567171508463, + "loss": 1.2809, + "step": 96 + }, + { + "epoch": 1.9591836734693877, + "grad_norm": 0.2235420197248459, + "learning_rate": 0.00011011683219874323, + "loss": 1.2784, + "step": 97 + }, + { + "epoch": 1.9795918367346939, + "grad_norm": 0.285740464925766, + "learning_rate": 0.00010843510660430447, + "loss": 1.309, + "step": 98 + }, + { + "epoch": 2.0, + "grad_norm": 0.20554350316524506, + "learning_rate": 0.00010675097468583652, + "loss": 1.273, + "step": 99 + }, + { + "epoch": 2.020408163265306, + "grad_norm": 0.24468418955802917, + "learning_rate": 0.00010506491688387127, + "loss": 1.2833, + "step": 100 + }, + { + "epoch": 2.020408163265306, + "grad_norm": 0.21553528308868408, + "learning_rate": 0.00010337741418834684, + "loss": 1.2669, + "step": 101 + }, + { + "epoch": 2.0408163265306123, + "grad_norm": 0.22015659511089325, + "learning_rate": 0.0001016889480013931, + "loss": 1.2795, + "step": 102 + }, + { + "epoch": 2.061224489795918, + "grad_norm": 0.2028799206018448, + "learning_rate": 0.0001, + "loss": 1.2584, + "step": 103 + }, + { + "epoch": 2.0816326530612246, + "grad_norm": 0.23474323749542236, + "learning_rate": 9.83110519986069e-05, + "loss": 1.2761, + "step": 104 + }, + { + "epoch": 2.0816326530612246, + "eval_loss": 1.2696796655654907, + "eval_runtime": 270.6586, + "eval_samples_per_second": 6.263, + "eval_steps_per_second": 3.133, + "step": 104 + }, + { + "epoch": 2.1020408163265305, + "grad_norm": 0.21070216596126556, + "learning_rate": 9.662258581165319e-05, + "loss": 1.2808, + "step": 105 + }, + { + "epoch": 2.122448979591837, + "grad_norm": 0.21867221593856812, + "learning_rate": 9.493508311612874e-05, + "loss": 1.2873, + "step": 106 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.21630822122097015, + "learning_rate": 9.324902531416349e-05, + "loss": 1.2527, + "step": 107 + }, + { + "epoch": 2.163265306122449, + "grad_norm": 0.2134082019329071, + "learning_rate": 9.156489339569554e-05, + "loss": 1.2755, + "step": 108 + }, + { + "epoch": 2.183673469387755, + "grad_norm": 0.22310714423656464, + "learning_rate": 8.98831678012568e-05, + "loss": 1.2512, + "step": 109 + }, + { + "epoch": 2.204081632653061, + "grad_norm": 0.2365124374628067, + "learning_rate": 8.820432828491542e-05, + "loss": 1.2725, + "step": 110 + }, + { + "epoch": 2.2244897959183674, + "grad_norm": 0.2086496651172638, + "learning_rate": 8.652885377741393e-05, + "loss": 1.2488, + "step": 111 + }, + { + "epoch": 2.2448979591836733, + "grad_norm": 0.20848101377487183, + "learning_rate": 8.485722224954237e-05, + "loss": 1.2793, + "step": 112 + }, + { + "epoch": 2.2653061224489797, + "grad_norm": 0.20784686505794525, + "learning_rate": 8.31899105757852e-05, + "loss": 1.2564, + "step": 113 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.21896174550056458, + "learning_rate": 8.15273943982811e-05, + "loss": 1.2515, + "step": 114 + }, + { + "epoch": 2.306122448979592, + "grad_norm": 0.21367855370044708, + "learning_rate": 7.987014799113397e-05, + "loss": 1.248, + "step": 115 + }, + { + "epoch": 2.326530612244898, + "grad_norm": 0.20891636610031128, + "learning_rate": 7.821864412511485e-05, + "loss": 1.2753, + "step": 116 + }, + { + "epoch": 2.3469387755102042, + "grad_norm": 0.2092975378036499, + "learning_rate": 7.65733539327918e-05, + "loss": 1.2509, + "step": 117 + }, + { + "epoch": 2.3469387755102042, + "eval_loss": 1.258699655532837, + "eval_runtime": 270.5384, + "eval_samples_per_second": 6.265, + "eval_steps_per_second": 3.134, + "step": 117 + }, + { + "epoch": 2.36734693877551, + "grad_norm": 0.1905972808599472, + "learning_rate": 7.493474677412794e-05, + "loss": 1.2516, + "step": 118 + }, + { + "epoch": 2.387755102040816, + "grad_norm": 0.19716158509254456, + "learning_rate": 7.330329010258483e-05, + "loss": 1.2665, + "step": 119 + }, + { + "epoch": 2.4081632653061225, + "grad_norm": 0.1953389048576355, + "learning_rate": 7.16794493317696e-05, + "loss": 1.2661, + "step": 120 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 0.1990067958831787, + "learning_rate": 7.006368770266421e-05, + "loss": 1.2619, + "step": 121 + }, + { + "epoch": 2.4489795918367347, + "grad_norm": 0.1954919546842575, + "learning_rate": 6.845646615147445e-05, + "loss": 1.2736, + "step": 122 + }, + { + "epoch": 2.4693877551020407, + "grad_norm": 0.18382853269577026, + "learning_rate": 6.685824317813643e-05, + "loss": 1.2732, + "step": 123 + }, + { + "epoch": 2.489795918367347, + "grad_norm": 0.18729491531848907, + "learning_rate": 6.526947471551798e-05, + "loss": 1.2509, + "step": 124 + }, + { + "epoch": 2.510204081632653, + "grad_norm": 0.2034740000963211, + "learning_rate": 6.369061399935255e-05, + "loss": 1.2829, + "step": 125 + }, + { + "epoch": 2.5306122448979593, + "grad_norm": 0.1952620893716812, + "learning_rate": 6.21221114389424e-05, + "loss": 1.2689, + "step": 126 + }, + { + "epoch": 2.5510204081632653, + "grad_norm": 0.1986168622970581, + "learning_rate": 6.0564414488668165e-05, + "loss": 1.2644, + "step": 127 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.19526751339435577, + "learning_rate": 5.901796752034128e-05, + "loss": 1.265, + "step": 128 + }, + { + "epoch": 2.5918367346938775, + "grad_norm": 0.195367693901062, + "learning_rate": 5.748321169643596e-05, + "loss": 1.2782, + "step": 129 + }, + { + "epoch": 2.612244897959184, + "grad_norm": 0.18351928889751434, + "learning_rate": 5.596058484423656e-05, + "loss": 1.2884, + "step": 130 + }, + { + "epoch": 2.612244897959184, + "eval_loss": 1.2471545934677124, + "eval_runtime": 270.4953, + "eval_samples_per_second": 6.266, + "eval_steps_per_second": 3.135, + "step": 130 + }, + { + "epoch": 2.63265306122449, + "grad_norm": 0.2015760987997055, + "learning_rate": 5.44505213309366e-05, + "loss": 1.2536, + "step": 131 + }, + { + "epoch": 2.6530612244897958, + "grad_norm": 0.1734190732240677, + "learning_rate": 5.2953451939724454e-05, + "loss": 1.2628, + "step": 132 + }, + { + "epoch": 2.673469387755102, + "grad_norm": 0.214066281914711, + "learning_rate": 5.146980374689192e-05, + "loss": 1.2543, + "step": 133 + }, + { + "epoch": 2.693877551020408, + "grad_norm": 0.17507924139499664, + "learning_rate": 5.000000000000002e-05, + "loss": 1.2665, + "step": 134 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 0.1778109222650528, + "learning_rate": 4.854445999713715e-05, + "loss": 1.2789, + "step": 135 + }, + { + "epoch": 2.7346938775510203, + "grad_norm": 0.1856827288866043, + "learning_rate": 4.710359896730379e-05, + "loss": 1.2481, + "step": 136 + }, + { + "epoch": 2.7551020408163263, + "grad_norm": 0.17856694757938385, + "learning_rate": 4.567782795195816e-05, + "loss": 1.2732, + "step": 137 + }, + { + "epoch": 2.7755102040816326, + "grad_norm": 0.21598489582538605, + "learning_rate": 4.426755368775637e-05, + "loss": 1.2525, + "step": 138 + }, + { + "epoch": 2.795918367346939, + "grad_norm": 0.17308436334133148, + "learning_rate": 4.287317849052075e-05, + "loss": 1.2665, + "step": 139 + }, + { + "epoch": 2.816326530612245, + "grad_norm": 0.19207212328910828, + "learning_rate": 4.149510014046922e-05, + "loss": 1.2681, + "step": 140 + }, + { + "epoch": 2.836734693877551, + "grad_norm": 0.19626958668231964, + "learning_rate": 4.013371176873849e-05, + "loss": 1.2727, + "step": 141 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.1986483484506607, + "learning_rate": 3.878940174523371e-05, + "loss": 1.2414, + "step": 142 + }, + { + "epoch": 2.877551020408163, + "grad_norm": 0.19369089603424072, + "learning_rate": 3.746255356783632e-05, + "loss": 1.254, + "step": 143 + }, + { + "epoch": 2.877551020408163, + "eval_loss": 1.2410293817520142, + "eval_runtime": 270.6762, + "eval_samples_per_second": 6.262, + "eval_steps_per_second": 3.133, + "step": 143 + }, + { + "epoch": 2.8979591836734695, + "grad_norm": 0.20910531282424927, + "learning_rate": 3.615354575300166e-05, + "loss": 1.2541, + "step": 144 + }, + { + "epoch": 2.9183673469387754, + "grad_norm": 0.19536806643009186, + "learning_rate": 3.4862751727777797e-05, + "loss": 1.2517, + "step": 145 + }, + { + "epoch": 2.938775510204082, + "grad_norm": 0.18630966544151306, + "learning_rate": 3.3590539723276083e-05, + "loss": 1.2473, + "step": 146 + }, + { + "epoch": 2.9591836734693877, + "grad_norm": 0.1874723732471466, + "learning_rate": 3.233727266962425e-05, + "loss": 1.244, + "step": 147 + }, + { + "epoch": 2.979591836734694, + "grad_norm": 0.1764463186264038, + "learning_rate": 3.110330809243134e-05, + "loss": 1.2465, + "step": 148 + }, + { + "epoch": 3.0, + "grad_norm": 0.16570010781288147, + "learning_rate": 2.9888998010794743e-05, + "loss": 1.2443, + "step": 149 + }, + { + "epoch": 3.020408163265306, + "grad_norm": 0.18820856511592865, + "learning_rate": 2.869468883687798e-05, + "loss": 1.2694, + "step": 150 + }, + { + "epoch": 3.020408163265306, + "grad_norm": 0.2009415626525879, + "learning_rate": 2.7520721277088024e-05, + "loss": 1.2185, + "step": 151 + }, + { + "epoch": 3.0408163265306123, + "grad_norm": 0.1824546605348587, + "learning_rate": 2.6367430234880284e-05, + "loss": 1.2222, + "step": 152 + }, + { + "epoch": 3.061224489795918, + "grad_norm": 0.180531844496727, + "learning_rate": 2.523514471521913e-05, + "loss": 1.2592, + "step": 153 + }, + { + "epoch": 3.0816326530612246, + "grad_norm": 0.17422904074192047, + "learning_rate": 2.4124187730720917e-05, + "loss": 1.2429, + "step": 154 + }, + { + "epoch": 3.1020408163265305, + "grad_norm": 0.17531636357307434, + "learning_rate": 2.3034876209506772e-05, + "loss": 1.2459, + "step": 155 + }, + { + "epoch": 3.122448979591837, + "grad_norm": 0.17256909608840942, + "learning_rate": 2.1967520904790827e-05, + "loss": 1.2523, + "step": 156 + }, + { + "epoch": 3.122448979591837, + "eval_loss": 1.240277886390686, + "eval_runtime": 270.7279, + "eval_samples_per_second": 6.261, + "eval_steps_per_second": 3.132, + "step": 156 + }, + { + "epoch": 3.142857142857143, + "grad_norm": 0.17711801826953888, + "learning_rate": 2.092242630623016e-05, + "loss": 1.2416, + "step": 157 + }, + { + "epoch": 3.163265306122449, + "grad_norm": 0.1642543524503708, + "learning_rate": 1.9899890553061562e-05, + "loss": 1.2563, + "step": 158 + }, + { + "epoch": 3.183673469387755, + "grad_norm": 0.17609795928001404, + "learning_rate": 1.8900205349049904e-05, + "loss": 1.2406, + "step": 159 + }, + { + "epoch": 3.204081632653061, + "grad_norm": 0.18534283339977264, + "learning_rate": 1.7923655879272393e-05, + "loss": 1.2522, + "step": 160 + }, + { + "epoch": 3.2244897959183674, + "grad_norm": 0.17926208674907684, + "learning_rate": 1.6970520728762375e-05, + "loss": 1.2315, + "step": 161 + }, + { + "epoch": 3.2448979591836733, + "grad_norm": 0.18245543539524078, + "learning_rate": 1.60410718030361e-05, + "loss": 1.2493, + "step": 162 + }, + { + "epoch": 3.2653061224489797, + "grad_norm": 0.16576482355594635, + "learning_rate": 1.5135574250524897e-05, + "loss": 1.2633, + "step": 163 + }, + { + "epoch": 3.2857142857142856, + "grad_norm": 0.1768399477005005, + "learning_rate": 1.425428638693489e-05, + "loss": 1.2399, + "step": 164 + }, + { + "epoch": 3.306122448979592, + "grad_norm": 0.17402540147304535, + "learning_rate": 1.339745962155613e-05, + "loss": 1.2574, + "step": 165 + }, + { + "epoch": 3.326530612244898, + "grad_norm": 0.17550399899482727, + "learning_rate": 1.2565338385541792e-05, + "loss": 1.2429, + "step": 166 + }, + { + "epoch": 3.3469387755102042, + "grad_norm": 0.18776686489582062, + "learning_rate": 1.1758160062178093e-05, + "loss": 1.2378, + "step": 167 + }, + { + "epoch": 3.36734693877551, + "grad_norm": 0.1816324144601822, + "learning_rate": 1.097615491916485e-05, + "loss": 1.2503, + "step": 168 + }, + { + "epoch": 3.387755102040816, + "grad_norm": 0.17802877724170685, + "learning_rate": 1.0219546042925843e-05, + "loss": 1.2468, + "step": 169 + }, + { + "epoch": 3.387755102040816, + "eval_loss": 1.2385426759719849, + "eval_runtime": 270.6389, + "eval_samples_per_second": 6.263, + "eval_steps_per_second": 3.133, + "step": 169 + }, + { + "epoch": 3.4081632653061225, + "grad_norm": 0.1731177568435669, + "learning_rate": 9.488549274967872e-06, + "loss": 1.2431, + "step": 170 + }, + { + "epoch": 3.4285714285714284, + "grad_norm": 0.16203820705413818, + "learning_rate": 8.783373150306661e-06, + "loss": 1.2394, + "step": 171 + }, + { + "epoch": 3.4489795918367347, + "grad_norm": 0.1603914201259613, + "learning_rate": 8.10421883797694e-06, + "loss": 1.2317, + "step": 172 + }, + { + "epoch": 3.4693877551020407, + "grad_norm": 0.16672447323799133, + "learning_rate": 7.4512800836440525e-06, + "loss": 1.2382, + "step": 173 + }, + { + "epoch": 3.489795918367347, + "grad_norm": 0.16903318464756012, + "learning_rate": 6.824743154333157e-06, + "loss": 1.2406, + "step": 174 + }, + { + "epoch": 3.510204081632653, + "grad_norm": 0.16718582808971405, + "learning_rate": 6.22478678529197e-06, + "loss": 1.2253, + "step": 175 + }, + { + "epoch": 3.5306122448979593, + "grad_norm": 0.16773243248462677, + "learning_rate": 5.651582129001986e-06, + "loss": 1.2545, + "step": 176 + }, + { + "epoch": 3.5510204081632653, + "grad_norm": 0.16658060252666473, + "learning_rate": 5.105292706353093e-06, + "loss": 1.2329, + "step": 177 + }, + { + "epoch": 3.571428571428571, + "grad_norm": 0.16760899126529694, + "learning_rate": 4.586074359995119e-06, + "loss": 1.2218, + "step": 178 + }, + { + "epoch": 3.5918367346938775, + "grad_norm": 0.17462213337421417, + "learning_rate": 4.094075209879788e-06, + "loss": 1.236, + "step": 179 + }, + { + "epoch": 3.612244897959184, + "grad_norm": 0.16253593564033508, + "learning_rate": 3.6294356110059157e-06, + "loss": 1.2518, + "step": 180 + }, + { + "epoch": 3.63265306122449, + "grad_norm": 0.16653120517730713, + "learning_rate": 3.1922881133795825e-06, + "loss": 1.2171, + "step": 181 + }, + { + "epoch": 3.6530612244897958, + "grad_norm": 0.1757594645023346, + "learning_rate": 2.7827574242009437e-06, + "loss": 1.2476, + "step": 182 + }, + { + "epoch": 3.6530612244897958, + "eval_loss": 1.237037181854248, + "eval_runtime": 270.3815, + "eval_samples_per_second": 6.269, + "eval_steps_per_second": 3.136, + "step": 182 + }, + { + "epoch": 3.673469387755102, + "grad_norm": 0.1665186882019043, + "learning_rate": 2.4009603722884742e-06, + "loss": 1.2497, + "step": 183 + }, + { + "epoch": 3.693877551020408, + "grad_norm": 0.17469817399978638, + "learning_rate": 2.0470058747505516e-06, + "loss": 1.2426, + "step": 184 + }, + { + "epoch": 3.7142857142857144, + "grad_norm": 0.17130160331726074, + "learning_rate": 1.7209949059142083e-06, + "loss": 1.2255, + "step": 185 + }, + { + "epoch": 3.7346938775510203, + "grad_norm": 0.1677573323249817, + "learning_rate": 1.4230204685196203e-06, + "loss": 1.2643, + "step": 186 + }, + { + "epoch": 3.7551020408163263, + "grad_norm": 0.16778886318206787, + "learning_rate": 1.1531675671888619e-06, + "loss": 1.234, + "step": 187 + }, + { + "epoch": 3.7755102040816326, + "grad_norm": 0.16397559642791748, + "learning_rate": 9.11513184176116e-07, + "loss": 1.2509, + "step": 188 + }, + { + "epoch": 3.795918367346939, + "grad_norm": 0.16539420187473297, + "learning_rate": 6.981262574066394e-07, + "loss": 1.2425, + "step": 189 + }, + { + "epoch": 3.816326530612245, + "grad_norm": 0.18255014717578888, + "learning_rate": 5.130676608104845e-07, + "loss": 1.2628, + "step": 190 + }, + { + "epoch": 3.836734693877551, + "grad_norm": 0.16024163365364075, + "learning_rate": 3.56390186956701e-07, + "loss": 1.2331, + "step": 191 + }, + { + "epoch": 3.857142857142857, + "grad_norm": 0.17575234174728394, + "learning_rate": 2.2813853199292746e-07, + "loss": 1.2497, + "step": 192 + }, + { + "epoch": 3.877551020408163, + "grad_norm": 0.1590609848499298, + "learning_rate": 1.2834928289472416e-07, + "loss": 1.2436, + "step": 193 + }, + { + "epoch": 3.8979591836734695, + "grad_norm": 0.17772971093654633, + "learning_rate": 5.705090702819993e-08, + "loss": 1.2361, + "step": 194 + }, + { + "epoch": 3.9183673469387754, + "grad_norm": 0.15970654785633087, + "learning_rate": 1.426374402901942e-08, + "loss": 1.2366, + "step": 195 + }, + { + "epoch": 3.9183673469387754, + "eval_loss": 1.2375136613845825, + "eval_runtime": 270.7418, + "eval_samples_per_second": 6.261, + "eval_steps_per_second": 3.132, + "step": 195 + }, + { + "epoch": 3.938775510204082, + "grad_norm": 0.15187527239322662, + "learning_rate": 0.0, + "loss": 1.2409, + "step": 196 + } + ], + "logging_steps": 1, + "max_steps": 196, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 49, + "total_flos": 4.083740321198899e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-196/training_args.bin b/checkpoint-196/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..be517173d48b34e38c41c52f74f7e02495562a61 --- /dev/null +++ b/checkpoint-196/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9cc4ff61a4799ac22d5c627a0169a20cc75619ae1b5871f2d114f95284d87a6 +size 5816 diff --git a/checkpoint-49/README.md b/checkpoint-49/README.md new file mode 100644 index 0000000000000000000000000000000000000000..78aadd3258d05c05cabc678473b6f0942cd441c7 --- /dev/null +++ b/checkpoint-49/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.10.0 \ No newline at end of file diff --git a/checkpoint-49/adapter_config.json b/checkpoint-49/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3a88eecac3a652d9c19fe6d8e555dcac4859ccd2 --- /dev/null +++ b/checkpoint-49/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "down_proj", + "k_proj", + "q_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-49/adapter_model.safetensors b/checkpoint-49/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2be6674fb682f39b548ed9e9bd5da5609f3dc247 --- /dev/null +++ b/checkpoint-49/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15a7afbbb6db02fdac7ffe868d42729e1c9515f835763d3b9551db4ae31e3529 +size 100966336 diff --git a/checkpoint-49/optimizer.pt b/checkpoint-49/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..25c358c12ab69ae68df830c4eac3f66906a29560 --- /dev/null +++ b/checkpoint-49/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9db4145fa287fcc2dc98bac341ab537efce6a4407796361cd24ac6b2176f6a70 +size 50916644 diff --git a/checkpoint-49/rng_state.pth b/checkpoint-49/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..da1cdd4a55a9f91350d2cac6a5db9d6937576d0a --- /dev/null +++ b/checkpoint-49/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41e595b32f221472ac195c50986dfcd13bac01a4909d487f497aaa38e078d0c2 +size 14244 diff --git a/checkpoint-49/scheduler.pt b/checkpoint-49/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..17d9c23995b4c00077c7f6144a172ccd082a6603 --- /dev/null +++ b/checkpoint-49/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e5af14094f757ccb041613325b6c93fe808050ec47f3a4ec285ab4a0e229950 +size 1064 diff --git a/checkpoint-49/special_tokens_map.json b/checkpoint-49/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1 --- /dev/null +++ b/checkpoint-49/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-49/tokenizer.model b/checkpoint-49/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-49/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-49/tokenizer_config.json b/checkpoint-49/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0da2d3613b2ca0d42b48dea2162c06006f52b837 --- /dev/null +++ b/checkpoint-49/tokenizer_config.json @@ -0,0 +1,45 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": true +} diff --git a/checkpoint-49/trainer_state.json b/checkpoint-49/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b2df254a5cae83ed593542b19908da9e31dbd7c7 --- /dev/null +++ b/checkpoint-49/trainer_state.json @@ -0,0 +1,396 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 13, + "global_step": 49, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02040816326530612, + "grad_norm": 0.7881951332092285, + "learning_rate": 2e-05, + "loss": 2.7509, + "step": 1 + }, + { + "epoch": 0.02040816326530612, + "eval_loss": 2.6902382373809814, + "eval_runtime": 269.5606, + "eval_samples_per_second": 6.288, + "eval_steps_per_second": 3.146, + "step": 1 + }, + { + "epoch": 0.04081632653061224, + "grad_norm": 0.789082407951355, + "learning_rate": 4e-05, + "loss": 2.7449, + "step": 2 + }, + { + "epoch": 0.061224489795918366, + "grad_norm": 0.7354114055633545, + "learning_rate": 6e-05, + "loss": 2.7164, + "step": 3 + }, + { + "epoch": 0.08163265306122448, + "grad_norm": 0.7292255759239197, + "learning_rate": 8e-05, + "loss": 2.7174, + "step": 4 + }, + { + "epoch": 0.10204081632653061, + "grad_norm": 0.6898028254508972, + "learning_rate": 0.0001, + "loss": 2.6891, + "step": 5 + }, + { + "epoch": 0.12244897959183673, + "grad_norm": 0.6861400604248047, + "learning_rate": 0.00012, + "loss": 2.6545, + "step": 6 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 0.7510350346565247, + "learning_rate": 0.00014, + "loss": 2.5656, + "step": 7 + }, + { + "epoch": 0.16326530612244897, + "grad_norm": 0.8011165261268616, + "learning_rate": 0.00016, + "loss": 2.4519, + "step": 8 + }, + { + "epoch": 0.1836734693877551, + "grad_norm": 0.8624005317687988, + "learning_rate": 0.00018, + "loss": 2.3178, + "step": 9 + }, + { + "epoch": 0.20408163265306123, + "grad_norm": 0.8004987835884094, + "learning_rate": 0.0002, + "loss": 2.1783, + "step": 10 + }, + { + "epoch": 0.22448979591836735, + "grad_norm": 0.6362400054931641, + "learning_rate": 0.000199985736255971, + "loss": 2.0252, + "step": 11 + }, + { + "epoch": 0.24489795918367346, + "grad_norm": 0.7930936217308044, + "learning_rate": 0.0001999429490929718, + "loss": 1.8839, + "step": 12 + }, + { + "epoch": 0.2653061224489796, + "grad_norm": 0.5149843096733093, + "learning_rate": 0.00019987165071710527, + "loss": 1.8064, + "step": 13 + }, + { + "epoch": 0.2653061224489796, + "eval_loss": 1.6734941005706787, + "eval_runtime": 271.2615, + "eval_samples_per_second": 6.249, + "eval_steps_per_second": 3.126, + "step": 13 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.42121434211730957, + "learning_rate": 0.00019977186146800707, + "loss": 1.7922, + "step": 14 + }, + { + "epoch": 0.30612244897959184, + "grad_norm": 0.3523242771625519, + "learning_rate": 0.0001996436098130433, + "loss": 1.7711, + "step": 15 + }, + { + "epoch": 0.32653061224489793, + "grad_norm": 0.3384595215320587, + "learning_rate": 0.00019948693233918952, + "loss": 1.7152, + "step": 16 + }, + { + "epoch": 0.3469387755102041, + "grad_norm": 0.34942421317100525, + "learning_rate": 0.00019930187374259337, + "loss": 1.7112, + "step": 17 + }, + { + "epoch": 0.3673469387755102, + "grad_norm": 0.31712639331817627, + "learning_rate": 0.00019908848681582391, + "loss": 1.7059, + "step": 18 + }, + { + "epoch": 0.3877551020408163, + "grad_norm": 0.2875436842441559, + "learning_rate": 0.00019884683243281116, + "loss": 1.6468, + "step": 19 + }, + { + "epoch": 0.40816326530612246, + "grad_norm": 0.24433130025863647, + "learning_rate": 0.00019857697953148037, + "loss": 1.6408, + "step": 20 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 0.21414674818515778, + "learning_rate": 0.00019827900509408581, + "loss": 1.616, + "step": 21 + }, + { + "epoch": 0.4489795918367347, + "grad_norm": 0.21537622809410095, + "learning_rate": 0.00019795299412524945, + "loss": 1.609, + "step": 22 + }, + { + "epoch": 0.46938775510204084, + "grad_norm": 0.2432074397802353, + "learning_rate": 0.00019759903962771156, + "loss": 1.6066, + "step": 23 + }, + { + "epoch": 0.4897959183673469, + "grad_norm": 0.2359839379787445, + "learning_rate": 0.00019721724257579907, + "loss": 1.5851, + "step": 24 + }, + { + "epoch": 0.5102040816326531, + "grad_norm": 0.22065888345241547, + "learning_rate": 0.00019680771188662044, + "loss": 1.5739, + "step": 25 + }, + { + "epoch": 0.5306122448979592, + "grad_norm": 0.20339132845401764, + "learning_rate": 0.0001963705643889941, + "loss": 1.5513, + "step": 26 + }, + { + "epoch": 0.5306122448979592, + "eval_loss": 1.4832030534744263, + "eval_runtime": 271.2449, + "eval_samples_per_second": 6.249, + "eval_steps_per_second": 3.126, + "step": 26 + }, + { + "epoch": 0.5510204081632653, + "grad_norm": 0.18875224888324738, + "learning_rate": 0.00019590592479012023, + "loss": 1.5378, + "step": 27 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.18564417958259583, + "learning_rate": 0.00019541392564000488, + "loss": 1.5212, + "step": 28 + }, + { + "epoch": 0.5918367346938775, + "grad_norm": 0.16226942837238312, + "learning_rate": 0.00019489470729364692, + "loss": 1.5391, + "step": 29 + }, + { + "epoch": 0.6122448979591837, + "grad_norm": 0.15650039911270142, + "learning_rate": 0.00019434841787099803, + "loss": 1.511, + "step": 30 + }, + { + "epoch": 0.6326530612244898, + "grad_norm": 0.15976540744304657, + "learning_rate": 0.00019377521321470805, + "loss": 1.5119, + "step": 31 + }, + { + "epoch": 0.6530612244897959, + "grad_norm": 0.16409288346767426, + "learning_rate": 0.00019317525684566685, + "loss": 1.4909, + "step": 32 + }, + { + "epoch": 0.673469387755102, + "grad_norm": 0.15468019247055054, + "learning_rate": 0.00019254871991635598, + "loss": 1.4951, + "step": 33 + }, + { + "epoch": 0.6938775510204082, + "grad_norm": 0.1462036371231079, + "learning_rate": 0.00019189578116202307, + "loss": 1.4643, + "step": 34 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.1541963368654251, + "learning_rate": 0.00019121662684969335, + "loss": 1.5159, + "step": 35 + }, + { + "epoch": 0.7346938775510204, + "grad_norm": 0.14798064529895782, + "learning_rate": 0.00019051145072503215, + "loss": 1.4741, + "step": 36 + }, + { + "epoch": 0.7551020408163265, + "grad_norm": 0.13914817571640015, + "learning_rate": 0.00018978045395707418, + "loss": 1.4788, + "step": 37 + }, + { + "epoch": 0.7755102040816326, + "grad_norm": 0.15608824789524078, + "learning_rate": 0.00018902384508083517, + "loss": 1.4687, + "step": 38 + }, + { + "epoch": 0.7959183673469388, + "grad_norm": 0.14460116624832153, + "learning_rate": 0.00018824183993782192, + "loss": 1.482, + "step": 39 + }, + { + "epoch": 0.7959183673469388, + "eval_loss": 1.411073088645935, + "eval_runtime": 271.292, + "eval_samples_per_second": 6.248, + "eval_steps_per_second": 3.126, + "step": 39 + }, + { + "epoch": 0.8163265306122449, + "grad_norm": 0.15740551054477692, + "learning_rate": 0.00018743466161445823, + "loss": 1.4486, + "step": 40 + }, + { + "epoch": 0.8367346938775511, + "grad_norm": 0.14149661362171173, + "learning_rate": 0.00018660254037844388, + "loss": 1.4353, + "step": 41 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.14034292101860046, + "learning_rate": 0.0001857457136130651, + "loss": 1.4523, + "step": 42 + }, + { + "epoch": 0.8775510204081632, + "grad_norm": 0.1487722396850586, + "learning_rate": 0.00018486442574947511, + "loss": 1.4095, + "step": 43 + }, + { + "epoch": 0.8979591836734694, + "grad_norm": 0.17400234937667847, + "learning_rate": 0.00018395892819696389, + "loss": 1.4414, + "step": 44 + }, + { + "epoch": 0.9183673469387755, + "grad_norm": 0.1741325408220291, + "learning_rate": 0.00018302947927123766, + "loss": 1.4379, + "step": 45 + }, + { + "epoch": 0.9387755102040817, + "grad_norm": 0.15319454669952393, + "learning_rate": 0.00018207634412072764, + "loss": 1.405, + "step": 46 + }, + { + "epoch": 0.9591836734693877, + "grad_norm": 0.15876264870166779, + "learning_rate": 0.00018109979465095013, + "loss": 1.4122, + "step": 47 + }, + { + "epoch": 0.9795918367346939, + "grad_norm": 0.17120805382728577, + "learning_rate": 0.00018010010944693848, + "loss": 1.4132, + "step": 48 + }, + { + "epoch": 1.0, + "grad_norm": 0.1436116099357605, + "learning_rate": 0.00017907757369376985, + "loss": 1.416, + "step": 49 + } + ], + "logging_steps": 1, + "max_steps": 196, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 49, + "total_flos": 1.0209350802997248e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-49/training_args.bin b/checkpoint-49/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..be517173d48b34e38c41c52f74f7e02495562a61 --- /dev/null +++ b/checkpoint-49/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9cc4ff61a4799ac22d5c627a0169a20cc75619ae1b5871f2d114f95284d87a6 +size 5816 diff --git a/checkpoint-98/README.md b/checkpoint-98/README.md new file mode 100644 index 0000000000000000000000000000000000000000..78aadd3258d05c05cabc678473b6f0942cd441c7 --- /dev/null +++ b/checkpoint-98/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.10.0 \ No newline at end of file diff --git a/checkpoint-98/adapter_config.json b/checkpoint-98/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3a88eecac3a652d9c19fe6d8e555dcac4859ccd2 --- /dev/null +++ b/checkpoint-98/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "down_proj", + "k_proj", + "q_proj", + "gate_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-98/adapter_model.safetensors b/checkpoint-98/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c25b0063b69349b0cb6176f283c217cdd9246694 --- /dev/null +++ b/checkpoint-98/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88f74a76e06a6e5698ca16a682f4fa5d7e5c10182d165fe6c9327116444b10d0 +size 100966336 diff --git a/checkpoint-98/optimizer.pt b/checkpoint-98/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..eec9febc5f3485dcb35178d817f3969b8c7725b8 --- /dev/null +++ b/checkpoint-98/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c01f653a0ce9ea304a86d075b21cd51ea729659b91629c555eec65181dd1818 +size 50916644 diff --git a/checkpoint-98/rng_state.pth b/checkpoint-98/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4f8bffc16c04a6d97a2f70d9897cc2b0f5b5ec32 --- /dev/null +++ b/checkpoint-98/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff339d3bf5bb702320fd9a759e0988b159a701364f186575c95d51b72519d7a1 +size 14244 diff --git a/checkpoint-98/scheduler.pt b/checkpoint-98/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b09734329c851152a05af267cd70e47082a0b481 --- /dev/null +++ b/checkpoint-98/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e31465eabc96d2c0b0dc68386782c8ea3a5771edcba13d0d620c4297cd31957 +size 1064 diff --git a/checkpoint-98/special_tokens_map.json b/checkpoint-98/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1 --- /dev/null +++ b/checkpoint-98/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-98/tokenizer.model b/checkpoint-98/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-98/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-98/tokenizer_config.json b/checkpoint-98/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0da2d3613b2ca0d42b48dea2162c06006f52b837 --- /dev/null +++ b/checkpoint-98/tokenizer_config.json @@ -0,0 +1,45 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": true +} diff --git a/checkpoint-98/trainer_state.json b/checkpoint-98/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c68a94122621cbfdc175825e3de764c888b9d078 --- /dev/null +++ b/checkpoint-98/trainer_state.json @@ -0,0 +1,771 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9795918367346939, + "eval_steps": 13, + "global_step": 98, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02040816326530612, + "grad_norm": 0.7881951332092285, + "learning_rate": 2e-05, + "loss": 2.7509, + "step": 1 + }, + { + "epoch": 0.02040816326530612, + "eval_loss": 2.6902382373809814, + "eval_runtime": 269.5606, + "eval_samples_per_second": 6.288, + "eval_steps_per_second": 3.146, + "step": 1 + }, + { + "epoch": 0.04081632653061224, + "grad_norm": 0.789082407951355, + "learning_rate": 4e-05, + "loss": 2.7449, + "step": 2 + }, + { + "epoch": 0.061224489795918366, + "grad_norm": 0.7354114055633545, + "learning_rate": 6e-05, + "loss": 2.7164, + "step": 3 + }, + { + "epoch": 0.08163265306122448, + "grad_norm": 0.7292255759239197, + "learning_rate": 8e-05, + "loss": 2.7174, + "step": 4 + }, + { + "epoch": 0.10204081632653061, + "grad_norm": 0.6898028254508972, + "learning_rate": 0.0001, + "loss": 2.6891, + "step": 5 + }, + { + "epoch": 0.12244897959183673, + "grad_norm": 0.6861400604248047, + "learning_rate": 0.00012, + "loss": 2.6545, + "step": 6 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 0.7510350346565247, + "learning_rate": 0.00014, + "loss": 2.5656, + "step": 7 + }, + { + "epoch": 0.16326530612244897, + "grad_norm": 0.8011165261268616, + "learning_rate": 0.00016, + "loss": 2.4519, + "step": 8 + }, + { + "epoch": 0.1836734693877551, + "grad_norm": 0.8624005317687988, + "learning_rate": 0.00018, + "loss": 2.3178, + "step": 9 + }, + { + "epoch": 0.20408163265306123, + "grad_norm": 0.8004987835884094, + "learning_rate": 0.0002, + "loss": 2.1783, + "step": 10 + }, + { + "epoch": 0.22448979591836735, + "grad_norm": 0.6362400054931641, + "learning_rate": 0.000199985736255971, + "loss": 2.0252, + "step": 11 + }, + { + "epoch": 0.24489795918367346, + "grad_norm": 0.7930936217308044, + "learning_rate": 0.0001999429490929718, + "loss": 1.8839, + "step": 12 + }, + { + "epoch": 0.2653061224489796, + "grad_norm": 0.5149843096733093, + "learning_rate": 0.00019987165071710527, + "loss": 1.8064, + "step": 13 + }, + { + "epoch": 0.2653061224489796, + "eval_loss": 1.6734941005706787, + "eval_runtime": 271.2615, + "eval_samples_per_second": 6.249, + "eval_steps_per_second": 3.126, + "step": 13 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.42121434211730957, + "learning_rate": 0.00019977186146800707, + "loss": 1.7922, + "step": 14 + }, + { + "epoch": 0.30612244897959184, + "grad_norm": 0.3523242771625519, + "learning_rate": 0.0001996436098130433, + "loss": 1.7711, + "step": 15 + }, + { + "epoch": 0.32653061224489793, + "grad_norm": 0.3384595215320587, + "learning_rate": 0.00019948693233918952, + "loss": 1.7152, + "step": 16 + }, + { + "epoch": 0.3469387755102041, + "grad_norm": 0.34942421317100525, + "learning_rate": 0.00019930187374259337, + "loss": 1.7112, + "step": 17 + }, + { + "epoch": 0.3673469387755102, + "grad_norm": 0.31712639331817627, + "learning_rate": 0.00019908848681582391, + "loss": 1.7059, + "step": 18 + }, + { + "epoch": 0.3877551020408163, + "grad_norm": 0.2875436842441559, + "learning_rate": 0.00019884683243281116, + "loss": 1.6468, + "step": 19 + }, + { + "epoch": 0.40816326530612246, + "grad_norm": 0.24433130025863647, + "learning_rate": 0.00019857697953148037, + "loss": 1.6408, + "step": 20 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 0.21414674818515778, + "learning_rate": 0.00019827900509408581, + "loss": 1.616, + "step": 21 + }, + { + "epoch": 0.4489795918367347, + "grad_norm": 0.21537622809410095, + "learning_rate": 0.00019795299412524945, + "loss": 1.609, + "step": 22 + }, + { + "epoch": 0.46938775510204084, + "grad_norm": 0.2432074397802353, + "learning_rate": 0.00019759903962771156, + "loss": 1.6066, + "step": 23 + }, + { + "epoch": 0.4897959183673469, + "grad_norm": 0.2359839379787445, + "learning_rate": 0.00019721724257579907, + "loss": 1.5851, + "step": 24 + }, + { + "epoch": 0.5102040816326531, + "grad_norm": 0.22065888345241547, + "learning_rate": 0.00019680771188662044, + "loss": 1.5739, + "step": 25 + }, + { + "epoch": 0.5306122448979592, + "grad_norm": 0.20339132845401764, + "learning_rate": 0.0001963705643889941, + "loss": 1.5513, + "step": 26 + }, + { + "epoch": 0.5306122448979592, + "eval_loss": 1.4832030534744263, + "eval_runtime": 271.2449, + "eval_samples_per_second": 6.249, + "eval_steps_per_second": 3.126, + "step": 26 + }, + { + "epoch": 0.5510204081632653, + "grad_norm": 0.18875224888324738, + "learning_rate": 0.00019590592479012023, + "loss": 1.5378, + "step": 27 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.18564417958259583, + "learning_rate": 0.00019541392564000488, + "loss": 1.5212, + "step": 28 + }, + { + "epoch": 0.5918367346938775, + "grad_norm": 0.16226942837238312, + "learning_rate": 0.00019489470729364692, + "loss": 1.5391, + "step": 29 + }, + { + "epoch": 0.6122448979591837, + "grad_norm": 0.15650039911270142, + "learning_rate": 0.00019434841787099803, + "loss": 1.511, + "step": 30 + }, + { + "epoch": 0.6326530612244898, + "grad_norm": 0.15976540744304657, + "learning_rate": 0.00019377521321470805, + "loss": 1.5119, + "step": 31 + }, + { + "epoch": 0.6530612244897959, + "grad_norm": 0.16409288346767426, + "learning_rate": 0.00019317525684566685, + "loss": 1.4909, + "step": 32 + }, + { + "epoch": 0.673469387755102, + "grad_norm": 0.15468019247055054, + "learning_rate": 0.00019254871991635598, + "loss": 1.4951, + "step": 33 + }, + { + "epoch": 0.6938775510204082, + "grad_norm": 0.1462036371231079, + "learning_rate": 0.00019189578116202307, + "loss": 1.4643, + "step": 34 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.1541963368654251, + "learning_rate": 0.00019121662684969335, + "loss": 1.5159, + "step": 35 + }, + { + "epoch": 0.7346938775510204, + "grad_norm": 0.14798064529895782, + "learning_rate": 0.00019051145072503215, + "loss": 1.4741, + "step": 36 + }, + { + "epoch": 0.7551020408163265, + "grad_norm": 0.13914817571640015, + "learning_rate": 0.00018978045395707418, + "loss": 1.4788, + "step": 37 + }, + { + "epoch": 0.7755102040816326, + "grad_norm": 0.15608824789524078, + "learning_rate": 0.00018902384508083517, + "loss": 1.4687, + "step": 38 + }, + { + "epoch": 0.7959183673469388, + "grad_norm": 0.14460116624832153, + "learning_rate": 0.00018824183993782192, + "loss": 1.482, + "step": 39 + }, + { + "epoch": 0.7959183673469388, + "eval_loss": 1.411073088645935, + "eval_runtime": 271.292, + "eval_samples_per_second": 6.248, + "eval_steps_per_second": 3.126, + "step": 39 + }, + { + "epoch": 0.8163265306122449, + "grad_norm": 0.15740551054477692, + "learning_rate": 0.00018743466161445823, + "loss": 1.4486, + "step": 40 + }, + { + "epoch": 0.8367346938775511, + "grad_norm": 0.14149661362171173, + "learning_rate": 0.00018660254037844388, + "loss": 1.4353, + "step": 41 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.14034292101860046, + "learning_rate": 0.0001857457136130651, + "loss": 1.4523, + "step": 42 + }, + { + "epoch": 0.8775510204081632, + "grad_norm": 0.1487722396850586, + "learning_rate": 0.00018486442574947511, + "loss": 1.4095, + "step": 43 + }, + { + "epoch": 0.8979591836734694, + "grad_norm": 0.17400234937667847, + "learning_rate": 0.00018395892819696389, + "loss": 1.4414, + "step": 44 + }, + { + "epoch": 0.9183673469387755, + "grad_norm": 0.1741325408220291, + "learning_rate": 0.00018302947927123766, + "loss": 1.4379, + "step": 45 + }, + { + "epoch": 0.9387755102040817, + "grad_norm": 0.15319454669952393, + "learning_rate": 0.00018207634412072764, + "loss": 1.405, + "step": 46 + }, + { + "epoch": 0.9591836734693877, + "grad_norm": 0.15876264870166779, + "learning_rate": 0.00018109979465095013, + "loss": 1.4122, + "step": 47 + }, + { + "epoch": 0.9795918367346939, + "grad_norm": 0.17120805382728577, + "learning_rate": 0.00018010010944693848, + "loss": 1.4132, + "step": 48 + }, + { + "epoch": 1.0, + "grad_norm": 0.1436116099357605, + "learning_rate": 0.00017907757369376985, + "loss": 1.416, + "step": 49 + }, + { + "epoch": 1.0204081632653061, + "grad_norm": 0.1707429438829422, + "learning_rate": 0.0001780324790952092, + "loss": 1.3913, + "step": 50 + }, + { + "epoch": 1.0204081632653061, + "grad_norm": 0.17117524147033691, + "learning_rate": 0.00017696512379049325, + "loss": 1.3963, + "step": 51 + }, + { + "epoch": 1.0408163265306123, + "grad_norm": 0.13410089910030365, + "learning_rate": 0.0001758758122692791, + "loss": 1.392, + "step": 52 + }, + { + "epoch": 1.0408163265306123, + "eval_loss": 1.3676769733428955, + "eval_runtime": 270.8566, + "eval_samples_per_second": 6.258, + "eval_steps_per_second": 3.131, + "step": 52 + }, + { + "epoch": 1.0612244897959184, + "grad_norm": 0.18877607583999634, + "learning_rate": 0.00017476485528478093, + "loss": 1.3854, + "step": 53 + }, + { + "epoch": 1.0816326530612246, + "grad_norm": 0.1752927452325821, + "learning_rate": 0.00017363256976511972, + "loss": 1.3759, + "step": 54 + }, + { + "epoch": 1.1020408163265305, + "grad_norm": 0.17180170118808746, + "learning_rate": 0.000172479278722912, + "loss": 1.3614, + "step": 55 + }, + { + "epoch": 1.1224489795918366, + "grad_norm": 0.1640290915966034, + "learning_rate": 0.00017130531116312203, + "loss": 1.3853, + "step": 56 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.2047068476676941, + "learning_rate": 0.0001701110019892053, + "loss": 1.3699, + "step": 57 + }, + { + "epoch": 1.163265306122449, + "grad_norm": 0.1835869997739792, + "learning_rate": 0.00016889669190756868, + "loss": 1.3403, + "step": 58 + }, + { + "epoch": 1.183673469387755, + "grad_norm": 0.16733241081237793, + "learning_rate": 0.00016766272733037576, + "loss": 1.3609, + "step": 59 + }, + { + "epoch": 1.2040816326530612, + "grad_norm": 0.178726926445961, + "learning_rate": 0.00016640946027672392, + "loss": 1.3651, + "step": 60 + }, + { + "epoch": 1.2244897959183674, + "grad_norm": 0.16719630360603333, + "learning_rate": 0.00016513724827222227, + "loss": 1.3676, + "step": 61 + }, + { + "epoch": 1.2448979591836735, + "grad_norm": 0.15999363362789154, + "learning_rate": 0.00016384645424699835, + "loss": 1.3651, + "step": 62 + }, + { + "epoch": 1.2653061224489797, + "grad_norm": 0.1705988198518753, + "learning_rate": 0.00016253744643216368, + "loss": 1.3757, + "step": 63 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 0.14996370673179626, + "learning_rate": 0.0001612105982547663, + "loss": 1.3474, + "step": 64 + }, + { + "epoch": 1.306122448979592, + "grad_norm": 0.19127260148525238, + "learning_rate": 0.0001598662882312615, + "loss": 1.3414, + "step": 65 + }, + { + "epoch": 1.306122448979592, + "eval_loss": 1.331880807876587, + "eval_runtime": 270.8424, + "eval_samples_per_second": 6.258, + "eval_steps_per_second": 3.131, + "step": 65 + }, + { + "epoch": 1.3265306122448979, + "grad_norm": 0.16125527024269104, + "learning_rate": 0.00015850489985953076, + "loss": 1.3509, + "step": 66 + }, + { + "epoch": 1.346938775510204, + "grad_norm": 0.1979473978281021, + "learning_rate": 0.00015712682150947923, + "loss": 1.3579, + "step": 67 + }, + { + "epoch": 1.3673469387755102, + "grad_norm": 0.18317992985248566, + "learning_rate": 0.00015573244631224365, + "loss": 1.3341, + "step": 68 + }, + { + "epoch": 1.3877551020408163, + "grad_norm": 0.1646898239850998, + "learning_rate": 0.0001543221720480419, + "loss": 1.3361, + "step": 69 + }, + { + "epoch": 1.4081632653061225, + "grad_norm": 0.1760271042585373, + "learning_rate": 0.00015289640103269625, + "loss": 1.358, + "step": 70 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.165283203125, + "learning_rate": 0.0001514555400028629, + "loss": 1.3072, + "step": 71 + }, + { + "epoch": 1.4489795918367347, + "grad_norm": 0.1507076472043991, + "learning_rate": 0.00015000000000000001, + "loss": 1.3133, + "step": 72 + }, + { + "epoch": 1.469387755102041, + "grad_norm": 0.16913647949695587, + "learning_rate": 0.00014853019625310813, + "loss": 1.3232, + "step": 73 + }, + { + "epoch": 1.489795918367347, + "grad_norm": 0.18266479671001434, + "learning_rate": 0.0001470465480602756, + "loss": 1.3512, + "step": 74 + }, + { + "epoch": 1.510204081632653, + "grad_norm": 0.19301828742027283, + "learning_rate": 0.0001455494786690634, + "loss": 1.3241, + "step": 75 + }, + { + "epoch": 1.5306122448979593, + "grad_norm": 0.16109652817249298, + "learning_rate": 0.00014403941515576344, + "loss": 1.3256, + "step": 76 + }, + { + "epoch": 1.5510204081632653, + "grad_norm": 0.17053867876529694, + "learning_rate": 0.00014251678830356408, + "loss": 1.3162, + "step": 77 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 0.17348544299602509, + "learning_rate": 0.00014098203247965875, + "loss": 1.3213, + "step": 78 + }, + { + "epoch": 1.5714285714285714, + "eval_loss": 1.3028697967529297, + "eval_runtime": 270.8095, + "eval_samples_per_second": 6.259, + "eval_steps_per_second": 3.131, + "step": 78 + }, + { + "epoch": 1.5918367346938775, + "grad_norm": 0.1703907549381256, + "learning_rate": 0.00013943558551133186, + "loss": 1.3073, + "step": 79 + }, + { + "epoch": 1.6122448979591837, + "grad_norm": 0.17313100397586823, + "learning_rate": 0.0001378778885610576, + "loss": 1.3232, + "step": 80 + }, + { + "epoch": 1.6326530612244898, + "grad_norm": 0.17237025499343872, + "learning_rate": 0.00013630938600064747, + "loss": 1.3406, + "step": 81 + }, + { + "epoch": 1.6530612244897958, + "grad_norm": 0.19658459722995758, + "learning_rate": 0.00013473052528448201, + "loss": 1.3114, + "step": 82 + }, + { + "epoch": 1.6734693877551021, + "grad_norm": 0.20599938929080963, + "learning_rate": 0.0001331417568218636, + "loss": 1.3288, + "step": 83 + }, + { + "epoch": 1.693877551020408, + "grad_norm": 0.17759399116039276, + "learning_rate": 0.00013154353384852558, + "loss": 1.2995, + "step": 84 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.18712250888347626, + "learning_rate": 0.00012993631229733582, + "loss": 1.2895, + "step": 85 + }, + { + "epoch": 1.7346938775510203, + "grad_norm": 0.1991330236196518, + "learning_rate": 0.00012832055066823038, + "loss": 1.2886, + "step": 86 + }, + { + "epoch": 1.7551020408163265, + "grad_norm": 0.22125203907489777, + "learning_rate": 0.00012669670989741517, + "loss": 1.3233, + "step": 87 + }, + { + "epoch": 1.7755102040816326, + "grad_norm": 0.2052813619375229, + "learning_rate": 0.00012506525322587207, + "loss": 1.3079, + "step": 88 + }, + { + "epoch": 1.7959183673469388, + "grad_norm": 0.19290736317634583, + "learning_rate": 0.00012342664606720822, + "loss": 1.3174, + "step": 89 + }, + { + "epoch": 1.816326530612245, + "grad_norm": 0.20912542939186096, + "learning_rate": 0.00012178135587488515, + "loss": 1.2915, + "step": 90 + }, + { + "epoch": 1.836734693877551, + "grad_norm": 0.20760588347911835, + "learning_rate": 0.00012012985200886602, + "loss": 1.3028, + "step": 91 + }, + { + "epoch": 1.836734693877551, + "eval_loss": 1.2795333862304688, + "eval_runtime": 270.6525, + "eval_samples_per_second": 6.263, + "eval_steps_per_second": 3.133, + "step": 91 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 0.1996900886297226, + "learning_rate": 0.00011847260560171896, + "loss": 1.3119, + "step": 92 + }, + { + "epoch": 1.8775510204081631, + "grad_norm": 0.23766876757144928, + "learning_rate": 0.00011681008942421483, + "loss": 1.2978, + "step": 93 + }, + { + "epoch": 1.8979591836734695, + "grad_norm": 0.19782397150993347, + "learning_rate": 0.00011514277775045768, + "loss": 1.2955, + "step": 94 + }, + { + "epoch": 1.9183673469387754, + "grad_norm": 0.22519494593143463, + "learning_rate": 0.00011347114622258612, + "loss": 1.2957, + "step": 95 + }, + { + "epoch": 1.9387755102040818, + "grad_norm": 0.2590245306491852, + "learning_rate": 0.00011179567171508463, + "loss": 1.2809, + "step": 96 + }, + { + "epoch": 1.9591836734693877, + "grad_norm": 0.2235420197248459, + "learning_rate": 0.00011011683219874323, + "loss": 1.2784, + "step": 97 + }, + { + "epoch": 1.9795918367346939, + "grad_norm": 0.285740464925766, + "learning_rate": 0.00010843510660430447, + "loss": 1.309, + "step": 98 + } + ], + "logging_steps": 1, + "max_steps": 196, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 49, + "total_flos": 2.0418701605994496e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-98/training_args.bin b/checkpoint-98/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..be517173d48b34e38c41c52f74f7e02495562a61 --- /dev/null +++ b/checkpoint-98/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9cc4ff61a4799ac22d5c627a0169a20cc75619ae1b5871f2d114f95284d87a6 +size 5816 diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..63c865633310e5244de44820c844cfc9e3b45dbd --- /dev/null +++ b/config.json @@ -0,0 +1,43 @@ +{ + "_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 5632, + "max_position_embeddings": 4096, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 22, + "num_key_value_heads": 4, + "pretraining_tp": 1, + "quantization_config": { + "_load_in_4bit": false, + "_load_in_8bit": true, + "bnb_4bit_compute_dtype": "float32", + "bnb_4bit_quant_storage": "uint8", + "bnb_4bit_quant_type": "fp4", + "bnb_4bit_use_double_quant": false, + "llm_int8_enable_fp32_cpu_offload": false, + "llm_int8_has_fp16_weight": false, + "llm_int8_skip_modules": null, + "llm_int8_threshold": 6.0, + "load_in_4bit": false, + "load_in_8bit": true, + "quant_method": "bitsandbytes" + }, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.40.2", + "use_cache": false, + "vocab_size": 32000 +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0da2d3613b2ca0d42b48dea2162c06006f52b837 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,45 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": true +}