diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d2b6a77a3beb5142b6f7b9970e3581117c0ea094 --- /dev/null +++ b/README.md @@ -0,0 +1,152 @@ +--- +base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T +library_name: peft +license: apache-2.0 +tags: +- generated_from_trainer +model-index: +- name: outputs/lora-out + results: [] +--- + + + +[Built with Axolotl](https://github.com/OpenAccess-AI-Collective/axolotl) +
See axolotl config + +axolotl version: `0.4.1` +```yaml +base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T +model_type: LlamaForCausalLM +tokenizer_type: LlamaTokenizer + +load_in_8bit: true +load_in_4bit: false +strict: false + +datasets: + - path: mhenrichsen/alpaca_2k_test + type: alpaca +dataset_prepared_path: +val_set_size: 0.05 +output_dir: ./outputs/lora-out + +sequence_len: 4096 +sample_packing: true +eval_sample_packing: false +pad_to_sequence_len: true + +adapter: lora +lora_model_dir: +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_linear: true +lora_fan_in_fan_out: + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 4 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +train_on_inputs: false +group_by_length: false +bf16: auto +fp16: +tf32: false + +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: true + +warmup_steps: 10 +evals_per_epoch: 4 +saves_per_epoch: 1 +debug: +deepspeed: +weight_decay: 0.0 +fsdp: +fsdp_config: +special_tokens: + +``` + +

+ +# outputs/lora-out + +This model is a fine-tuned version of [TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T](https://huggingface.co/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T) on the None dataset. +It achieves the following results on the evaluation set: +- Loss: 1.2115 + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 0.0002 +- train_batch_size: 2 +- eval_batch_size: 2 +- seed: 42 +- gradient_accumulation_steps: 4 +- total_train_batch_size: 8 +- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08 +- lr_scheduler_type: cosine +- lr_scheduler_warmup_steps: 10 +- num_epochs: 4 + +### Training results + +| Training Loss | Epoch | Step | Validation Loss | +|:-------------:|:------:|:----:|:---------------:| +| 1.4615 | 0.08 | 1 | 1.4899 | +| 1.3846 | 0.24 | 3 | 1.4859 | +| 1.3667 | 0.48 | 6 | 1.4399 | +| 1.267 | 0.72 | 9 | 1.3382 | +| 1.2276 | 0.96 | 12 | 1.2941 | +| 1.2515 | 1.16 | 15 | 1.2793 | +| 1.2275 | 1.4 | 18 | 1.2556 | +| 1.1351 | 1.6400 | 21 | 1.2347 | +| 1.2701 | 1.88 | 24 | 1.2253 | +| 1.1487 | 2.08 | 27 | 1.2213 | +| 1.1518 | 2.32 | 30 | 1.2209 | +| 1.1942 | 2.56 | 33 | 1.2171 | +| 1.1122 | 2.8 | 36 | 1.2147 | +| 1.1513 | 3.04 | 39 | 1.2139 | +| 1.1887 | 3.24 | 42 | 1.2128 | +| 1.1011 | 3.48 | 45 | 1.2114 | +| 1.1887 | 3.7200 | 48 | 1.2115 | + + +### Framework versions + +- PEFT 0.11.1 +- Transformers 4.42.3 +- Pytorch 2.1.2+cu118 +- Datasets 2.19.1 +- Tokenizers 0.19.1 \ No newline at end of file diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a8c35d776d16976c573cce6af2cc598939155ec6 --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "o_proj", + "v_proj", + "down_proj", + "up_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/adapter_model.bin b/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..5252c7f96d531850133cea161eaff4baa955195f --- /dev/null +++ b/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04765d043cca34767e6be85248961c2d2d756a590969242fed181e87e505d68f +size 101036698 diff --git a/checkpoint-12/README.md b/checkpoint-12/README.md new file mode 100644 index 0000000000000000000000000000000000000000..136e7e248b46366ae875395c8633d12be7b75f80 --- /dev/null +++ b/checkpoint-12/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/checkpoint-12/adapter_config.json b/checkpoint-12/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a8c35d776d16976c573cce6af2cc598939155ec6 --- /dev/null +++ b/checkpoint-12/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "o_proj", + "v_proj", + "down_proj", + "up_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-12/adapter_model.safetensors b/checkpoint-12/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4a1d143297a39d84f15d5c1f9bc73f198bcbcdd5 --- /dev/null +++ b/checkpoint-12/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2851811fb072817c03540939a7171413f508e32a43b2da99b2cd036dfcc4127e +size 100966336 diff --git a/checkpoint-12/optimizer.pt b/checkpoint-12/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d368548a9677f531ffc9dd31c529b0ba605368f0 --- /dev/null +++ b/checkpoint-12/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f91d2e08263a999235f281f300c3430b0a4cb752a269ac1501fb26b9bbf6ed +size 50916644 diff --git a/checkpoint-12/rng_state.pth b/checkpoint-12/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..dc6d4aaa0150d1902e47b1eb95a826c456421fe4 --- /dev/null +++ b/checkpoint-12/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35bb56ec4b2bbb057a5a3c1c341f4ef749ec10d715336336c3c6e1a4afccacd5 +size 14244 diff --git a/checkpoint-12/scheduler.pt b/checkpoint-12/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..48748cf6e74350ca0094e19264c545ccf14b4f53 --- /dev/null +++ b/checkpoint-12/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e535a15f440e3b1b4d1872998a3c1d64048b2d54e365eb59e3aa3a5899e46b5 +size 1064 diff --git a/checkpoint-12/special_tokens_map.json b/checkpoint-12/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1 --- /dev/null +++ b/checkpoint-12/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-12/tokenizer.model b/checkpoint-12/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-12/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-12/tokenizer_config.json b/checkpoint-12/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0773857a13ba5a27453a0b462624fe76e8e82a86 --- /dev/null +++ b/checkpoint-12/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": true +} diff --git a/checkpoint-12/trainer_state.json b/checkpoint-12/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c7e04c0c3ee029a3f14a28ee74dd004426c85c6a --- /dev/null +++ b/checkpoint-12/trainer_state.json @@ -0,0 +1,157 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.96, + "eval_steps": 3, + "global_step": 12, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "grad_norm": 0.17093713581562042, + "learning_rate": 2e-05, + "loss": 1.4615, + "step": 1 + }, + { + "epoch": 0.08, + "eval_loss": 1.4899382591247559, + "eval_runtime": 17.3107, + "eval_samples_per_second": 5.777, + "eval_steps_per_second": 2.888, + "step": 1 + }, + { + "epoch": 0.16, + "grad_norm": 0.19339510798454285, + "learning_rate": 4e-05, + "loss": 1.4241, + "step": 2 + }, + { + "epoch": 0.24, + "grad_norm": 0.1669788658618927, + "learning_rate": 6e-05, + "loss": 1.3846, + "step": 3 + }, + { + "epoch": 0.24, + "eval_loss": 1.4858685731887817, + "eval_runtime": 17.4215, + "eval_samples_per_second": 5.74, + "eval_steps_per_second": 2.87, + "step": 3 + }, + { + "epoch": 0.32, + "grad_norm": 0.14142441749572754, + "learning_rate": 8e-05, + "loss": 1.2219, + "step": 4 + }, + { + "epoch": 0.4, + "grad_norm": 0.15717843174934387, + "learning_rate": 0.0001, + "loss": 1.3617, + "step": 5 + }, + { + "epoch": 0.48, + "grad_norm": 0.16309261322021484, + "learning_rate": 0.00012, + "loss": 1.3667, + "step": 6 + }, + { + "epoch": 0.48, + "eval_loss": 1.439871072769165, + "eval_runtime": 17.6339, + "eval_samples_per_second": 5.671, + "eval_steps_per_second": 2.835, + "step": 6 + }, + { + "epoch": 0.56, + "grad_norm": 0.15078029036521912, + "learning_rate": 0.00014, + "loss": 1.3008, + "step": 7 + }, + { + "epoch": 0.64, + "grad_norm": 0.13603582978248596, + "learning_rate": 0.00016, + "loss": 1.3333, + "step": 8 + }, + { + "epoch": 0.72, + "grad_norm": 0.1104956567287445, + "learning_rate": 0.00018, + "loss": 1.267, + "step": 9 + }, + { + "epoch": 0.72, + "eval_loss": 1.3381670713424683, + "eval_runtime": 17.2986, + "eval_samples_per_second": 5.781, + "eval_steps_per_second": 2.89, + "step": 9 + }, + { + "epoch": 0.8, + "grad_norm": 0.09913735836744308, + "learning_rate": 0.0002, + "loss": 1.2946, + "step": 10 + }, + { + "epoch": 0.88, + "grad_norm": 0.11903145164251328, + "learning_rate": 0.000199658449300667, + "loss": 1.2921, + "step": 11 + }, + { + "epoch": 0.96, + "grad_norm": 0.11169299483299255, + "learning_rate": 0.00019863613034027224, + "loss": 1.2276, + "step": 12 + }, + { + "epoch": 0.96, + "eval_loss": 1.2940881252288818, + "eval_runtime": 17.4061, + "eval_samples_per_second": 5.745, + "eval_steps_per_second": 2.873, + "step": 12 + } + ], + "logging_steps": 1, + "max_steps": 48, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 12, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2500249176244224.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-12/training_args.bin b/checkpoint-12/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5c5f7039014fc8dc36129366fc8474e9a2e6db30 --- /dev/null +++ b/checkpoint-12/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b6f97a9990d98ad2e97e692ffff5d13e2f2644982eea955df1c8a971b386400 +size 6008 diff --git a/checkpoint-24/README.md b/checkpoint-24/README.md new file mode 100644 index 0000000000000000000000000000000000000000..136e7e248b46366ae875395c8633d12be7b75f80 --- /dev/null +++ b/checkpoint-24/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/checkpoint-24/adapter_config.json b/checkpoint-24/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a8c35d776d16976c573cce6af2cc598939155ec6 --- /dev/null +++ b/checkpoint-24/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "o_proj", + "v_proj", + "down_proj", + "up_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-24/adapter_model.safetensors b/checkpoint-24/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9a12e1d5a83867e86c784d937d0ea95441354a78 --- /dev/null +++ b/checkpoint-24/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ae7f7faeb13406f9baee68ce6df23340ed54f657ec8cf5499068465cafa61c9 +size 100966336 diff --git a/checkpoint-24/optimizer.pt b/checkpoint-24/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e0b07ad8fe87735a40bcdb239ab9e2c437bf2ad --- /dev/null +++ b/checkpoint-24/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b50cd600842200d32a7ba4ccac017eeff6e48deefce732c4a1cb4e5a3e415c87 +size 50916644 diff --git a/checkpoint-24/rng_state.pth b/checkpoint-24/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a11461cc5c8798fd4bb3ef3a630c747127e21c3a --- /dev/null +++ b/checkpoint-24/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e05ca4a3ea886c932e939cdd1e683db542bf08214470c9ad647abdc302b57add +size 14244 diff --git a/checkpoint-24/scheduler.pt b/checkpoint-24/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..202deb39034de2c553e2527f4d7a38b34e0361a4 --- /dev/null +++ b/checkpoint-24/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fd2df94f244c74ef9128181bbcabe340233f441f19aa2de60f32d36a56a9cac +size 1064 diff --git a/checkpoint-24/special_tokens_map.json b/checkpoint-24/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1 --- /dev/null +++ b/checkpoint-24/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-24/tokenizer.model b/checkpoint-24/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-24/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-24/tokenizer_config.json b/checkpoint-24/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0773857a13ba5a27453a0b462624fe76e8e82a86 --- /dev/null +++ b/checkpoint-24/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": true +} diff --git a/checkpoint-24/trainer_state.json b/checkpoint-24/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..afd4c602340e31cdb5a0de8c3e7bc09842e02aa1 --- /dev/null +++ b/checkpoint-24/trainer_state.json @@ -0,0 +1,273 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.88, + "eval_steps": 3, + "global_step": 24, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "grad_norm": 0.17093713581562042, + "learning_rate": 2e-05, + "loss": 1.4615, + "step": 1 + }, + { + "epoch": 0.08, + "eval_loss": 1.4899382591247559, + "eval_runtime": 17.3107, + "eval_samples_per_second": 5.777, + "eval_steps_per_second": 2.888, + "step": 1 + }, + { + "epoch": 0.16, + "grad_norm": 0.19339510798454285, + "learning_rate": 4e-05, + "loss": 1.4241, + "step": 2 + }, + { + "epoch": 0.24, + "grad_norm": 0.1669788658618927, + "learning_rate": 6e-05, + "loss": 1.3846, + "step": 3 + }, + { + "epoch": 0.24, + "eval_loss": 1.4858685731887817, + "eval_runtime": 17.4215, + "eval_samples_per_second": 5.74, + "eval_steps_per_second": 2.87, + "step": 3 + }, + { + "epoch": 0.32, + "grad_norm": 0.14142441749572754, + "learning_rate": 8e-05, + "loss": 1.2219, + "step": 4 + }, + { + "epoch": 0.4, + "grad_norm": 0.15717843174934387, + "learning_rate": 0.0001, + "loss": 1.3617, + "step": 5 + }, + { + "epoch": 0.48, + "grad_norm": 0.16309261322021484, + "learning_rate": 0.00012, + "loss": 1.3667, + "step": 6 + }, + { + "epoch": 0.48, + "eval_loss": 1.439871072769165, + "eval_runtime": 17.6339, + "eval_samples_per_second": 5.671, + "eval_steps_per_second": 2.835, + "step": 6 + }, + { + "epoch": 0.56, + "grad_norm": 0.15078029036521912, + "learning_rate": 0.00014, + "loss": 1.3008, + "step": 7 + }, + { + "epoch": 0.64, + "grad_norm": 0.13603582978248596, + "learning_rate": 0.00016, + "loss": 1.3333, + "step": 8 + }, + { + "epoch": 0.72, + "grad_norm": 0.1104956567287445, + "learning_rate": 0.00018, + "loss": 1.267, + "step": 9 + }, + { + "epoch": 0.72, + "eval_loss": 1.3381670713424683, + "eval_runtime": 17.2986, + "eval_samples_per_second": 5.781, + "eval_steps_per_second": 2.89, + "step": 9 + }, + { + "epoch": 0.8, + "grad_norm": 0.09913735836744308, + "learning_rate": 0.0002, + "loss": 1.2946, + "step": 10 + }, + { + "epoch": 0.88, + "grad_norm": 0.11903145164251328, + "learning_rate": 0.000199658449300667, + "loss": 1.2921, + "step": 11 + }, + { + "epoch": 0.96, + "grad_norm": 0.11169299483299255, + "learning_rate": 0.00019863613034027224, + "loss": 1.2276, + "step": 12 + }, + { + "epoch": 0.96, + "eval_loss": 1.2940881252288818, + "eval_runtime": 17.4061, + "eval_samples_per_second": 5.745, + "eval_steps_per_second": 2.873, + "step": 12 + }, + { + "epoch": 1.04, + "grad_norm": 0.1135605201125145, + "learning_rate": 0.00019694002659393305, + "loss": 1.2666, + "step": 13 + }, + { + "epoch": 1.08, + "grad_norm": 0.1120605319738388, + "learning_rate": 0.00019458172417006347, + "loss": 1.2589, + "step": 14 + }, + { + "epoch": 1.16, + "grad_norm": 0.10806083679199219, + "learning_rate": 0.00019157733266550575, + "loss": 1.2515, + "step": 15 + }, + { + "epoch": 1.16, + "eval_loss": 1.2792645692825317, + "eval_runtime": 17.2617, + "eval_samples_per_second": 5.793, + "eval_steps_per_second": 2.897, + "step": 15 + }, + { + "epoch": 1.24, + "grad_norm": 0.09928147494792938, + "learning_rate": 0.0001879473751206489, + "loss": 1.1636, + "step": 16 + }, + { + "epoch": 1.32, + "grad_norm": 0.09012632817029953, + "learning_rate": 0.00018371664782625287, + "loss": 1.1601, + "step": 17 + }, + { + "epoch": 1.4, + "grad_norm": 0.09199394285678864, + "learning_rate": 0.00017891405093963938, + "loss": 1.2275, + "step": 18 + }, + { + "epoch": 1.4, + "eval_loss": 1.2555986642837524, + "eval_runtime": 17.3997, + "eval_samples_per_second": 5.747, + "eval_steps_per_second": 2.874, + "step": 18 + }, + { + "epoch": 1.48, + "grad_norm": 0.096685491502285, + "learning_rate": 0.00017357239106731317, + "loss": 1.228, + "step": 19 + }, + { + "epoch": 1.56, + "grad_norm": 0.08053378760814667, + "learning_rate": 0.00016772815716257412, + "loss": 1.192, + "step": 20 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.07304864376783371, + "learning_rate": 0.0001614212712689668, + "loss": 1.1351, + "step": 21 + }, + { + "epoch": 1.6400000000000001, + "eval_loss": 1.2347127199172974, + "eval_runtime": 17.3396, + "eval_samples_per_second": 5.767, + "eval_steps_per_second": 2.884, + "step": 21 + }, + { + "epoch": 1.72, + "grad_norm": 0.0821811631321907, + "learning_rate": 0.00015469481581224272, + "loss": 1.2031, + "step": 22 + }, + { + "epoch": 1.8, + "grad_norm": 0.07574562728404999, + "learning_rate": 0.00014759473930370736, + "loss": 1.2476, + "step": 23 + }, + { + "epoch": 1.88, + "grad_norm": 0.07235240191221237, + "learning_rate": 0.00014016954246529696, + "loss": 1.2701, + "step": 24 + }, + { + "epoch": 1.88, + "eval_loss": 1.2253003120422363, + "eval_runtime": 17.292, + "eval_samples_per_second": 5.783, + "eval_steps_per_second": 2.892, + "step": 24 + } + ], + "logging_steps": 1, + "max_steps": 48, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 12, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5000498352488448.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-24/training_args.bin b/checkpoint-24/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5c5f7039014fc8dc36129366fc8474e9a2e6db30 --- /dev/null +++ b/checkpoint-24/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b6f97a9990d98ad2e97e692ffff5d13e2f2644982eea955df1c8a971b386400 +size 6008 diff --git a/checkpoint-36/README.md b/checkpoint-36/README.md new file mode 100644 index 0000000000000000000000000000000000000000..136e7e248b46366ae875395c8633d12be7b75f80 --- /dev/null +++ b/checkpoint-36/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/checkpoint-36/adapter_config.json b/checkpoint-36/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a8c35d776d16976c573cce6af2cc598939155ec6 --- /dev/null +++ b/checkpoint-36/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "o_proj", + "v_proj", + "down_proj", + "up_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-36/adapter_model.safetensors b/checkpoint-36/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..10daa798f4bee3a40ce8790916d34df66f29e6ff --- /dev/null +++ b/checkpoint-36/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40bc41fd8180f2ca0127e4626c91276aed9de4159efd34dc831e0ddb9a905b7e +size 100966336 diff --git a/checkpoint-36/optimizer.pt b/checkpoint-36/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..221b8b10fdee5948de65dfa639ad6b079c897348 --- /dev/null +++ b/checkpoint-36/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bcab64fb2240d5c9e39264c8e13b177843cb543b2340833b1b74c3f3441a6cc +size 50916644 diff --git a/checkpoint-36/rng_state.pth b/checkpoint-36/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c28b841dc1e17f22e94de1795ce6ae9e944e42f --- /dev/null +++ b/checkpoint-36/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:097d1ea8f5f9af9824d41438adbb887f58098ad9ba06156cee3883694e2e2736 +size 14244 diff --git a/checkpoint-36/scheduler.pt b/checkpoint-36/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c709adb50c980617bac33c11cf5ae0931260631d --- /dev/null +++ b/checkpoint-36/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5918ddd95097cd0d9acd73ea2bf14c23b23f8d6e0bb73e5c46156ea038bd743 +size 1064 diff --git a/checkpoint-36/special_tokens_map.json b/checkpoint-36/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1 --- /dev/null +++ b/checkpoint-36/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-36/tokenizer.model b/checkpoint-36/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-36/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-36/tokenizer_config.json b/checkpoint-36/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0773857a13ba5a27453a0b462624fe76e8e82a86 --- /dev/null +++ b/checkpoint-36/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": true +} diff --git a/checkpoint-36/trainer_state.json b/checkpoint-36/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7ab3fed8ad9e87724e8e719df702defef9fb3954 --- /dev/null +++ b/checkpoint-36/trainer_state.json @@ -0,0 +1,389 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.8, + "eval_steps": 3, + "global_step": 36, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "grad_norm": 0.17093713581562042, + "learning_rate": 2e-05, + "loss": 1.4615, + "step": 1 + }, + { + "epoch": 0.08, + "eval_loss": 1.4899382591247559, + "eval_runtime": 17.3107, + "eval_samples_per_second": 5.777, + "eval_steps_per_second": 2.888, + "step": 1 + }, + { + "epoch": 0.16, + "grad_norm": 0.19339510798454285, + "learning_rate": 4e-05, + "loss": 1.4241, + "step": 2 + }, + { + "epoch": 0.24, + "grad_norm": 0.1669788658618927, + "learning_rate": 6e-05, + "loss": 1.3846, + "step": 3 + }, + { + "epoch": 0.24, + "eval_loss": 1.4858685731887817, + "eval_runtime": 17.4215, + "eval_samples_per_second": 5.74, + "eval_steps_per_second": 2.87, + "step": 3 + }, + { + "epoch": 0.32, + "grad_norm": 0.14142441749572754, + "learning_rate": 8e-05, + "loss": 1.2219, + "step": 4 + }, + { + "epoch": 0.4, + "grad_norm": 0.15717843174934387, + "learning_rate": 0.0001, + "loss": 1.3617, + "step": 5 + }, + { + "epoch": 0.48, + "grad_norm": 0.16309261322021484, + "learning_rate": 0.00012, + "loss": 1.3667, + "step": 6 + }, + { + "epoch": 0.48, + "eval_loss": 1.439871072769165, + "eval_runtime": 17.6339, + "eval_samples_per_second": 5.671, + "eval_steps_per_second": 2.835, + "step": 6 + }, + { + "epoch": 0.56, + "grad_norm": 0.15078029036521912, + "learning_rate": 0.00014, + "loss": 1.3008, + "step": 7 + }, + { + "epoch": 0.64, + "grad_norm": 0.13603582978248596, + "learning_rate": 0.00016, + "loss": 1.3333, + "step": 8 + }, + { + "epoch": 0.72, + "grad_norm": 0.1104956567287445, + "learning_rate": 0.00018, + "loss": 1.267, + "step": 9 + }, + { + "epoch": 0.72, + "eval_loss": 1.3381670713424683, + "eval_runtime": 17.2986, + "eval_samples_per_second": 5.781, + "eval_steps_per_second": 2.89, + "step": 9 + }, + { + "epoch": 0.8, + "grad_norm": 0.09913735836744308, + "learning_rate": 0.0002, + "loss": 1.2946, + "step": 10 + }, + { + "epoch": 0.88, + "grad_norm": 0.11903145164251328, + "learning_rate": 0.000199658449300667, + "loss": 1.2921, + "step": 11 + }, + { + "epoch": 0.96, + "grad_norm": 0.11169299483299255, + "learning_rate": 0.00019863613034027224, + "loss": 1.2276, + "step": 12 + }, + { + "epoch": 0.96, + "eval_loss": 1.2940881252288818, + "eval_runtime": 17.4061, + "eval_samples_per_second": 5.745, + "eval_steps_per_second": 2.873, + "step": 12 + }, + { + "epoch": 1.04, + "grad_norm": 0.1135605201125145, + "learning_rate": 0.00019694002659393305, + "loss": 1.2666, + "step": 13 + }, + { + "epoch": 1.08, + "grad_norm": 0.1120605319738388, + "learning_rate": 0.00019458172417006347, + "loss": 1.2589, + "step": 14 + }, + { + "epoch": 1.16, + "grad_norm": 0.10806083679199219, + "learning_rate": 0.00019157733266550575, + "loss": 1.2515, + "step": 15 + }, + { + "epoch": 1.16, + "eval_loss": 1.2792645692825317, + "eval_runtime": 17.2617, + "eval_samples_per_second": 5.793, + "eval_steps_per_second": 2.897, + "step": 15 + }, + { + "epoch": 1.24, + "grad_norm": 0.09928147494792938, + "learning_rate": 0.0001879473751206489, + "loss": 1.1636, + "step": 16 + }, + { + "epoch": 1.32, + "grad_norm": 0.09012632817029953, + "learning_rate": 0.00018371664782625287, + "loss": 1.1601, + "step": 17 + }, + { + "epoch": 1.4, + "grad_norm": 0.09199394285678864, + "learning_rate": 0.00017891405093963938, + "loss": 1.2275, + "step": 18 + }, + { + "epoch": 1.4, + "eval_loss": 1.2555986642837524, + "eval_runtime": 17.3997, + "eval_samples_per_second": 5.747, + "eval_steps_per_second": 2.874, + "step": 18 + }, + { + "epoch": 1.48, + "grad_norm": 0.096685491502285, + "learning_rate": 0.00017357239106731317, + "loss": 1.228, + "step": 19 + }, + { + "epoch": 1.56, + "grad_norm": 0.08053378760814667, + "learning_rate": 0.00016772815716257412, + "loss": 1.192, + "step": 20 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.07304864376783371, + "learning_rate": 0.0001614212712689668, + "loss": 1.1351, + "step": 21 + }, + { + "epoch": 1.6400000000000001, + "eval_loss": 1.2347127199172974, + "eval_runtime": 17.3396, + "eval_samples_per_second": 5.767, + "eval_steps_per_second": 2.884, + "step": 21 + }, + { + "epoch": 1.72, + "grad_norm": 0.0821811631321907, + "learning_rate": 0.00015469481581224272, + "loss": 1.2031, + "step": 22 + }, + { + "epoch": 1.8, + "grad_norm": 0.07574562728404999, + "learning_rate": 0.00014759473930370736, + "loss": 1.2476, + "step": 23 + }, + { + "epoch": 1.88, + "grad_norm": 0.07235240191221237, + "learning_rate": 0.00014016954246529696, + "loss": 1.2701, + "step": 24 + }, + { + "epoch": 1.88, + "eval_loss": 1.2253003120422363, + "eval_runtime": 17.292, + "eval_samples_per_second": 5.783, + "eval_steps_per_second": 2.892, + "step": 24 + }, + { + "epoch": 1.96, + "grad_norm": 0.07071871310472488, + "learning_rate": 0.00013246994692046836, + "loss": 1.2051, + "step": 25 + }, + { + "epoch": 2.04, + "grad_norm": 0.07875131815671921, + "learning_rate": 0.00012454854871407994, + "loss": 1.192, + "step": 26 + }, + { + "epoch": 2.08, + "grad_norm": 0.06468148529529572, + "learning_rate": 0.00011645945902807341, + "loss": 1.1487, + "step": 27 + }, + { + "epoch": 2.08, + "eval_loss": 1.2213425636291504, + "eval_runtime": 17.7102, + "eval_samples_per_second": 5.646, + "eval_steps_per_second": 2.823, + "step": 27 + }, + { + "epoch": 2.16, + "grad_norm": 0.0741707906126976, + "learning_rate": 0.00010825793454723325, + "loss": 1.1673, + "step": 28 + }, + { + "epoch": 2.24, + "grad_norm": 0.06802140176296234, + "learning_rate": 0.0001, + "loss": 1.2054, + "step": 29 + }, + { + "epoch": 2.32, + "grad_norm": 0.06834083795547485, + "learning_rate": 9.174206545276677e-05, + "loss": 1.1518, + "step": 30 + }, + { + "epoch": 2.32, + "eval_loss": 1.220943808555603, + "eval_runtime": 17.4872, + "eval_samples_per_second": 5.718, + "eval_steps_per_second": 2.859, + "step": 30 + }, + { + "epoch": 2.4, + "grad_norm": 0.06714992970228195, + "learning_rate": 8.35405409719266e-05, + "loss": 1.183, + "step": 31 + }, + { + "epoch": 2.48, + "grad_norm": 0.06744072586297989, + "learning_rate": 7.54514512859201e-05, + "loss": 1.2098, + "step": 32 + }, + { + "epoch": 2.56, + "grad_norm": 0.06815183162689209, + "learning_rate": 6.753005307953167e-05, + "loss": 1.1942, + "step": 33 + }, + { + "epoch": 2.56, + "eval_loss": 1.217085599899292, + "eval_runtime": 17.2384, + "eval_samples_per_second": 5.801, + "eval_steps_per_second": 2.901, + "step": 33 + }, + { + "epoch": 2.64, + "grad_norm": 0.07002735882997513, + "learning_rate": 5.983045753470308e-05, + "loss": 1.1896, + "step": 34 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 0.07221011817455292, + "learning_rate": 5.240526069629265e-05, + "loss": 1.1795, + "step": 35 + }, + { + "epoch": 2.8, + "grad_norm": 0.07009800523519516, + "learning_rate": 4.530518418775733e-05, + "loss": 1.1122, + "step": 36 + }, + { + "epoch": 2.8, + "eval_loss": 1.2147068977355957, + "eval_runtime": 17.3366, + "eval_samples_per_second": 5.768, + "eval_steps_per_second": 2.884, + "step": 36 + } + ], + "logging_steps": 1, + "max_steps": 48, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 12, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7474703266480128.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-36/training_args.bin b/checkpoint-36/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5c5f7039014fc8dc36129366fc8474e9a2e6db30 --- /dev/null +++ b/checkpoint-36/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b6f97a9990d98ad2e97e692ffff5d13e2f2644982eea955df1c8a971b386400 +size 6008 diff --git a/checkpoint-48/README.md b/checkpoint-48/README.md new file mode 100644 index 0000000000000000000000000000000000000000..136e7e248b46366ae875395c8633d12be7b75f80 --- /dev/null +++ b/checkpoint-48/README.md @@ -0,0 +1,202 @@ +--- +base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/checkpoint-48/adapter_config.json b/checkpoint-48/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a8c35d776d16976c573cce6af2cc598939155ec6 --- /dev/null +++ b/checkpoint-48/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", + "bias": "none", + "fan_in_fan_out": null, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "o_proj", + "v_proj", + "down_proj", + "up_proj", + "gate_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-48/adapter_model.safetensors b/checkpoint-48/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cd274d749d309f85d81bfe2aa9eb80bbecdde96f --- /dev/null +++ b/checkpoint-48/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23d730c3d6976448e84ec766f96c2ab0cb6e0a2244da82904d5f04555910ebcf +size 100966336 diff --git a/checkpoint-48/optimizer.pt b/checkpoint-48/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9b20c3d0c7695c6f6989c43026a939f2246846a --- /dev/null +++ b/checkpoint-48/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:767d44130f9670629778470c8a976df6fc095672da1beb3259e9efa426ad6df8 +size 50916644 diff --git a/checkpoint-48/rng_state.pth b/checkpoint-48/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..5893e825b8538105e4eae4af252d89bb1e9aaf86 --- /dev/null +++ b/checkpoint-48/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17f72c0a030693438895e7bd5e11e6e5913e59736aaef065212621cb21e6209c +size 14244 diff --git a/checkpoint-48/scheduler.pt b/checkpoint-48/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c9349de67f87d64b3f06a643104e8f5404a2137 --- /dev/null +++ b/checkpoint-48/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b832e0373d616c3d50894a908dda7ef6c28f6cb2f8a92b6d36348dbf67fd1715 +size 1064 diff --git a/checkpoint-48/special_tokens_map.json b/checkpoint-48/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1 --- /dev/null +++ b/checkpoint-48/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-48/tokenizer.model b/checkpoint-48/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-48/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-48/tokenizer_config.json b/checkpoint-48/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0773857a13ba5a27453a0b462624fe76e8e82a86 --- /dev/null +++ b/checkpoint-48/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": true +} diff --git a/checkpoint-48/trainer_state.json b/checkpoint-48/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..43fd578dc389c7929d3b7c2080b8b5d726bae664 --- /dev/null +++ b/checkpoint-48/trainer_state.json @@ -0,0 +1,505 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.7199999999999998, + "eval_steps": 3, + "global_step": 48, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "grad_norm": 0.17093713581562042, + "learning_rate": 2e-05, + "loss": 1.4615, + "step": 1 + }, + { + "epoch": 0.08, + "eval_loss": 1.4899382591247559, + "eval_runtime": 17.3107, + "eval_samples_per_second": 5.777, + "eval_steps_per_second": 2.888, + "step": 1 + }, + { + "epoch": 0.16, + "grad_norm": 0.19339510798454285, + "learning_rate": 4e-05, + "loss": 1.4241, + "step": 2 + }, + { + "epoch": 0.24, + "grad_norm": 0.1669788658618927, + "learning_rate": 6e-05, + "loss": 1.3846, + "step": 3 + }, + { + "epoch": 0.24, + "eval_loss": 1.4858685731887817, + "eval_runtime": 17.4215, + "eval_samples_per_second": 5.74, + "eval_steps_per_second": 2.87, + "step": 3 + }, + { + "epoch": 0.32, + "grad_norm": 0.14142441749572754, + "learning_rate": 8e-05, + "loss": 1.2219, + "step": 4 + }, + { + "epoch": 0.4, + "grad_norm": 0.15717843174934387, + "learning_rate": 0.0001, + "loss": 1.3617, + "step": 5 + }, + { + "epoch": 0.48, + "grad_norm": 0.16309261322021484, + "learning_rate": 0.00012, + "loss": 1.3667, + "step": 6 + }, + { + "epoch": 0.48, + "eval_loss": 1.439871072769165, + "eval_runtime": 17.6339, + "eval_samples_per_second": 5.671, + "eval_steps_per_second": 2.835, + "step": 6 + }, + { + "epoch": 0.56, + "grad_norm": 0.15078029036521912, + "learning_rate": 0.00014, + "loss": 1.3008, + "step": 7 + }, + { + "epoch": 0.64, + "grad_norm": 0.13603582978248596, + "learning_rate": 0.00016, + "loss": 1.3333, + "step": 8 + }, + { + "epoch": 0.72, + "grad_norm": 0.1104956567287445, + "learning_rate": 0.00018, + "loss": 1.267, + "step": 9 + }, + { + "epoch": 0.72, + "eval_loss": 1.3381670713424683, + "eval_runtime": 17.2986, + "eval_samples_per_second": 5.781, + "eval_steps_per_second": 2.89, + "step": 9 + }, + { + "epoch": 0.8, + "grad_norm": 0.09913735836744308, + "learning_rate": 0.0002, + "loss": 1.2946, + "step": 10 + }, + { + "epoch": 0.88, + "grad_norm": 0.11903145164251328, + "learning_rate": 0.000199658449300667, + "loss": 1.2921, + "step": 11 + }, + { + "epoch": 0.96, + "grad_norm": 0.11169299483299255, + "learning_rate": 0.00019863613034027224, + "loss": 1.2276, + "step": 12 + }, + { + "epoch": 0.96, + "eval_loss": 1.2940881252288818, + "eval_runtime": 17.4061, + "eval_samples_per_second": 5.745, + "eval_steps_per_second": 2.873, + "step": 12 + }, + { + "epoch": 1.04, + "grad_norm": 0.1135605201125145, + "learning_rate": 0.00019694002659393305, + "loss": 1.2666, + "step": 13 + }, + { + "epoch": 1.08, + "grad_norm": 0.1120605319738388, + "learning_rate": 0.00019458172417006347, + "loss": 1.2589, + "step": 14 + }, + { + "epoch": 1.16, + "grad_norm": 0.10806083679199219, + "learning_rate": 0.00019157733266550575, + "loss": 1.2515, + "step": 15 + }, + { + "epoch": 1.16, + "eval_loss": 1.2792645692825317, + "eval_runtime": 17.2617, + "eval_samples_per_second": 5.793, + "eval_steps_per_second": 2.897, + "step": 15 + }, + { + "epoch": 1.24, + "grad_norm": 0.09928147494792938, + "learning_rate": 0.0001879473751206489, + "loss": 1.1636, + "step": 16 + }, + { + "epoch": 1.32, + "grad_norm": 0.09012632817029953, + "learning_rate": 0.00018371664782625287, + "loss": 1.1601, + "step": 17 + }, + { + "epoch": 1.4, + "grad_norm": 0.09199394285678864, + "learning_rate": 0.00017891405093963938, + "loss": 1.2275, + "step": 18 + }, + { + "epoch": 1.4, + "eval_loss": 1.2555986642837524, + "eval_runtime": 17.3997, + "eval_samples_per_second": 5.747, + "eval_steps_per_second": 2.874, + "step": 18 + }, + { + "epoch": 1.48, + "grad_norm": 0.096685491502285, + "learning_rate": 0.00017357239106731317, + "loss": 1.228, + "step": 19 + }, + { + "epoch": 1.56, + "grad_norm": 0.08053378760814667, + "learning_rate": 0.00016772815716257412, + "loss": 1.192, + "step": 20 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.07304864376783371, + "learning_rate": 0.0001614212712689668, + "loss": 1.1351, + "step": 21 + }, + { + "epoch": 1.6400000000000001, + "eval_loss": 1.2347127199172974, + "eval_runtime": 17.3396, + "eval_samples_per_second": 5.767, + "eval_steps_per_second": 2.884, + "step": 21 + }, + { + "epoch": 1.72, + "grad_norm": 0.0821811631321907, + "learning_rate": 0.00015469481581224272, + "loss": 1.2031, + "step": 22 + }, + { + "epoch": 1.8, + "grad_norm": 0.07574562728404999, + "learning_rate": 0.00014759473930370736, + "loss": 1.2476, + "step": 23 + }, + { + "epoch": 1.88, + "grad_norm": 0.07235240191221237, + "learning_rate": 0.00014016954246529696, + "loss": 1.2701, + "step": 24 + }, + { + "epoch": 1.88, + "eval_loss": 1.2253003120422363, + "eval_runtime": 17.292, + "eval_samples_per_second": 5.783, + "eval_steps_per_second": 2.892, + "step": 24 + }, + { + "epoch": 1.96, + "grad_norm": 0.07071871310472488, + "learning_rate": 0.00013246994692046836, + "loss": 1.2051, + "step": 25 + }, + { + "epoch": 2.04, + "grad_norm": 0.07875131815671921, + "learning_rate": 0.00012454854871407994, + "loss": 1.192, + "step": 26 + }, + { + "epoch": 2.08, + "grad_norm": 0.06468148529529572, + "learning_rate": 0.00011645945902807341, + "loss": 1.1487, + "step": 27 + }, + { + "epoch": 2.08, + "eval_loss": 1.2213425636291504, + "eval_runtime": 17.7102, + "eval_samples_per_second": 5.646, + "eval_steps_per_second": 2.823, + "step": 27 + }, + { + "epoch": 2.16, + "grad_norm": 0.0741707906126976, + "learning_rate": 0.00010825793454723325, + "loss": 1.1673, + "step": 28 + }, + { + "epoch": 2.24, + "grad_norm": 0.06802140176296234, + "learning_rate": 0.0001, + "loss": 1.2054, + "step": 29 + }, + { + "epoch": 2.32, + "grad_norm": 0.06834083795547485, + "learning_rate": 9.174206545276677e-05, + "loss": 1.1518, + "step": 30 + }, + { + "epoch": 2.32, + "eval_loss": 1.220943808555603, + "eval_runtime": 17.4872, + "eval_samples_per_second": 5.718, + "eval_steps_per_second": 2.859, + "step": 30 + }, + { + "epoch": 2.4, + "grad_norm": 0.06714992970228195, + "learning_rate": 8.35405409719266e-05, + "loss": 1.183, + "step": 31 + }, + { + "epoch": 2.48, + "grad_norm": 0.06744072586297989, + "learning_rate": 7.54514512859201e-05, + "loss": 1.2098, + "step": 32 + }, + { + "epoch": 2.56, + "grad_norm": 0.06815183162689209, + "learning_rate": 6.753005307953167e-05, + "loss": 1.1942, + "step": 33 + }, + { + "epoch": 2.56, + "eval_loss": 1.217085599899292, + "eval_runtime": 17.2384, + "eval_samples_per_second": 5.801, + "eval_steps_per_second": 2.901, + "step": 33 + }, + { + "epoch": 2.64, + "grad_norm": 0.07002735882997513, + "learning_rate": 5.983045753470308e-05, + "loss": 1.1896, + "step": 34 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 0.07221011817455292, + "learning_rate": 5.240526069629265e-05, + "loss": 1.1795, + "step": 35 + }, + { + "epoch": 2.8, + "grad_norm": 0.07009800523519516, + "learning_rate": 4.530518418775733e-05, + "loss": 1.1122, + "step": 36 + }, + { + "epoch": 2.8, + "eval_loss": 1.2147068977355957, + "eval_runtime": 17.3366, + "eval_samples_per_second": 5.768, + "eval_steps_per_second": 2.884, + "step": 36 + }, + { + "epoch": 2.88, + "grad_norm": 0.06943687796592712, + "learning_rate": 3.857872873103322e-05, + "loss": 1.1838, + "step": 37 + }, + { + "epoch": 2.96, + "grad_norm": 0.06811714172363281, + "learning_rate": 3.227184283742591e-05, + "loss": 1.166, + "step": 38 + }, + { + "epoch": 3.04, + "grad_norm": 0.06823349744081497, + "learning_rate": 2.6427608932686843e-05, + "loss": 1.1513, + "step": 39 + }, + { + "epoch": 3.04, + "eval_loss": 1.213902473449707, + "eval_runtime": 17.4349, + "eval_samples_per_second": 5.736, + "eval_steps_per_second": 2.868, + "step": 39 + }, + { + "epoch": 3.08, + "grad_norm": 0.06420764327049255, + "learning_rate": 2.1085949060360654e-05, + "loss": 1.155, + "step": 40 + }, + { + "epoch": 3.16, + "grad_norm": 0.06513047218322754, + "learning_rate": 1.6283352173747145e-05, + "loss": 1.1463, + "step": 41 + }, + { + "epoch": 3.24, + "grad_norm": 0.06813672184944153, + "learning_rate": 1.2052624879351104e-05, + "loss": 1.1887, + "step": 42 + }, + { + "epoch": 3.24, + "eval_loss": 1.2127918004989624, + "eval_runtime": 17.5485, + "eval_samples_per_second": 5.699, + "eval_steps_per_second": 2.849, + "step": 42 + }, + { + "epoch": 3.32, + "grad_norm": 0.06876266747713089, + "learning_rate": 8.422667334494249e-06, + "loss": 1.1745, + "step": 43 + }, + { + "epoch": 3.4, + "grad_norm": 0.06679647415876389, + "learning_rate": 5.418275829936537e-06, + "loss": 1.0975, + "step": 44 + }, + { + "epoch": 3.48, + "grad_norm": 0.06702674925327301, + "learning_rate": 3.059973406066963e-06, + "loss": 1.1011, + "step": 45 + }, + { + "epoch": 3.48, + "eval_loss": 1.2114481925964355, + "eval_runtime": 17.6099, + "eval_samples_per_second": 5.679, + "eval_steps_per_second": 2.839, + "step": 45 + }, + { + "epoch": 3.56, + "grad_norm": 0.06805083155632019, + "learning_rate": 1.3638696597277679e-06, + "loss": 1.1265, + "step": 46 + }, + { + "epoch": 3.64, + "grad_norm": 0.06833863258361816, + "learning_rate": 3.415506993330153e-07, + "loss": 1.1453, + "step": 47 + }, + { + "epoch": 3.7199999999999998, + "grad_norm": 0.06925249099731445, + "learning_rate": 0.0, + "loss": 1.1887, + "step": 48 + }, + { + "epoch": 3.7199999999999998, + "eval_loss": 1.2115026712417603, + "eval_runtime": 17.3158, + "eval_samples_per_second": 5.775, + "eval_steps_per_second": 2.888, + "step": 48 + } + ], + "logging_steps": 1, + "max_steps": 48, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 12, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 9974952442724352.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-48/training_args.bin b/checkpoint-48/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5c5f7039014fc8dc36129366fc8474e9a2e6db30 --- /dev/null +++ b/checkpoint-48/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b6f97a9990d98ad2e97e692ffff5d13e2f2644982eea955df1c8a971b386400 +size 6008 diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..58562b8f0d91c7373666d814678733eb930d4cf9 --- /dev/null +++ b/config.json @@ -0,0 +1,44 @@ +{ + "_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 5632, + "max_position_embeddings": 4096, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 22, + "num_key_value_heads": 4, + "pretraining_tp": 1, + "quantization_config": { + "_load_in_4bit": false, + "_load_in_8bit": true, + "bnb_4bit_compute_dtype": "float32", + "bnb_4bit_quant_storage": "uint8", + "bnb_4bit_quant_type": "fp4", + "bnb_4bit_use_double_quant": false, + "llm_int8_enable_fp32_cpu_offload": false, + "llm_int8_has_fp16_weight": false, + "llm_int8_skip_modules": null, + "llm_int8_threshold": 6.0, + "load_in_4bit": false, + "load_in_8bit": true, + "quant_method": "bitsandbytes" + }, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.42.3", + "use_cache": false, + "vocab_size": 32000 +} diff --git a/merged/config.json b/merged/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e7e60b45c41bbc43b73c4387cd854a6ae4217cd5 --- /dev/null +++ b/merged/config.json @@ -0,0 +1,29 @@ +{ + "_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 5632, + "max_position_embeddings": 4096, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 22, + "num_key_value_heads": 4, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.42.3", + "use_cache": false, + "vocab_size": 32000 +} diff --git a/merged/generation_config.json b/merged/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d6e934ea35e4bf3bbd09b7244f473ade96674184 --- /dev/null +++ b/merged/generation_config.json @@ -0,0 +1,8 @@ +{ + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": 2, + "max_length": 2048, + "pad_token_id": 0, + "transformers_version": "4.42.3" +} diff --git a/merged/pytorch_model.bin b/merged/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..169ffbdd0503ad959ea29d164e2471ee099add7f --- /dev/null +++ b/merged/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0b4790b1581afc03951bb0ad001bef4920aa3456c04314604af9d0281fb35d3 +size 2200164718 diff --git a/merged/special_tokens_map.json b/merged/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1 --- /dev/null +++ b/merged/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/merged/tokenizer.model b/merged/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/merged/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/merged/tokenizer_config.json b/merged/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0773857a13ba5a27453a0b462624fe76e8e82a86 --- /dev/null +++ b/merged/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": true +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0773857a13ba5a27453a0b462624fe76e8e82a86 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": true +}